genesis-flow 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genesis_flow-1.0.0.dist-info/METADATA +822 -0
- genesis_flow-1.0.0.dist-info/RECORD +645 -0
- genesis_flow-1.0.0.dist-info/WHEEL +5 -0
- genesis_flow-1.0.0.dist-info/entry_points.txt +19 -0
- genesis_flow-1.0.0.dist-info/licenses/LICENSE.txt +202 -0
- genesis_flow-1.0.0.dist-info/top_level.txt +1 -0
- mlflow/__init__.py +367 -0
- mlflow/__main__.py +3 -0
- mlflow/ag2/__init__.py +56 -0
- mlflow/ag2/ag2_logger.py +294 -0
- mlflow/anthropic/__init__.py +40 -0
- mlflow/anthropic/autolog.py +129 -0
- mlflow/anthropic/chat.py +144 -0
- mlflow/artifacts/__init__.py +268 -0
- mlflow/autogen/__init__.py +144 -0
- mlflow/autogen/chat.py +142 -0
- mlflow/azure/__init__.py +26 -0
- mlflow/azure/auth_handler.py +257 -0
- mlflow/azure/client.py +319 -0
- mlflow/azure/config.py +120 -0
- mlflow/azure/connection_factory.py +340 -0
- mlflow/azure/exceptions.py +27 -0
- mlflow/azure/stores.py +327 -0
- mlflow/azure/utils.py +183 -0
- mlflow/bedrock/__init__.py +45 -0
- mlflow/bedrock/_autolog.py +202 -0
- mlflow/bedrock/chat.py +122 -0
- mlflow/bedrock/stream.py +160 -0
- mlflow/bedrock/utils.py +43 -0
- mlflow/cli.py +707 -0
- mlflow/client.py +12 -0
- mlflow/config/__init__.py +56 -0
- mlflow/crewai/__init__.py +79 -0
- mlflow/crewai/autolog.py +253 -0
- mlflow/crewai/chat.py +29 -0
- mlflow/data/__init__.py +75 -0
- mlflow/data/artifact_dataset_sources.py +170 -0
- mlflow/data/code_dataset_source.py +40 -0
- mlflow/data/dataset.py +123 -0
- mlflow/data/dataset_registry.py +168 -0
- mlflow/data/dataset_source.py +110 -0
- mlflow/data/dataset_source_registry.py +219 -0
- mlflow/data/delta_dataset_source.py +167 -0
- mlflow/data/digest_utils.py +108 -0
- mlflow/data/evaluation_dataset.py +562 -0
- mlflow/data/filesystem_dataset_source.py +81 -0
- mlflow/data/http_dataset_source.py +145 -0
- mlflow/data/huggingface_dataset.py +258 -0
- mlflow/data/huggingface_dataset_source.py +118 -0
- mlflow/data/meta_dataset.py +104 -0
- mlflow/data/numpy_dataset.py +223 -0
- mlflow/data/pandas_dataset.py +231 -0
- mlflow/data/polars_dataset.py +352 -0
- mlflow/data/pyfunc_dataset_mixin.py +31 -0
- mlflow/data/schema.py +76 -0
- mlflow/data/sources.py +1 -0
- mlflow/data/spark_dataset.py +406 -0
- mlflow/data/spark_dataset_source.py +74 -0
- mlflow/data/spark_delta_utils.py +118 -0
- mlflow/data/tensorflow_dataset.py +350 -0
- mlflow/data/uc_volume_dataset_source.py +81 -0
- mlflow/db.py +27 -0
- mlflow/dspy/__init__.py +17 -0
- mlflow/dspy/autolog.py +197 -0
- mlflow/dspy/callback.py +398 -0
- mlflow/dspy/constant.py +1 -0
- mlflow/dspy/load.py +93 -0
- mlflow/dspy/save.py +393 -0
- mlflow/dspy/util.py +109 -0
- mlflow/dspy/wrapper.py +226 -0
- mlflow/entities/__init__.py +104 -0
- mlflow/entities/_mlflow_object.py +52 -0
- mlflow/entities/assessment.py +545 -0
- mlflow/entities/assessment_error.py +80 -0
- mlflow/entities/assessment_source.py +141 -0
- mlflow/entities/dataset.py +92 -0
- mlflow/entities/dataset_input.py +51 -0
- mlflow/entities/dataset_summary.py +62 -0
- mlflow/entities/document.py +48 -0
- mlflow/entities/experiment.py +109 -0
- mlflow/entities/experiment_tag.py +35 -0
- mlflow/entities/file_info.py +45 -0
- mlflow/entities/input_tag.py +35 -0
- mlflow/entities/lifecycle_stage.py +35 -0
- mlflow/entities/logged_model.py +228 -0
- mlflow/entities/logged_model_input.py +26 -0
- mlflow/entities/logged_model_output.py +32 -0
- mlflow/entities/logged_model_parameter.py +46 -0
- mlflow/entities/logged_model_status.py +74 -0
- mlflow/entities/logged_model_tag.py +33 -0
- mlflow/entities/metric.py +200 -0
- mlflow/entities/model_registry/__init__.py +29 -0
- mlflow/entities/model_registry/_model_registry_entity.py +13 -0
- mlflow/entities/model_registry/model_version.py +243 -0
- mlflow/entities/model_registry/model_version_deployment_job_run_state.py +44 -0
- mlflow/entities/model_registry/model_version_deployment_job_state.py +70 -0
- mlflow/entities/model_registry/model_version_search.py +25 -0
- mlflow/entities/model_registry/model_version_stages.py +25 -0
- mlflow/entities/model_registry/model_version_status.py +35 -0
- mlflow/entities/model_registry/model_version_tag.py +35 -0
- mlflow/entities/model_registry/prompt.py +73 -0
- mlflow/entities/model_registry/prompt_version.py +244 -0
- mlflow/entities/model_registry/registered_model.py +175 -0
- mlflow/entities/model_registry/registered_model_alias.py +35 -0
- mlflow/entities/model_registry/registered_model_deployment_job_state.py +39 -0
- mlflow/entities/model_registry/registered_model_search.py +25 -0
- mlflow/entities/model_registry/registered_model_tag.py +35 -0
- mlflow/entities/multipart_upload.py +74 -0
- mlflow/entities/param.py +49 -0
- mlflow/entities/run.py +97 -0
- mlflow/entities/run_data.py +84 -0
- mlflow/entities/run_info.py +188 -0
- mlflow/entities/run_inputs.py +59 -0
- mlflow/entities/run_outputs.py +43 -0
- mlflow/entities/run_status.py +41 -0
- mlflow/entities/run_tag.py +36 -0
- mlflow/entities/source_type.py +31 -0
- mlflow/entities/span.py +774 -0
- mlflow/entities/span_event.py +96 -0
- mlflow/entities/span_status.py +102 -0
- mlflow/entities/trace.py +317 -0
- mlflow/entities/trace_data.py +71 -0
- mlflow/entities/trace_info.py +220 -0
- mlflow/entities/trace_info_v2.py +162 -0
- mlflow/entities/trace_location.py +173 -0
- mlflow/entities/trace_state.py +39 -0
- mlflow/entities/trace_status.py +68 -0
- mlflow/entities/view_type.py +51 -0
- mlflow/environment_variables.py +866 -0
- mlflow/evaluation/__init__.py +16 -0
- mlflow/evaluation/assessment.py +369 -0
- mlflow/evaluation/evaluation.py +411 -0
- mlflow/evaluation/evaluation_tag.py +61 -0
- mlflow/evaluation/fluent.py +48 -0
- mlflow/evaluation/utils.py +201 -0
- mlflow/exceptions.py +213 -0
- mlflow/experiments.py +140 -0
- mlflow/gemini/__init__.py +81 -0
- mlflow/gemini/autolog.py +186 -0
- mlflow/gemini/chat.py +261 -0
- mlflow/genai/__init__.py +71 -0
- mlflow/genai/datasets/__init__.py +67 -0
- mlflow/genai/datasets/evaluation_dataset.py +131 -0
- mlflow/genai/evaluation/__init__.py +3 -0
- mlflow/genai/evaluation/base.py +411 -0
- mlflow/genai/evaluation/constant.py +23 -0
- mlflow/genai/evaluation/utils.py +244 -0
- mlflow/genai/judges/__init__.py +21 -0
- mlflow/genai/judges/databricks.py +404 -0
- mlflow/genai/label_schemas/__init__.py +153 -0
- mlflow/genai/label_schemas/label_schemas.py +209 -0
- mlflow/genai/labeling/__init__.py +159 -0
- mlflow/genai/labeling/labeling.py +250 -0
- mlflow/genai/optimize/__init__.py +13 -0
- mlflow/genai/optimize/base.py +198 -0
- mlflow/genai/optimize/optimizers/__init__.py +4 -0
- mlflow/genai/optimize/optimizers/base_optimizer.py +38 -0
- mlflow/genai/optimize/optimizers/dspy_mipro_optimizer.py +221 -0
- mlflow/genai/optimize/optimizers/dspy_optimizer.py +91 -0
- mlflow/genai/optimize/optimizers/utils/dspy_mipro_callback.py +76 -0
- mlflow/genai/optimize/optimizers/utils/dspy_mipro_utils.py +18 -0
- mlflow/genai/optimize/types.py +75 -0
- mlflow/genai/optimize/util.py +30 -0
- mlflow/genai/prompts/__init__.py +206 -0
- mlflow/genai/scheduled_scorers.py +431 -0
- mlflow/genai/scorers/__init__.py +26 -0
- mlflow/genai/scorers/base.py +492 -0
- mlflow/genai/scorers/builtin_scorers.py +765 -0
- mlflow/genai/scorers/scorer_utils.py +138 -0
- mlflow/genai/scorers/validation.py +165 -0
- mlflow/genai/utils/data_validation.py +146 -0
- mlflow/genai/utils/enum_utils.py +23 -0
- mlflow/genai/utils/trace_utils.py +211 -0
- mlflow/groq/__init__.py +42 -0
- mlflow/groq/_groq_autolog.py +74 -0
- mlflow/johnsnowlabs/__init__.py +888 -0
- mlflow/langchain/__init__.py +24 -0
- mlflow/langchain/api_request_parallel_processor.py +330 -0
- mlflow/langchain/autolog.py +147 -0
- mlflow/langchain/chat_agent_langgraph.py +340 -0
- mlflow/langchain/constant.py +1 -0
- mlflow/langchain/constants.py +1 -0
- mlflow/langchain/databricks_dependencies.py +444 -0
- mlflow/langchain/langchain_tracer.py +597 -0
- mlflow/langchain/model.py +919 -0
- mlflow/langchain/output_parsers.py +142 -0
- mlflow/langchain/retriever_chain.py +153 -0
- mlflow/langchain/runnables.py +527 -0
- mlflow/langchain/utils/chat.py +402 -0
- mlflow/langchain/utils/logging.py +671 -0
- mlflow/langchain/utils/serialization.py +36 -0
- mlflow/legacy_databricks_cli/__init__.py +0 -0
- mlflow/legacy_databricks_cli/configure/__init__.py +0 -0
- mlflow/legacy_databricks_cli/configure/provider.py +482 -0
- mlflow/litellm/__init__.py +175 -0
- mlflow/llama_index/__init__.py +22 -0
- mlflow/llama_index/autolog.py +55 -0
- mlflow/llama_index/chat.py +43 -0
- mlflow/llama_index/constant.py +1 -0
- mlflow/llama_index/model.py +577 -0
- mlflow/llama_index/pyfunc_wrapper.py +332 -0
- mlflow/llama_index/serialize_objects.py +188 -0
- mlflow/llama_index/tracer.py +561 -0
- mlflow/metrics/__init__.py +479 -0
- mlflow/metrics/base.py +39 -0
- mlflow/metrics/genai/__init__.py +25 -0
- mlflow/metrics/genai/base.py +101 -0
- mlflow/metrics/genai/genai_metric.py +771 -0
- mlflow/metrics/genai/metric_definitions.py +450 -0
- mlflow/metrics/genai/model_utils.py +371 -0
- mlflow/metrics/genai/prompt_template.py +68 -0
- mlflow/metrics/genai/prompts/__init__.py +0 -0
- mlflow/metrics/genai/prompts/v1.py +422 -0
- mlflow/metrics/genai/utils.py +6 -0
- mlflow/metrics/metric_definitions.py +619 -0
- mlflow/mismatch.py +34 -0
- mlflow/mistral/__init__.py +34 -0
- mlflow/mistral/autolog.py +71 -0
- mlflow/mistral/chat.py +135 -0
- mlflow/ml_package_versions.py +452 -0
- mlflow/models/__init__.py +97 -0
- mlflow/models/auth_policy.py +83 -0
- mlflow/models/cli.py +354 -0
- mlflow/models/container/__init__.py +294 -0
- mlflow/models/container/scoring_server/__init__.py +0 -0
- mlflow/models/container/scoring_server/nginx.conf +39 -0
- mlflow/models/dependencies_schemas.py +287 -0
- mlflow/models/display_utils.py +158 -0
- mlflow/models/docker_utils.py +211 -0
- mlflow/models/evaluation/__init__.py +23 -0
- mlflow/models/evaluation/_shap_patch.py +64 -0
- mlflow/models/evaluation/artifacts.py +194 -0
- mlflow/models/evaluation/base.py +1811 -0
- mlflow/models/evaluation/calibration_curve.py +109 -0
- mlflow/models/evaluation/default_evaluator.py +996 -0
- mlflow/models/evaluation/deprecated.py +23 -0
- mlflow/models/evaluation/evaluator_registry.py +80 -0
- mlflow/models/evaluation/evaluators/classifier.py +704 -0
- mlflow/models/evaluation/evaluators/default.py +233 -0
- mlflow/models/evaluation/evaluators/regressor.py +96 -0
- mlflow/models/evaluation/evaluators/shap.py +296 -0
- mlflow/models/evaluation/lift_curve.py +178 -0
- mlflow/models/evaluation/utils/metric.py +123 -0
- mlflow/models/evaluation/utils/trace.py +179 -0
- mlflow/models/evaluation/validation.py +434 -0
- mlflow/models/flavor_backend.py +93 -0
- mlflow/models/flavor_backend_registry.py +53 -0
- mlflow/models/model.py +1639 -0
- mlflow/models/model_config.py +150 -0
- mlflow/models/notebook_resources/agent_evaluation_template.html +235 -0
- mlflow/models/notebook_resources/eval_with_dataset_example.py +22 -0
- mlflow/models/notebook_resources/eval_with_synthetic_example.py +22 -0
- mlflow/models/python_api.py +369 -0
- mlflow/models/rag_signatures.py +128 -0
- mlflow/models/resources.py +321 -0
- mlflow/models/signature.py +662 -0
- mlflow/models/utils.py +2054 -0
- mlflow/models/wheeled_model.py +280 -0
- mlflow/openai/__init__.py +57 -0
- mlflow/openai/_agent_tracer.py +364 -0
- mlflow/openai/api_request_parallel_processor.py +131 -0
- mlflow/openai/autolog.py +509 -0
- mlflow/openai/constant.py +1 -0
- mlflow/openai/model.py +824 -0
- mlflow/openai/utils/chat_schema.py +367 -0
- mlflow/optuna/__init__.py +3 -0
- mlflow/optuna/storage.py +646 -0
- mlflow/plugins/__init__.py +72 -0
- mlflow/plugins/base.py +358 -0
- mlflow/plugins/builtin/__init__.py +24 -0
- mlflow/plugins/builtin/pytorch_plugin.py +150 -0
- mlflow/plugins/builtin/sklearn_plugin.py +158 -0
- mlflow/plugins/builtin/transformers_plugin.py +187 -0
- mlflow/plugins/cli.py +321 -0
- mlflow/plugins/discovery.py +340 -0
- mlflow/plugins/manager.py +465 -0
- mlflow/plugins/registry.py +316 -0
- mlflow/plugins/templates/framework_plugin_template.py +329 -0
- mlflow/prompt/constants.py +20 -0
- mlflow/prompt/promptlab_model.py +197 -0
- mlflow/prompt/registry_utils.py +248 -0
- mlflow/promptflow/__init__.py +495 -0
- mlflow/protos/__init__.py +0 -0
- mlflow/protos/assessments_pb2.py +174 -0
- mlflow/protos/databricks_artifacts_pb2.py +489 -0
- mlflow/protos/databricks_filesystem_service_pb2.py +196 -0
- mlflow/protos/databricks_managed_catalog_messages_pb2.py +95 -0
- mlflow/protos/databricks_managed_catalog_service_pb2.py +86 -0
- mlflow/protos/databricks_pb2.py +267 -0
- mlflow/protos/databricks_trace_server_pb2.py +374 -0
- mlflow/protos/databricks_uc_registry_messages_pb2.py +1249 -0
- mlflow/protos/databricks_uc_registry_service_pb2.py +170 -0
- mlflow/protos/facet_feature_statistics_pb2.py +296 -0
- mlflow/protos/internal_pb2.py +77 -0
- mlflow/protos/mlflow_artifacts_pb2.py +336 -0
- mlflow/protos/model_registry_pb2.py +1073 -0
- mlflow/protos/scalapb/__init__.py +0 -0
- mlflow/protos/scalapb/scalapb_pb2.py +104 -0
- mlflow/protos/service_pb2.py +2600 -0
- mlflow/protos/unity_catalog_oss_messages_pb2.py +457 -0
- mlflow/protos/unity_catalog_oss_service_pb2.py +130 -0
- mlflow/protos/unity_catalog_prompt_messages_pb2.py +447 -0
- mlflow/protos/unity_catalog_prompt_messages_pb2_grpc.py +24 -0
- mlflow/protos/unity_catalog_prompt_service_pb2.py +164 -0
- mlflow/protos/unity_catalog_prompt_service_pb2_grpc.py +785 -0
- mlflow/py.typed +0 -0
- mlflow/pydantic_ai/__init__.py +57 -0
- mlflow/pydantic_ai/autolog.py +173 -0
- mlflow/pyfunc/__init__.py +3844 -0
- mlflow/pyfunc/_mlflow_pyfunc_backend_predict.py +61 -0
- mlflow/pyfunc/backend.py +523 -0
- mlflow/pyfunc/context.py +78 -0
- mlflow/pyfunc/dbconnect_artifact_cache.py +144 -0
- mlflow/pyfunc/loaders/__init__.py +7 -0
- mlflow/pyfunc/loaders/chat_agent.py +117 -0
- mlflow/pyfunc/loaders/chat_model.py +125 -0
- mlflow/pyfunc/loaders/code_model.py +31 -0
- mlflow/pyfunc/loaders/responses_agent.py +112 -0
- mlflow/pyfunc/mlserver.py +46 -0
- mlflow/pyfunc/model.py +1473 -0
- mlflow/pyfunc/scoring_server/__init__.py +604 -0
- mlflow/pyfunc/scoring_server/app.py +7 -0
- mlflow/pyfunc/scoring_server/client.py +146 -0
- mlflow/pyfunc/spark_model_cache.py +48 -0
- mlflow/pyfunc/stdin_server.py +44 -0
- mlflow/pyfunc/utils/__init__.py +3 -0
- mlflow/pyfunc/utils/data_validation.py +224 -0
- mlflow/pyfunc/utils/environment.py +22 -0
- mlflow/pyfunc/utils/input_converter.py +47 -0
- mlflow/pyfunc/utils/serving_data_parser.py +11 -0
- mlflow/pytorch/__init__.py +1171 -0
- mlflow/pytorch/_lightning_autolog.py +580 -0
- mlflow/pytorch/_pytorch_autolog.py +50 -0
- mlflow/pytorch/pickle_module.py +35 -0
- mlflow/rfunc/__init__.py +42 -0
- mlflow/rfunc/backend.py +134 -0
- mlflow/runs.py +89 -0
- mlflow/server/__init__.py +302 -0
- mlflow/server/auth/__init__.py +1224 -0
- mlflow/server/auth/__main__.py +4 -0
- mlflow/server/auth/basic_auth.ini +6 -0
- mlflow/server/auth/cli.py +11 -0
- mlflow/server/auth/client.py +537 -0
- mlflow/server/auth/config.py +34 -0
- mlflow/server/auth/db/__init__.py +0 -0
- mlflow/server/auth/db/cli.py +18 -0
- mlflow/server/auth/db/migrations/__init__.py +0 -0
- mlflow/server/auth/db/migrations/alembic.ini +110 -0
- mlflow/server/auth/db/migrations/env.py +76 -0
- mlflow/server/auth/db/migrations/versions/8606fa83a998_initial_migration.py +51 -0
- mlflow/server/auth/db/migrations/versions/__init__.py +0 -0
- mlflow/server/auth/db/models.py +67 -0
- mlflow/server/auth/db/utils.py +37 -0
- mlflow/server/auth/entities.py +165 -0
- mlflow/server/auth/logo.py +14 -0
- mlflow/server/auth/permissions.py +65 -0
- mlflow/server/auth/routes.py +18 -0
- mlflow/server/auth/sqlalchemy_store.py +263 -0
- mlflow/server/graphql/__init__.py +0 -0
- mlflow/server/graphql/autogenerated_graphql_schema.py +353 -0
- mlflow/server/graphql/graphql_custom_scalars.py +24 -0
- mlflow/server/graphql/graphql_errors.py +15 -0
- mlflow/server/graphql/graphql_no_batching.py +89 -0
- mlflow/server/graphql/graphql_schema_extensions.py +74 -0
- mlflow/server/handlers.py +3217 -0
- mlflow/server/prometheus_exporter.py +17 -0
- mlflow/server/validation.py +30 -0
- mlflow/shap/__init__.py +691 -0
- mlflow/sklearn/__init__.py +1994 -0
- mlflow/sklearn/utils.py +1041 -0
- mlflow/smolagents/__init__.py +66 -0
- mlflow/smolagents/autolog.py +139 -0
- mlflow/smolagents/chat.py +29 -0
- mlflow/store/__init__.py +10 -0
- mlflow/store/_unity_catalog/__init__.py +1 -0
- mlflow/store/_unity_catalog/lineage/__init__.py +1 -0
- mlflow/store/_unity_catalog/lineage/constants.py +2 -0
- mlflow/store/_unity_catalog/registry/__init__.py +6 -0
- mlflow/store/_unity_catalog/registry/prompt_info.py +75 -0
- mlflow/store/_unity_catalog/registry/rest_store.py +1740 -0
- mlflow/store/_unity_catalog/registry/uc_oss_rest_store.py +507 -0
- mlflow/store/_unity_catalog/registry/utils.py +121 -0
- mlflow/store/artifact/__init__.py +0 -0
- mlflow/store/artifact/artifact_repo.py +472 -0
- mlflow/store/artifact/artifact_repository_registry.py +154 -0
- mlflow/store/artifact/azure_blob_artifact_repo.py +275 -0
- mlflow/store/artifact/azure_data_lake_artifact_repo.py +295 -0
- mlflow/store/artifact/cli.py +141 -0
- mlflow/store/artifact/cloud_artifact_repo.py +332 -0
- mlflow/store/artifact/databricks_artifact_repo.py +729 -0
- mlflow/store/artifact/databricks_artifact_repo_resources.py +301 -0
- mlflow/store/artifact/databricks_logged_model_artifact_repo.py +93 -0
- mlflow/store/artifact/databricks_models_artifact_repo.py +216 -0
- mlflow/store/artifact/databricks_sdk_artifact_repo.py +134 -0
- mlflow/store/artifact/databricks_sdk_models_artifact_repo.py +97 -0
- mlflow/store/artifact/dbfs_artifact_repo.py +240 -0
- mlflow/store/artifact/ftp_artifact_repo.py +132 -0
- mlflow/store/artifact/gcs_artifact_repo.py +296 -0
- mlflow/store/artifact/hdfs_artifact_repo.py +209 -0
- mlflow/store/artifact/http_artifact_repo.py +218 -0
- mlflow/store/artifact/local_artifact_repo.py +142 -0
- mlflow/store/artifact/mlflow_artifacts_repo.py +94 -0
- mlflow/store/artifact/models_artifact_repo.py +259 -0
- mlflow/store/artifact/optimized_s3_artifact_repo.py +356 -0
- mlflow/store/artifact/presigned_url_artifact_repo.py +173 -0
- mlflow/store/artifact/r2_artifact_repo.py +70 -0
- mlflow/store/artifact/runs_artifact_repo.py +265 -0
- mlflow/store/artifact/s3_artifact_repo.py +330 -0
- mlflow/store/artifact/sftp_artifact_repo.py +141 -0
- mlflow/store/artifact/uc_volume_artifact_repo.py +76 -0
- mlflow/store/artifact/unity_catalog_models_artifact_repo.py +168 -0
- mlflow/store/artifact/unity_catalog_oss_models_artifact_repo.py +168 -0
- mlflow/store/artifact/utils/__init__.py +0 -0
- mlflow/store/artifact/utils/models.py +148 -0
- mlflow/store/db/__init__.py +0 -0
- mlflow/store/db/base_sql_model.py +3 -0
- mlflow/store/db/db_types.py +10 -0
- mlflow/store/db/utils.py +314 -0
- mlflow/store/db_migrations/__init__.py +0 -0
- mlflow/store/db_migrations/alembic.ini +74 -0
- mlflow/store/db_migrations/env.py +84 -0
- mlflow/store/db_migrations/versions/0584bdc529eb_add_cascading_deletion_to_datasets_from_experiments.py +88 -0
- mlflow/store/db_migrations/versions/0a8213491aaa_drop_duplicate_killed_constraint.py +49 -0
- mlflow/store/db_migrations/versions/0c779009ac13_add_deleted_time_field_to_runs_table.py +24 -0
- mlflow/store/db_migrations/versions/181f10493468_allow_nulls_for_metric_values.py +35 -0
- mlflow/store/db_migrations/versions/27a6a02d2cf1_add_model_version_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/2b4d017a5e9b_add_model_registry_tables_to_db.py +77 -0
- mlflow/store/db_migrations/versions/2d6e25af4d3e_increase_max_param_val_length.py +33 -0
- mlflow/store/db_migrations/versions/3500859a5d39_add_model_aliases_table.py +50 -0
- mlflow/store/db_migrations/versions/39d1c3be5f05_add_is_nan_constraint_for_metrics_tables_if_necessary.py +41 -0
- mlflow/store/db_migrations/versions/400f98739977_add_logged_model_tables.py +123 -0
- mlflow/store/db_migrations/versions/4465047574b1_increase_max_dataset_schema_size.py +38 -0
- mlflow/store/db_migrations/versions/451aebb31d03_add_metric_step.py +35 -0
- mlflow/store/db_migrations/versions/5b0e9adcef9c_add_cascade_deletion_to_trace_tables_fk.py +40 -0
- mlflow/store/db_migrations/versions/6953534de441_add_step_to_inputs_table.py +25 -0
- mlflow/store/db_migrations/versions/728d730b5ebd_add_registered_model_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/7ac759974ad8_update_run_tags_with_larger_limit.py +36 -0
- mlflow/store/db_migrations/versions/7f2a7d5fae7d_add_datasets_inputs_input_tags_tables.py +82 -0
- mlflow/store/db_migrations/versions/84291f40a231_add_run_link_to_model_version.py +26 -0
- mlflow/store/db_migrations/versions/867495a8f9d4_add_trace_tables.py +90 -0
- mlflow/store/db_migrations/versions/89d4b8295536_create_latest_metrics_table.py +169 -0
- mlflow/store/db_migrations/versions/90e64c465722_migrate_user_column_to_tags.py +64 -0
- mlflow/store/db_migrations/versions/97727af70f4d_creation_time_last_update_time_experiments.py +25 -0
- mlflow/store/db_migrations/versions/__init__.py +0 -0
- mlflow/store/db_migrations/versions/a8c4a736bde6_allow_nulls_for_run_id.py +27 -0
- mlflow/store/db_migrations/versions/acf3f17fdcc7_add_storage_location_field_to_model_.py +29 -0
- mlflow/store/db_migrations/versions/bd07f7e963c5_create_index_on_run_uuid.py +26 -0
- mlflow/store/db_migrations/versions/bda7b8c39065_increase_model_version_tag_value_limit.py +38 -0
- mlflow/store/db_migrations/versions/c48cb773bb87_reset_default_value_for_is_nan_in_metrics_table_for_mysql.py +41 -0
- mlflow/store/db_migrations/versions/cbc13b556ace_add_v3_trace_schema_columns.py +31 -0
- mlflow/store/db_migrations/versions/cc1f77228345_change_param_value_length_to_500.py +34 -0
- mlflow/store/db_migrations/versions/cfd24bdc0731_update_run_status_constraint_with_killed.py +78 -0
- mlflow/store/db_migrations/versions/df50e92ffc5e_add_experiment_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/f5a4f2784254_increase_run_tag_value_limit.py +36 -0
- mlflow/store/entities/__init__.py +3 -0
- mlflow/store/entities/paged_list.py +18 -0
- mlflow/store/model_registry/__init__.py +10 -0
- mlflow/store/model_registry/abstract_store.py +1081 -0
- mlflow/store/model_registry/base_rest_store.py +44 -0
- mlflow/store/model_registry/databricks_workspace_model_registry_rest_store.py +37 -0
- mlflow/store/model_registry/dbmodels/__init__.py +0 -0
- mlflow/store/model_registry/dbmodels/models.py +206 -0
- mlflow/store/model_registry/file_store.py +1091 -0
- mlflow/store/model_registry/rest_store.py +481 -0
- mlflow/store/model_registry/sqlalchemy_store.py +1286 -0
- mlflow/store/tracking/__init__.py +23 -0
- mlflow/store/tracking/abstract_store.py +816 -0
- mlflow/store/tracking/dbmodels/__init__.py +0 -0
- mlflow/store/tracking/dbmodels/initial_models.py +243 -0
- mlflow/store/tracking/dbmodels/models.py +1073 -0
- mlflow/store/tracking/file_store.py +2438 -0
- mlflow/store/tracking/postgres_managed_identity.py +146 -0
- mlflow/store/tracking/rest_store.py +1131 -0
- mlflow/store/tracking/sqlalchemy_store.py +2785 -0
- mlflow/system_metrics/__init__.py +61 -0
- mlflow/system_metrics/metrics/__init__.py +0 -0
- mlflow/system_metrics/metrics/base_metrics_monitor.py +32 -0
- mlflow/system_metrics/metrics/cpu_monitor.py +23 -0
- mlflow/system_metrics/metrics/disk_monitor.py +21 -0
- mlflow/system_metrics/metrics/gpu_monitor.py +71 -0
- mlflow/system_metrics/metrics/network_monitor.py +34 -0
- mlflow/system_metrics/metrics/rocm_monitor.py +123 -0
- mlflow/system_metrics/system_metrics_monitor.py +198 -0
- mlflow/tracing/__init__.py +16 -0
- mlflow/tracing/assessment.py +356 -0
- mlflow/tracing/client.py +531 -0
- mlflow/tracing/config.py +125 -0
- mlflow/tracing/constant.py +105 -0
- mlflow/tracing/destination.py +81 -0
- mlflow/tracing/display/__init__.py +40 -0
- mlflow/tracing/display/display_handler.py +196 -0
- mlflow/tracing/export/async_export_queue.py +186 -0
- mlflow/tracing/export/inference_table.py +138 -0
- mlflow/tracing/export/mlflow_v3.py +137 -0
- mlflow/tracing/export/utils.py +70 -0
- mlflow/tracing/fluent.py +1417 -0
- mlflow/tracing/processor/base_mlflow.py +199 -0
- mlflow/tracing/processor/inference_table.py +175 -0
- mlflow/tracing/processor/mlflow_v3.py +47 -0
- mlflow/tracing/processor/otel.py +73 -0
- mlflow/tracing/provider.py +487 -0
- mlflow/tracing/trace_manager.py +200 -0
- mlflow/tracing/utils/__init__.py +616 -0
- mlflow/tracing/utils/artifact_utils.py +28 -0
- mlflow/tracing/utils/copy.py +55 -0
- mlflow/tracing/utils/environment.py +55 -0
- mlflow/tracing/utils/exception.py +21 -0
- mlflow/tracing/utils/once.py +35 -0
- mlflow/tracing/utils/otlp.py +63 -0
- mlflow/tracing/utils/processor.py +54 -0
- mlflow/tracing/utils/search.py +292 -0
- mlflow/tracing/utils/timeout.py +250 -0
- mlflow/tracing/utils/token.py +19 -0
- mlflow/tracing/utils/truncation.py +124 -0
- mlflow/tracing/utils/warning.py +76 -0
- mlflow/tracking/__init__.py +39 -0
- mlflow/tracking/_model_registry/__init__.py +1 -0
- mlflow/tracking/_model_registry/client.py +764 -0
- mlflow/tracking/_model_registry/fluent.py +853 -0
- mlflow/tracking/_model_registry/registry.py +67 -0
- mlflow/tracking/_model_registry/utils.py +251 -0
- mlflow/tracking/_tracking_service/__init__.py +0 -0
- mlflow/tracking/_tracking_service/client.py +883 -0
- mlflow/tracking/_tracking_service/registry.py +56 -0
- mlflow/tracking/_tracking_service/utils.py +275 -0
- mlflow/tracking/artifact_utils.py +179 -0
- mlflow/tracking/client.py +5900 -0
- mlflow/tracking/context/__init__.py +0 -0
- mlflow/tracking/context/abstract_context.py +35 -0
- mlflow/tracking/context/databricks_cluster_context.py +15 -0
- mlflow/tracking/context/databricks_command_context.py +15 -0
- mlflow/tracking/context/databricks_job_context.py +49 -0
- mlflow/tracking/context/databricks_notebook_context.py +41 -0
- mlflow/tracking/context/databricks_repo_context.py +43 -0
- mlflow/tracking/context/default_context.py +51 -0
- mlflow/tracking/context/git_context.py +32 -0
- mlflow/tracking/context/registry.py +98 -0
- mlflow/tracking/context/system_environment_context.py +15 -0
- mlflow/tracking/default_experiment/__init__.py +1 -0
- mlflow/tracking/default_experiment/abstract_context.py +43 -0
- mlflow/tracking/default_experiment/databricks_notebook_experiment_provider.py +44 -0
- mlflow/tracking/default_experiment/registry.py +75 -0
- mlflow/tracking/fluent.py +3595 -0
- mlflow/tracking/metric_value_conversion_utils.py +93 -0
- mlflow/tracking/multimedia.py +206 -0
- mlflow/tracking/registry.py +86 -0
- mlflow/tracking/request_auth/__init__.py +0 -0
- mlflow/tracking/request_auth/abstract_request_auth_provider.py +34 -0
- mlflow/tracking/request_auth/registry.py +60 -0
- mlflow/tracking/request_header/__init__.py +0 -0
- mlflow/tracking/request_header/abstract_request_header_provider.py +36 -0
- mlflow/tracking/request_header/databricks_request_header_provider.py +38 -0
- mlflow/tracking/request_header/default_request_header_provider.py +17 -0
- mlflow/tracking/request_header/registry.py +79 -0
- mlflow/transformers/__init__.py +2982 -0
- mlflow/transformers/flavor_config.py +258 -0
- mlflow/transformers/hub_utils.py +83 -0
- mlflow/transformers/llm_inference_utils.py +468 -0
- mlflow/transformers/model_io.py +301 -0
- mlflow/transformers/peft.py +51 -0
- mlflow/transformers/signature.py +183 -0
- mlflow/transformers/torch_utils.py +55 -0
- mlflow/types/__init__.py +21 -0
- mlflow/types/agent.py +270 -0
- mlflow/types/chat.py +240 -0
- mlflow/types/llm.py +935 -0
- mlflow/types/responses.py +139 -0
- mlflow/types/responses_helpers.py +416 -0
- mlflow/types/schema.py +1505 -0
- mlflow/types/type_hints.py +647 -0
- mlflow/types/utils.py +753 -0
- mlflow/utils/__init__.py +283 -0
- mlflow/utils/_capture_modules.py +256 -0
- mlflow/utils/_capture_transformers_modules.py +75 -0
- mlflow/utils/_spark_utils.py +201 -0
- mlflow/utils/_unity_catalog_oss_utils.py +97 -0
- mlflow/utils/_unity_catalog_utils.py +479 -0
- mlflow/utils/annotations.py +218 -0
- mlflow/utils/arguments_utils.py +16 -0
- mlflow/utils/async_logging/__init__.py +1 -0
- mlflow/utils/async_logging/async_artifacts_logging_queue.py +258 -0
- mlflow/utils/async_logging/async_logging_queue.py +366 -0
- mlflow/utils/async_logging/run_artifact.py +38 -0
- mlflow/utils/async_logging/run_batch.py +58 -0
- mlflow/utils/async_logging/run_operations.py +49 -0
- mlflow/utils/autologging_utils/__init__.py +737 -0
- mlflow/utils/autologging_utils/client.py +432 -0
- mlflow/utils/autologging_utils/config.py +33 -0
- mlflow/utils/autologging_utils/events.py +294 -0
- mlflow/utils/autologging_utils/logging_and_warnings.py +328 -0
- mlflow/utils/autologging_utils/metrics_queue.py +71 -0
- mlflow/utils/autologging_utils/safety.py +1104 -0
- mlflow/utils/autologging_utils/versioning.py +95 -0
- mlflow/utils/checkpoint_utils.py +206 -0
- mlflow/utils/class_utils.py +6 -0
- mlflow/utils/cli_args.py +257 -0
- mlflow/utils/conda.py +354 -0
- mlflow/utils/credentials.py +231 -0
- mlflow/utils/data_utils.py +17 -0
- mlflow/utils/databricks_utils.py +1436 -0
- mlflow/utils/docstring_utils.py +477 -0
- mlflow/utils/doctor.py +133 -0
- mlflow/utils/download_cloud_file_chunk.py +43 -0
- mlflow/utils/env_manager.py +16 -0
- mlflow/utils/env_pack.py +131 -0
- mlflow/utils/environment.py +1009 -0
- mlflow/utils/exception_utils.py +14 -0
- mlflow/utils/file_utils.py +978 -0
- mlflow/utils/git_utils.py +77 -0
- mlflow/utils/gorilla.py +797 -0
- mlflow/utils/import_hooks/__init__.py +363 -0
- mlflow/utils/lazy_load.py +51 -0
- mlflow/utils/logging_utils.py +168 -0
- mlflow/utils/mime_type_utils.py +58 -0
- mlflow/utils/mlflow_tags.py +103 -0
- mlflow/utils/model_utils.py +486 -0
- mlflow/utils/name_utils.py +346 -0
- mlflow/utils/nfs_on_spark.py +62 -0
- mlflow/utils/openai_utils.py +164 -0
- mlflow/utils/os.py +12 -0
- mlflow/utils/oss_registry_utils.py +29 -0
- mlflow/utils/plugins.py +17 -0
- mlflow/utils/process.py +182 -0
- mlflow/utils/promptlab_utils.py +146 -0
- mlflow/utils/proto_json_utils.py +743 -0
- mlflow/utils/pydantic_utils.py +54 -0
- mlflow/utils/request_utils.py +279 -0
- mlflow/utils/requirements_utils.py +704 -0
- mlflow/utils/rest_utils.py +673 -0
- mlflow/utils/search_logged_model_utils.py +127 -0
- mlflow/utils/search_utils.py +2111 -0
- mlflow/utils/secure_loading.py +221 -0
- mlflow/utils/security_validation.py +384 -0
- mlflow/utils/server_cli_utils.py +61 -0
- mlflow/utils/spark_utils.py +15 -0
- mlflow/utils/string_utils.py +138 -0
- mlflow/utils/thread_utils.py +63 -0
- mlflow/utils/time.py +54 -0
- mlflow/utils/timeout.py +42 -0
- mlflow/utils/uri.py +572 -0
- mlflow/utils/validation.py +662 -0
- mlflow/utils/virtualenv.py +458 -0
- mlflow/utils/warnings_utils.py +25 -0
- mlflow/utils/yaml_utils.py +179 -0
- mlflow/version.py +24 -0
@@ -0,0 +1,406 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from functools import cached_property
|
4
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
5
|
+
|
6
|
+
from packaging.version import Version
|
7
|
+
|
8
|
+
from mlflow.data.dataset import Dataset
|
9
|
+
from mlflow.data.dataset_source import DatasetSource
|
10
|
+
from mlflow.data.delta_dataset_source import DeltaDatasetSource
|
11
|
+
from mlflow.data.digest_utils import get_normalized_md5_digest
|
12
|
+
from mlflow.data.evaluation_dataset import EvaluationDataset
|
13
|
+
from mlflow.data.pyfunc_dataset_mixin import PyFuncConvertibleDatasetMixin, PyFuncInputsOutputs
|
14
|
+
from mlflow.data.spark_dataset_source import SparkDatasetSource
|
15
|
+
from mlflow.exceptions import MlflowException
|
16
|
+
from mlflow.protos.databricks_pb2 import INTERNAL_ERROR, INVALID_PARAMETER_VALUE
|
17
|
+
from mlflow.types import Schema
|
18
|
+
from mlflow.types.utils import _infer_schema
|
19
|
+
|
20
|
+
if TYPE_CHECKING:
|
21
|
+
import pyspark
|
22
|
+
|
23
|
+
_logger = logging.getLogger(__name__)
|
24
|
+
|
25
|
+
|
26
|
+
class SparkDataset(Dataset, PyFuncConvertibleDatasetMixin):
    """
    Represents a Spark dataset (e.g. data derived from a Spark Table / file directory or Delta
    Table) for use with MLflow Tracking.
    """

    def __init__(
        self,
        df: "pyspark.sql.DataFrame",
        source: DatasetSource,
        targets: Optional[str] = None,
        name: Optional[str] = None,
        digest: Optional[str] = None,
        predictions: Optional[str] = None,
    ):
        # Validate the referenced columns eagerly so that a bad `targets` / `predictions`
        # name fails at construction time rather than later during conversion/evaluation.
        if targets is not None and targets not in df.columns:
            raise MlflowException(
                f"The specified Spark dataset does not contain the specified targets column"
                f" '{targets}'.",
                INVALID_PARAMETER_VALUE,
            )
        if predictions is not None and predictions not in df.columns:
            raise MlflowException(
                f"The specified Spark dataset does not contain the specified predictions column"
                f" '{predictions}'.",
                INVALID_PARAMETER_VALUE,
            )

        self._df = df
        self._targets = targets
        self._predictions = predictions
        # The base class computes the digest via _compute_digest() when `digest` is None.
        super().__init__(source=source, name=name, digest=digest)

    def _compute_digest(self) -> str:
        """
        Computes a digest for the dataset. Called if the user doesn't supply
        a digest when constructing the dataset.
        """
        # Retrieve a semantic hash of the DataFrame's logical plan, which is much more efficient
        # and deterministic than hashing DataFrame records
        import numpy as np
        import pyspark

        # Spark 3.1.0+ has a semanticHash() method on DataFrame
        if Version(pyspark.__version__) >= Version("3.1.0"):
            semantic_hash = self._df.semanticHash()
        else:
            # Older Spark: obtain the equivalent hash from the analyzed JVM query plan.
            semantic_hash = self._df._jdf.queryExecution().analyzed().semanticHash()
        return get_normalized_md5_digest([np.int64(semantic_hash)])

    def to_dict(self) -> dict[str, str]:
        """Create config dictionary for the dataset.

        Returns a string dictionary containing the following fields: name, digest, source, source
        type, schema, and profile.
        """
        # `schema` may be None when inference failed (see the `schema` property below).
        schema = json.dumps({"mlflow_colspec": self.schema.to_dict()}) if self.schema else None
        config = super().to_dict()
        config.update(
            {
                "schema": schema,
                "profile": json.dumps(self.profile),
            }
        )
        return config

    @property
    def df(self):
        """The Spark DataFrame instance.

        Returns:
            The Spark DataFrame instance.

        """
        return self._df

    @property
    def targets(self) -> Optional[str]:
        """The name of the Spark DataFrame column containing targets (labels) for supervised
        learning.

        Returns:
            The string name of the Spark DataFrame column containing targets.
        """
        return self._targets

    @property
    def predictions(self) -> Optional[str]:
        """
        The name of the predictions column. May be ``None`` if no predictions column
        was specified when the dataset was created.
        """
        return self._predictions

    @property
    def source(self) -> Union[SparkDatasetSource, DeltaDatasetSource]:
        """
        Spark dataset source information.

        Returns:
            An instance of
            :py:class:`SparkDatasetSource <mlflow.data.spark_dataset_source.SparkDatasetSource>` or
            :py:class:`DeltaDatasetSource <mlflow.data.delta_dataset_source.DeltaDatasetSource>`.
        """
        return self._source

    @property
    def profile(self) -> Optional[Any]:
        """
        A profile of the dataset. May be None if no profile is available.
        """
        try:
            from pyspark.rdd import BoundedFloat

            # Use Spark RDD countApprox to get approximate count since count() may be expensive.
            # Note that we call the Scala RDD API because the PySpark API does not respect the
            # specified timeout. Reference code:
            # https://spark.apache.org/docs/3.4.0/api/python/_modules/pyspark/rdd.html
            # #RDD.countApprox. This is confirmed to work in all Spark 3.x versions
            py_rdd = self.df.rdd
            # Each partition contributes its row count as a float; summing these on the
            # JVM side yields the (approximate) total row count.
            drdd = py_rdd.mapPartitions(lambda it: [float(sum(1 for i in it))])
            jrdd = drdd.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd()
            jdrdd = drdd.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd())
            timeout_millis = 5000
            confidence = 0.9
            approx_count_operation = jdrdd.sumApprox(timeout_millis, confidence)
            approx_count_result = approx_count_operation.initialValue()
            approx_count_float = BoundedFloat(
                mean=approx_count_result.mean(),
                confidence=approx_count_result.confidence(),
                low=approx_count_result.low(),
                high=approx_count_result.high(),
            )
            approx_count = int(approx_count_float)
            if approx_count <= 0:
                # An approximate count of zero likely indicates that the count timed
                # out before an estimate could be made. In this case, we use the value
                # "unknown" so that users don't think the dataset is empty
                approx_count = "unknown"

            return {
                "approx_count": approx_count,
            }
        except Exception as e:
            _logger.warning(
                "Encountered an unexpected exception while computing Spark dataset profile."
                " Exception: %s",
                e,
            )
            # Falls through and implicitly returns None when no profile can be computed.

    @cached_property
    def schema(self) -> Optional[Schema]:
        """
        The MLflow ColSpec schema of the Spark dataset.
        """
        # Cached because schema inference may be expensive; returns None (rather than
        # raising) when inference fails so that logging the dataset still succeeds.
        try:
            return _infer_schema(self._df)
        except Exception as e:
            _logger.warning("Failed to infer schema for Spark dataset. Exception: %s", e)
            return None

    def to_pyfunc(self) -> PyFuncInputsOutputs:
        """
        Converts the Spark DataFrame to pandas and splits the resulting
        :py:class:`pandas.DataFrame` into: 1. a :py:class:`pandas.DataFrame` of features and
        2. a :py:class:`pandas.Series` of targets.

        To avoid overuse of driver memory, only the first 10,000 DataFrame rows are selected.
        """
        df = self._df.limit(10000).toPandas()
        if self._targets is not None:
            if self._targets not in df.columns:
                raise MlflowException(
                    f"Failed to convert Spark dataset to pyfunc inputs and outputs because"
                    f" the pandas representation of the Spark dataset does not contain the"
                    f" specified targets column '{self._targets}'.",
                    # This is an internal error because we should have validated the presence of
                    # the target column in the Spark dataset at construction time
                    INTERNAL_ERROR,
                )
            inputs = df.drop(columns=self._targets)
            outputs = df[self._targets]
            return PyFuncInputsOutputs(inputs=inputs, outputs=outputs)
        else:
            return PyFuncInputsOutputs(inputs=df, outputs=None)

    def to_evaluation_dataset(self, path=None, feature_names=None) -> EvaluationDataset:
        """
        Converts the dataset to an EvaluationDataset for model evaluation. Required
        for use with mlflow.evaluate().
        """
        # Same 10,000-row cap as to_pyfunc() to bound driver memory usage.
        return EvaluationDataset(
            data=self._df.limit(10000).toPandas(),
            targets=self._targets,
            path=path,
            feature_names=feature_names,
            predictions=self._predictions,
            name=self.name,
            digest=self.digest,
        )
|
226
|
+
|
227
|
+
|
228
|
+
def load_delta(
    path: Optional[str] = None,
    table_name: Optional[str] = None,
    version: Optional[str] = None,
    targets: Optional[str] = None,
    name: Optional[str] = None,
    digest: Optional[str] = None,
) -> SparkDataset:
    """
    Loads a :py:class:`SparkDataset <mlflow.data.spark_dataset.SparkDataset>` from a Delta table
    for use with MLflow Tracking.

    Args:
        path: The path to the Delta table. Either ``path`` or ``table_name`` must be specified.
        table_name: The name of the Delta table. Either ``path`` or ``table_name`` must be
            specified.
        version: The Delta table version. If not specified, the version will be inferred.
        targets: Optional. The name of the Delta table column containing targets (labels) for
            supervised learning.
        name: The name of the dataset. E.g. "wiki_train". If unspecified and a table name was
            given, a name of the form ``<table_name>@v<version>`` is generated.
        digest: The digest (hash, fingerprint) of the dataset. If unspecified, a digest
            is automatically computed.

    Returns:
        An instance of :py:class:`SparkDataset <mlflow.data.spark_dataset.SparkDataset>`.
    """
    from mlflow.data.spark_delta_utils import (
        _try_get_delta_table_latest_version_from_path,
        _try_get_delta_table_latest_version_from_table_name,
    )

    # Exactly one of `path` / `table_name` may be provided; reject both-or-neither.
    if (path is None) == (table_name is None):
        raise MlflowException(
            "Must specify exactly one of `table_name` or `path`.",
            INVALID_PARAMETER_VALUE,
        )

    # Resolve the latest table version when the caller did not pin one.
    if version is None:
        version = (
            _try_get_delta_table_latest_version_from_path(path)
            if path is not None
            else _try_get_delta_table_latest_version_from_table_name(table_name)
        )

    # Derive a human-readable dataset name from the table name and resolved version.
    if name is None and table_name is not None:
        version_suffix = f"@v{version}" if version is not None else ""
        name = table_name + version_suffix

    source = DeltaDatasetSource(path=path, delta_table_name=table_name, delta_table_version=version)

    return SparkDataset(
        df=source.load(),
        source=source,
        targets=targets,
        name=name,
        digest=digest,
    )
|
285
|
+
|
286
|
+
|
287
|
+
def from_spark(
    df: "pyspark.sql.DataFrame",
    path: Optional[str] = None,
    table_name: Optional[str] = None,
    version: Optional[str] = None,
    sql: Optional[str] = None,
    targets: Optional[str] = None,
    name: Optional[str] = None,
    digest: Optional[str] = None,
    predictions: Optional[str] = None,
) -> SparkDataset:
    """
    Given a Spark DataFrame, constructs a
    :py:class:`SparkDataset <mlflow.data.spark_dataset.SparkDataset>` object for use with
    MLflow Tracking.

    Args:
        df: The Spark DataFrame from which to construct a SparkDataset.
        path: The path of the Spark or Delta source that the DataFrame originally came from. The
            path does not have to match the DataFrame exactly, since the DataFrame may have been
            modified by Spark operations; it is used to reload the dataset upon request via
            :py:func:`SparkDataset.source.load()
            <mlflow.data.spark_dataset_source.SparkDatasetSource.load>`. If none of ``path``,
            ``table_name``, or ``sql`` are specified, a CodeDatasetSource is used, which will
            source information from the run context.
        table_name: The name of the Spark or Delta table that the DataFrame originally came from.
            The table does not have to match the DataFrame exactly; it is used to reload the
            dataset upon request via :py:func:`SparkDataset.source.load()
            <mlflow.data.spark_dataset_source.SparkDatasetSource.load>`. If none of ``path``,
            ``table_name``, or ``sql`` are specified, a CodeDatasetSource is used, which will
            source information from the run context.
        version: If the DataFrame originally came from a Delta table, specifies the version of
            the Delta table, used to reload the dataset upon request via
            :py:func:`SparkDataset.source.load()
            <mlflow.data.spark_dataset_source.SparkDatasetSource.load>`. ``version`` cannot be
            specified if ``sql`` is specified.
        sql: The Spark SQL statement that was originally used to construct the DataFrame. The
            statement does not have to match the DataFrame exactly; it is used to reload the
            dataset upon request via :py:func:`SparkDataset.source.load()
            <mlflow.data.spark_dataset_source.SparkDatasetSource.load>`. If none of ``path``,
            ``table_name``, or ``sql`` are specified, a CodeDatasetSource is used, which will
            source information from the run context.
        targets: Optional. The name of the Data Frame column containing targets (labels) for
            supervised learning.
        name: The name of the dataset. E.g. "wiki_train". If unspecified, a name is automatically
            generated.
        digest: The digest (hash, fingerprint) of the dataset. If unspecified, a digest is
            automatically computed.
        predictions: Optional. The name of the column containing model predictions,
            if the dataset contains model predictions. If specified, this column
            must be present in the dataframe (``df``).

    Returns:
        An instance of :py:class:`SparkDataset <mlflow.data.spark_dataset.SparkDataset>`.
    """
    from mlflow.data.code_dataset_source import CodeDatasetSource
    from mlflow.data.spark_delta_utils import (
        _is_delta_table,
        _is_delta_table_path,
        _try_get_delta_table_latest_version_from_path,
        _try_get_delta_table_latest_version_from_table_name,
    )
    from mlflow.tracking.context import registry

    # At most one of the three source-location arguments may be provided.
    num_locations_specified = sum(arg is not None for arg in (path, table_name, sql))
    if num_locations_specified > 1:
        raise MlflowException(
            "Must specify at most one of `path`, `table_name`, or `sql`.",
            INVALID_PARAMETER_VALUE,
        )

    # A SQL statement has no version history, so `version` is incompatible with `sql`.
    if sql is not None and version is not None:
        raise MlflowException(
            "`version` may not be specified when `sql` is specified. `version` may only be"
            " specified when `table_name` or `path` is specified.",
            INVALID_PARAMETER_VALUE,
        )

    if sql is not None:
        source = SparkDatasetSource(sql=sql)
    elif path is not None:
        if _is_delta_table_path(path):
            # Delta paths are versioned; fill in the latest version if none was given.
            version = version or _try_get_delta_table_latest_version_from_path(path)
            source = DeltaDatasetSource(path=path, delta_table_version=version)
        elif version is not None:
            # A version only makes sense for Delta tables.
            raise MlflowException(
                f"Version '{version}' was specified, but the path '{path}' does not refer"
                f" to a Delta table.",
                INVALID_PARAMETER_VALUE,
            )
        else:
            source = SparkDatasetSource(path=path)
    elif table_name is not None:
        if _is_delta_table(table_name):
            version = version or _try_get_delta_table_latest_version_from_table_name(table_name)
            source = DeltaDatasetSource(
                delta_table_name=table_name,
                delta_table_version=version,
            )
        elif version is not None:
            raise MlflowException(
                f"Version '{version}' was specified, but could not find a Delta table with name"
                f" '{table_name}'.",
                INVALID_PARAMETER_VALUE,
            )
        else:
            source = SparkDatasetSource(table_name=table_name)
    else:
        # No explicit source given: record the code context (e.g. notebook / job) instead.
        source = CodeDatasetSource(tags=registry.resolve_tags())

    return SparkDataset(
        df=df,
        source=source,
        targets=targets,
        name=name,
        digest=digest,
        predictions=predictions,
    )
|
@@ -0,0 +1,74 @@
|
|
1
|
+
from typing import Any, Optional
|
2
|
+
|
3
|
+
from mlflow.data.dataset_source import DatasetSource
|
4
|
+
from mlflow.exceptions import MlflowException
|
5
|
+
from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE
|
6
|
+
|
7
|
+
|
8
|
+
class SparkDatasetSource(DatasetSource):
    """
    Represents the source of a dataset stored in a spark table.
    """

    def __init__(
        self,
        path: Optional[str] = None,
        table_name: Optional[str] = None,
        sql: Optional[str] = None,
    ):
        # Exactly one of the three location arguments must be supplied.
        provided = [arg for arg in (path, table_name, sql) if arg is not None]
        if len(provided) != 1:
            raise MlflowException(
                'Must specify exactly one of "path", "table_name", or "sql"',
                INVALID_PARAMETER_VALUE,
            )
        self._path = path
        self._table_name = table_name
        self._sql = sql

    @staticmethod
    def _get_source_type() -> str:
        return "spark"

    def load(self, **kwargs):
        """Loads the dataset source as a Spark Dataset Source.

        Returns:
            An instance of ``pyspark.sql.DataFrame``.

        """
        from pyspark.sql import SparkSession

        session = SparkSession.builder.getOrCreate()

        # Exactly one of these attributes is set (enforced by __init__).
        if self._sql:
            return session.sql(self._sql)
        if self._table_name:
            return session.read.table(self._table_name)
        if self._path:
            return session.read.parquet(self._path)

    @staticmethod
    def _can_resolve(raw_source: Any):
        # Spark sources cannot be inferred from an arbitrary raw source object.
        return False

    @classmethod
    def _resolve(cls, raw_source: str) -> "SparkDatasetSource":
        raise NotImplementedError

    def to_dict(self) -> dict[Any, Any]:
        # Serialize only the single field that was set; order mirrors __init__'s arguments.
        for key, value in (
            ("path", self._path),
            ("table_name", self._table_name),
            ("sql", self._sql),
        ):
            if value is not None:
                return {key: value}
        return {}

    @classmethod
    def from_dict(cls, source_dict: dict[Any, Any]) -> "SparkDatasetSource":
        return cls(
            path=source_dict.get("path"),
            table_name=source_dict.get("table_name"),
            sql=source_dict.get("sql"),
        )
|
@@ -0,0 +1,118 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from mlflow.utils.string_utils import _backtick_quote
|
6
|
+
|
7
|
+
_logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
def _is_delta_table(table_name: str) -> bool:
    """Checks if a Delta table exists with the specified table name.

    Returns:
        True if a Delta table exists with the specified table name. False otherwise.

    """
    from pyspark.sql import SparkSession
    from pyspark.sql.utils import AnalysisException

    session = SparkSession.builder.getOrCreate()

    try:
        # use DESCRIBE DETAIL to check if the table is a Delta table
        # https://docs.databricks.com/delta/delta-utility.html#describe-detail
        # format will be `delta` for delta tables
        session.sql(f"DESCRIBE DETAIL {table_name}").filter("format = 'delta'").count()
    except AnalysisException:
        # DESCRIBE DETAIL fails for non-Delta (or nonexistent) tables.
        return False
    return True
|
30
|
+
|
31
|
+
|
32
|
+
def _is_delta_table_path(path: str) -> bool:
|
33
|
+
"""Checks if the specified filesystem path is a Delta table.
|
34
|
+
|
35
|
+
Returns:
|
36
|
+
True if the specified path is a Delta table. False otherwise.
|
37
|
+
"""
|
38
|
+
if os.path.exists(path) and os.path.isdir(path) and "_delta_log" in os.listdir(path):
|
39
|
+
return True
|
40
|
+
from mlflow.utils.uri import dbfs_hdfs_uri_to_fuse_path
|
41
|
+
|
42
|
+
try:
|
43
|
+
dbfs_path = dbfs_hdfs_uri_to_fuse_path(path)
|
44
|
+
return os.path.exists(dbfs_path) and "_delta_log" in os.listdir(dbfs_path)
|
45
|
+
except Exception:
|
46
|
+
return False
|
47
|
+
|
48
|
+
|
49
|
+
def _try_get_delta_table_latest_version_from_path(path: str) -> Optional[int]:
    """Gets the latest version of the Delta table located at the specified path.

    Args:
        path: The path to the Delta table.

    Returns:
        The version of the Delta table, or None if it cannot be resolved (e.g. because the
        Delta core library is not installed or the specified path does not refer to a Delta
        table).

    """
    from pyspark.sql import SparkSession

    try:
        session = SparkSession.builder.getOrCreate()
        # Use the Delta Lake JVM API directly; the version lookup is best-effort.
        delta_table = session._jvm.io.delta.tables.DeltaTable.forPath(
            session._jsparkSession, path
        )
        return _get_delta_table_latest_version(delta_table)
    except Exception as e:
        # Version resolution is optional metadata; log and return None implicitly.
        _logger.warning(
            "Failed to obtain version information for Delta table at path '%s'. Version information"
            " may not be included in the dataset source for MLflow Tracking. Exception: %s",
            path,
            e,
        )
|
74
|
+
|
75
|
+
|
76
|
+
def _try_get_delta_table_latest_version_from_table_name(table_name: str) -> Optional[int]:
    """Gets the latest version of the Delta table with the specified name.

    Args:
        table_name: The name of the Delta table.

    Returns:
        The version of the Delta table, or None if it cannot be resolved (e.g. because the
        Delta core library is not installed or no such table exists).
    """
    from pyspark.sql import SparkSession

    try:
        session = SparkSession.builder.getOrCreate()
        # Backtick-quote each identifier component (catalog / schema / table) so that
        # names containing special characters resolve correctly.
        quoted_name = ".".join(_backtick_quote(part) for part in table_name.split("."))
        delta_table = session._jvm.io.delta.tables.DeltaTable.forName(
            session._jsparkSession, quoted_name
        )
        return _get_delta_table_latest_version(delta_table)
    except Exception as e:
        # Version resolution is optional metadata; log and return None implicitly.
        _logger.warning(
            "Failed to obtain version information for Delta table with name '%s'. Version"
            " information may not be included in the dataset source for MLflow Tracking."
            " Exception: %s",
            table_name,
            e,
        )
|
103
|
+
|
104
|
+
|
105
|
+
def _get_delta_table_latest_version(j_delta_table) -> int:
|
106
|
+
"""Obtains the latest version of the specified Delta table Java class.
|
107
|
+
|
108
|
+
Args:
|
109
|
+
j_delta_table: A Java DeltaTable class instance.
|
110
|
+
|
111
|
+
Returns:
|
112
|
+
The version of the Delta table.
|
113
|
+
|
114
|
+
"""
|
115
|
+
latest_commit_jdf = j_delta_table.history(1)
|
116
|
+
latest_commit_row = latest_commit_jdf.head()
|
117
|
+
version_field_idx = latest_commit_row.fieldIndex("version")
|
118
|
+
return latest_commit_row.get(version_field_idx)
|