genesis-flow 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genesis_flow-1.0.0.dist-info/METADATA +822 -0
- genesis_flow-1.0.0.dist-info/RECORD +645 -0
- genesis_flow-1.0.0.dist-info/WHEEL +5 -0
- genesis_flow-1.0.0.dist-info/entry_points.txt +19 -0
- genesis_flow-1.0.0.dist-info/licenses/LICENSE.txt +202 -0
- genesis_flow-1.0.0.dist-info/top_level.txt +1 -0
- mlflow/__init__.py +367 -0
- mlflow/__main__.py +3 -0
- mlflow/ag2/__init__.py +56 -0
- mlflow/ag2/ag2_logger.py +294 -0
- mlflow/anthropic/__init__.py +40 -0
- mlflow/anthropic/autolog.py +129 -0
- mlflow/anthropic/chat.py +144 -0
- mlflow/artifacts/__init__.py +268 -0
- mlflow/autogen/__init__.py +144 -0
- mlflow/autogen/chat.py +142 -0
- mlflow/azure/__init__.py +26 -0
- mlflow/azure/auth_handler.py +257 -0
- mlflow/azure/client.py +319 -0
- mlflow/azure/config.py +120 -0
- mlflow/azure/connection_factory.py +340 -0
- mlflow/azure/exceptions.py +27 -0
- mlflow/azure/stores.py +327 -0
- mlflow/azure/utils.py +183 -0
- mlflow/bedrock/__init__.py +45 -0
- mlflow/bedrock/_autolog.py +202 -0
- mlflow/bedrock/chat.py +122 -0
- mlflow/bedrock/stream.py +160 -0
- mlflow/bedrock/utils.py +43 -0
- mlflow/cli.py +707 -0
- mlflow/client.py +12 -0
- mlflow/config/__init__.py +56 -0
- mlflow/crewai/__init__.py +79 -0
- mlflow/crewai/autolog.py +253 -0
- mlflow/crewai/chat.py +29 -0
- mlflow/data/__init__.py +75 -0
- mlflow/data/artifact_dataset_sources.py +170 -0
- mlflow/data/code_dataset_source.py +40 -0
- mlflow/data/dataset.py +123 -0
- mlflow/data/dataset_registry.py +168 -0
- mlflow/data/dataset_source.py +110 -0
- mlflow/data/dataset_source_registry.py +219 -0
- mlflow/data/delta_dataset_source.py +167 -0
- mlflow/data/digest_utils.py +108 -0
- mlflow/data/evaluation_dataset.py +562 -0
- mlflow/data/filesystem_dataset_source.py +81 -0
- mlflow/data/http_dataset_source.py +145 -0
- mlflow/data/huggingface_dataset.py +258 -0
- mlflow/data/huggingface_dataset_source.py +118 -0
- mlflow/data/meta_dataset.py +104 -0
- mlflow/data/numpy_dataset.py +223 -0
- mlflow/data/pandas_dataset.py +231 -0
- mlflow/data/polars_dataset.py +352 -0
- mlflow/data/pyfunc_dataset_mixin.py +31 -0
- mlflow/data/schema.py +76 -0
- mlflow/data/sources.py +1 -0
- mlflow/data/spark_dataset.py +406 -0
- mlflow/data/spark_dataset_source.py +74 -0
- mlflow/data/spark_delta_utils.py +118 -0
- mlflow/data/tensorflow_dataset.py +350 -0
- mlflow/data/uc_volume_dataset_source.py +81 -0
- mlflow/db.py +27 -0
- mlflow/dspy/__init__.py +17 -0
- mlflow/dspy/autolog.py +197 -0
- mlflow/dspy/callback.py +398 -0
- mlflow/dspy/constant.py +1 -0
- mlflow/dspy/load.py +93 -0
- mlflow/dspy/save.py +393 -0
- mlflow/dspy/util.py +109 -0
- mlflow/dspy/wrapper.py +226 -0
- mlflow/entities/__init__.py +104 -0
- mlflow/entities/_mlflow_object.py +52 -0
- mlflow/entities/assessment.py +545 -0
- mlflow/entities/assessment_error.py +80 -0
- mlflow/entities/assessment_source.py +141 -0
- mlflow/entities/dataset.py +92 -0
- mlflow/entities/dataset_input.py +51 -0
- mlflow/entities/dataset_summary.py +62 -0
- mlflow/entities/document.py +48 -0
- mlflow/entities/experiment.py +109 -0
- mlflow/entities/experiment_tag.py +35 -0
- mlflow/entities/file_info.py +45 -0
- mlflow/entities/input_tag.py +35 -0
- mlflow/entities/lifecycle_stage.py +35 -0
- mlflow/entities/logged_model.py +228 -0
- mlflow/entities/logged_model_input.py +26 -0
- mlflow/entities/logged_model_output.py +32 -0
- mlflow/entities/logged_model_parameter.py +46 -0
- mlflow/entities/logged_model_status.py +74 -0
- mlflow/entities/logged_model_tag.py +33 -0
- mlflow/entities/metric.py +200 -0
- mlflow/entities/model_registry/__init__.py +29 -0
- mlflow/entities/model_registry/_model_registry_entity.py +13 -0
- mlflow/entities/model_registry/model_version.py +243 -0
- mlflow/entities/model_registry/model_version_deployment_job_run_state.py +44 -0
- mlflow/entities/model_registry/model_version_deployment_job_state.py +70 -0
- mlflow/entities/model_registry/model_version_search.py +25 -0
- mlflow/entities/model_registry/model_version_stages.py +25 -0
- mlflow/entities/model_registry/model_version_status.py +35 -0
- mlflow/entities/model_registry/model_version_tag.py +35 -0
- mlflow/entities/model_registry/prompt.py +73 -0
- mlflow/entities/model_registry/prompt_version.py +244 -0
- mlflow/entities/model_registry/registered_model.py +175 -0
- mlflow/entities/model_registry/registered_model_alias.py +35 -0
- mlflow/entities/model_registry/registered_model_deployment_job_state.py +39 -0
- mlflow/entities/model_registry/registered_model_search.py +25 -0
- mlflow/entities/model_registry/registered_model_tag.py +35 -0
- mlflow/entities/multipart_upload.py +74 -0
- mlflow/entities/param.py +49 -0
- mlflow/entities/run.py +97 -0
- mlflow/entities/run_data.py +84 -0
- mlflow/entities/run_info.py +188 -0
- mlflow/entities/run_inputs.py +59 -0
- mlflow/entities/run_outputs.py +43 -0
- mlflow/entities/run_status.py +41 -0
- mlflow/entities/run_tag.py +36 -0
- mlflow/entities/source_type.py +31 -0
- mlflow/entities/span.py +774 -0
- mlflow/entities/span_event.py +96 -0
- mlflow/entities/span_status.py +102 -0
- mlflow/entities/trace.py +317 -0
- mlflow/entities/trace_data.py +71 -0
- mlflow/entities/trace_info.py +220 -0
- mlflow/entities/trace_info_v2.py +162 -0
- mlflow/entities/trace_location.py +173 -0
- mlflow/entities/trace_state.py +39 -0
- mlflow/entities/trace_status.py +68 -0
- mlflow/entities/view_type.py +51 -0
- mlflow/environment_variables.py +866 -0
- mlflow/evaluation/__init__.py +16 -0
- mlflow/evaluation/assessment.py +369 -0
- mlflow/evaluation/evaluation.py +411 -0
- mlflow/evaluation/evaluation_tag.py +61 -0
- mlflow/evaluation/fluent.py +48 -0
- mlflow/evaluation/utils.py +201 -0
- mlflow/exceptions.py +213 -0
- mlflow/experiments.py +140 -0
- mlflow/gemini/__init__.py +81 -0
- mlflow/gemini/autolog.py +186 -0
- mlflow/gemini/chat.py +261 -0
- mlflow/genai/__init__.py +71 -0
- mlflow/genai/datasets/__init__.py +67 -0
- mlflow/genai/datasets/evaluation_dataset.py +131 -0
- mlflow/genai/evaluation/__init__.py +3 -0
- mlflow/genai/evaluation/base.py +411 -0
- mlflow/genai/evaluation/constant.py +23 -0
- mlflow/genai/evaluation/utils.py +244 -0
- mlflow/genai/judges/__init__.py +21 -0
- mlflow/genai/judges/databricks.py +404 -0
- mlflow/genai/label_schemas/__init__.py +153 -0
- mlflow/genai/label_schemas/label_schemas.py +209 -0
- mlflow/genai/labeling/__init__.py +159 -0
- mlflow/genai/labeling/labeling.py +250 -0
- mlflow/genai/optimize/__init__.py +13 -0
- mlflow/genai/optimize/base.py +198 -0
- mlflow/genai/optimize/optimizers/__init__.py +4 -0
- mlflow/genai/optimize/optimizers/base_optimizer.py +38 -0
- mlflow/genai/optimize/optimizers/dspy_mipro_optimizer.py +221 -0
- mlflow/genai/optimize/optimizers/dspy_optimizer.py +91 -0
- mlflow/genai/optimize/optimizers/utils/dspy_mipro_callback.py +76 -0
- mlflow/genai/optimize/optimizers/utils/dspy_mipro_utils.py +18 -0
- mlflow/genai/optimize/types.py +75 -0
- mlflow/genai/optimize/util.py +30 -0
- mlflow/genai/prompts/__init__.py +206 -0
- mlflow/genai/scheduled_scorers.py +431 -0
- mlflow/genai/scorers/__init__.py +26 -0
- mlflow/genai/scorers/base.py +492 -0
- mlflow/genai/scorers/builtin_scorers.py +765 -0
- mlflow/genai/scorers/scorer_utils.py +138 -0
- mlflow/genai/scorers/validation.py +165 -0
- mlflow/genai/utils/data_validation.py +146 -0
- mlflow/genai/utils/enum_utils.py +23 -0
- mlflow/genai/utils/trace_utils.py +211 -0
- mlflow/groq/__init__.py +42 -0
- mlflow/groq/_groq_autolog.py +74 -0
- mlflow/johnsnowlabs/__init__.py +888 -0
- mlflow/langchain/__init__.py +24 -0
- mlflow/langchain/api_request_parallel_processor.py +330 -0
- mlflow/langchain/autolog.py +147 -0
- mlflow/langchain/chat_agent_langgraph.py +340 -0
- mlflow/langchain/constant.py +1 -0
- mlflow/langchain/constants.py +1 -0
- mlflow/langchain/databricks_dependencies.py +444 -0
- mlflow/langchain/langchain_tracer.py +597 -0
- mlflow/langchain/model.py +919 -0
- mlflow/langchain/output_parsers.py +142 -0
- mlflow/langchain/retriever_chain.py +153 -0
- mlflow/langchain/runnables.py +527 -0
- mlflow/langchain/utils/chat.py +402 -0
- mlflow/langchain/utils/logging.py +671 -0
- mlflow/langchain/utils/serialization.py +36 -0
- mlflow/legacy_databricks_cli/__init__.py +0 -0
- mlflow/legacy_databricks_cli/configure/__init__.py +0 -0
- mlflow/legacy_databricks_cli/configure/provider.py +482 -0
- mlflow/litellm/__init__.py +175 -0
- mlflow/llama_index/__init__.py +22 -0
- mlflow/llama_index/autolog.py +55 -0
- mlflow/llama_index/chat.py +43 -0
- mlflow/llama_index/constant.py +1 -0
- mlflow/llama_index/model.py +577 -0
- mlflow/llama_index/pyfunc_wrapper.py +332 -0
- mlflow/llama_index/serialize_objects.py +188 -0
- mlflow/llama_index/tracer.py +561 -0
- mlflow/metrics/__init__.py +479 -0
- mlflow/metrics/base.py +39 -0
- mlflow/metrics/genai/__init__.py +25 -0
- mlflow/metrics/genai/base.py +101 -0
- mlflow/metrics/genai/genai_metric.py +771 -0
- mlflow/metrics/genai/metric_definitions.py +450 -0
- mlflow/metrics/genai/model_utils.py +371 -0
- mlflow/metrics/genai/prompt_template.py +68 -0
- mlflow/metrics/genai/prompts/__init__.py +0 -0
- mlflow/metrics/genai/prompts/v1.py +422 -0
- mlflow/metrics/genai/utils.py +6 -0
- mlflow/metrics/metric_definitions.py +619 -0
- mlflow/mismatch.py +34 -0
- mlflow/mistral/__init__.py +34 -0
- mlflow/mistral/autolog.py +71 -0
- mlflow/mistral/chat.py +135 -0
- mlflow/ml_package_versions.py +452 -0
- mlflow/models/__init__.py +97 -0
- mlflow/models/auth_policy.py +83 -0
- mlflow/models/cli.py +354 -0
- mlflow/models/container/__init__.py +294 -0
- mlflow/models/container/scoring_server/__init__.py +0 -0
- mlflow/models/container/scoring_server/nginx.conf +39 -0
- mlflow/models/dependencies_schemas.py +287 -0
- mlflow/models/display_utils.py +158 -0
- mlflow/models/docker_utils.py +211 -0
- mlflow/models/evaluation/__init__.py +23 -0
- mlflow/models/evaluation/_shap_patch.py +64 -0
- mlflow/models/evaluation/artifacts.py +194 -0
- mlflow/models/evaluation/base.py +1811 -0
- mlflow/models/evaluation/calibration_curve.py +109 -0
- mlflow/models/evaluation/default_evaluator.py +996 -0
- mlflow/models/evaluation/deprecated.py +23 -0
- mlflow/models/evaluation/evaluator_registry.py +80 -0
- mlflow/models/evaluation/evaluators/classifier.py +704 -0
- mlflow/models/evaluation/evaluators/default.py +233 -0
- mlflow/models/evaluation/evaluators/regressor.py +96 -0
- mlflow/models/evaluation/evaluators/shap.py +296 -0
- mlflow/models/evaluation/lift_curve.py +178 -0
- mlflow/models/evaluation/utils/metric.py +123 -0
- mlflow/models/evaluation/utils/trace.py +179 -0
- mlflow/models/evaluation/validation.py +434 -0
- mlflow/models/flavor_backend.py +93 -0
- mlflow/models/flavor_backend_registry.py +53 -0
- mlflow/models/model.py +1639 -0
- mlflow/models/model_config.py +150 -0
- mlflow/models/notebook_resources/agent_evaluation_template.html +235 -0
- mlflow/models/notebook_resources/eval_with_dataset_example.py +22 -0
- mlflow/models/notebook_resources/eval_with_synthetic_example.py +22 -0
- mlflow/models/python_api.py +369 -0
- mlflow/models/rag_signatures.py +128 -0
- mlflow/models/resources.py +321 -0
- mlflow/models/signature.py +662 -0
- mlflow/models/utils.py +2054 -0
- mlflow/models/wheeled_model.py +280 -0
- mlflow/openai/__init__.py +57 -0
- mlflow/openai/_agent_tracer.py +364 -0
- mlflow/openai/api_request_parallel_processor.py +131 -0
- mlflow/openai/autolog.py +509 -0
- mlflow/openai/constant.py +1 -0
- mlflow/openai/model.py +824 -0
- mlflow/openai/utils/chat_schema.py +367 -0
- mlflow/optuna/__init__.py +3 -0
- mlflow/optuna/storage.py +646 -0
- mlflow/plugins/__init__.py +72 -0
- mlflow/plugins/base.py +358 -0
- mlflow/plugins/builtin/__init__.py +24 -0
- mlflow/plugins/builtin/pytorch_plugin.py +150 -0
- mlflow/plugins/builtin/sklearn_plugin.py +158 -0
- mlflow/plugins/builtin/transformers_plugin.py +187 -0
- mlflow/plugins/cli.py +321 -0
- mlflow/plugins/discovery.py +340 -0
- mlflow/plugins/manager.py +465 -0
- mlflow/plugins/registry.py +316 -0
- mlflow/plugins/templates/framework_plugin_template.py +329 -0
- mlflow/prompt/constants.py +20 -0
- mlflow/prompt/promptlab_model.py +197 -0
- mlflow/prompt/registry_utils.py +248 -0
- mlflow/promptflow/__init__.py +495 -0
- mlflow/protos/__init__.py +0 -0
- mlflow/protos/assessments_pb2.py +174 -0
- mlflow/protos/databricks_artifacts_pb2.py +489 -0
- mlflow/protos/databricks_filesystem_service_pb2.py +196 -0
- mlflow/protos/databricks_managed_catalog_messages_pb2.py +95 -0
- mlflow/protos/databricks_managed_catalog_service_pb2.py +86 -0
- mlflow/protos/databricks_pb2.py +267 -0
- mlflow/protos/databricks_trace_server_pb2.py +374 -0
- mlflow/protos/databricks_uc_registry_messages_pb2.py +1249 -0
- mlflow/protos/databricks_uc_registry_service_pb2.py +170 -0
- mlflow/protos/facet_feature_statistics_pb2.py +296 -0
- mlflow/protos/internal_pb2.py +77 -0
- mlflow/protos/mlflow_artifacts_pb2.py +336 -0
- mlflow/protos/model_registry_pb2.py +1073 -0
- mlflow/protos/scalapb/__init__.py +0 -0
- mlflow/protos/scalapb/scalapb_pb2.py +104 -0
- mlflow/protos/service_pb2.py +2600 -0
- mlflow/protos/unity_catalog_oss_messages_pb2.py +457 -0
- mlflow/protos/unity_catalog_oss_service_pb2.py +130 -0
- mlflow/protos/unity_catalog_prompt_messages_pb2.py +447 -0
- mlflow/protos/unity_catalog_prompt_messages_pb2_grpc.py +24 -0
- mlflow/protos/unity_catalog_prompt_service_pb2.py +164 -0
- mlflow/protos/unity_catalog_prompt_service_pb2_grpc.py +785 -0
- mlflow/py.typed +0 -0
- mlflow/pydantic_ai/__init__.py +57 -0
- mlflow/pydantic_ai/autolog.py +173 -0
- mlflow/pyfunc/__init__.py +3844 -0
- mlflow/pyfunc/_mlflow_pyfunc_backend_predict.py +61 -0
- mlflow/pyfunc/backend.py +523 -0
- mlflow/pyfunc/context.py +78 -0
- mlflow/pyfunc/dbconnect_artifact_cache.py +144 -0
- mlflow/pyfunc/loaders/__init__.py +7 -0
- mlflow/pyfunc/loaders/chat_agent.py +117 -0
- mlflow/pyfunc/loaders/chat_model.py +125 -0
- mlflow/pyfunc/loaders/code_model.py +31 -0
- mlflow/pyfunc/loaders/responses_agent.py +112 -0
- mlflow/pyfunc/mlserver.py +46 -0
- mlflow/pyfunc/model.py +1473 -0
- mlflow/pyfunc/scoring_server/__init__.py +604 -0
- mlflow/pyfunc/scoring_server/app.py +7 -0
- mlflow/pyfunc/scoring_server/client.py +146 -0
- mlflow/pyfunc/spark_model_cache.py +48 -0
- mlflow/pyfunc/stdin_server.py +44 -0
- mlflow/pyfunc/utils/__init__.py +3 -0
- mlflow/pyfunc/utils/data_validation.py +224 -0
- mlflow/pyfunc/utils/environment.py +22 -0
- mlflow/pyfunc/utils/input_converter.py +47 -0
- mlflow/pyfunc/utils/serving_data_parser.py +11 -0
- mlflow/pytorch/__init__.py +1171 -0
- mlflow/pytorch/_lightning_autolog.py +580 -0
- mlflow/pytorch/_pytorch_autolog.py +50 -0
- mlflow/pytorch/pickle_module.py +35 -0
- mlflow/rfunc/__init__.py +42 -0
- mlflow/rfunc/backend.py +134 -0
- mlflow/runs.py +89 -0
- mlflow/server/__init__.py +302 -0
- mlflow/server/auth/__init__.py +1224 -0
- mlflow/server/auth/__main__.py +4 -0
- mlflow/server/auth/basic_auth.ini +6 -0
- mlflow/server/auth/cli.py +11 -0
- mlflow/server/auth/client.py +537 -0
- mlflow/server/auth/config.py +34 -0
- mlflow/server/auth/db/__init__.py +0 -0
- mlflow/server/auth/db/cli.py +18 -0
- mlflow/server/auth/db/migrations/__init__.py +0 -0
- mlflow/server/auth/db/migrations/alembic.ini +110 -0
- mlflow/server/auth/db/migrations/env.py +76 -0
- mlflow/server/auth/db/migrations/versions/8606fa83a998_initial_migration.py +51 -0
- mlflow/server/auth/db/migrations/versions/__init__.py +0 -0
- mlflow/server/auth/db/models.py +67 -0
- mlflow/server/auth/db/utils.py +37 -0
- mlflow/server/auth/entities.py +165 -0
- mlflow/server/auth/logo.py +14 -0
- mlflow/server/auth/permissions.py +65 -0
- mlflow/server/auth/routes.py +18 -0
- mlflow/server/auth/sqlalchemy_store.py +263 -0
- mlflow/server/graphql/__init__.py +0 -0
- mlflow/server/graphql/autogenerated_graphql_schema.py +353 -0
- mlflow/server/graphql/graphql_custom_scalars.py +24 -0
- mlflow/server/graphql/graphql_errors.py +15 -0
- mlflow/server/graphql/graphql_no_batching.py +89 -0
- mlflow/server/graphql/graphql_schema_extensions.py +74 -0
- mlflow/server/handlers.py +3217 -0
- mlflow/server/prometheus_exporter.py +17 -0
- mlflow/server/validation.py +30 -0
- mlflow/shap/__init__.py +691 -0
- mlflow/sklearn/__init__.py +1994 -0
- mlflow/sklearn/utils.py +1041 -0
- mlflow/smolagents/__init__.py +66 -0
- mlflow/smolagents/autolog.py +139 -0
- mlflow/smolagents/chat.py +29 -0
- mlflow/store/__init__.py +10 -0
- mlflow/store/_unity_catalog/__init__.py +1 -0
- mlflow/store/_unity_catalog/lineage/__init__.py +1 -0
- mlflow/store/_unity_catalog/lineage/constants.py +2 -0
- mlflow/store/_unity_catalog/registry/__init__.py +6 -0
- mlflow/store/_unity_catalog/registry/prompt_info.py +75 -0
- mlflow/store/_unity_catalog/registry/rest_store.py +1740 -0
- mlflow/store/_unity_catalog/registry/uc_oss_rest_store.py +507 -0
- mlflow/store/_unity_catalog/registry/utils.py +121 -0
- mlflow/store/artifact/__init__.py +0 -0
- mlflow/store/artifact/artifact_repo.py +472 -0
- mlflow/store/artifact/artifact_repository_registry.py +154 -0
- mlflow/store/artifact/azure_blob_artifact_repo.py +275 -0
- mlflow/store/artifact/azure_data_lake_artifact_repo.py +295 -0
- mlflow/store/artifact/cli.py +141 -0
- mlflow/store/artifact/cloud_artifact_repo.py +332 -0
- mlflow/store/artifact/databricks_artifact_repo.py +729 -0
- mlflow/store/artifact/databricks_artifact_repo_resources.py +301 -0
- mlflow/store/artifact/databricks_logged_model_artifact_repo.py +93 -0
- mlflow/store/artifact/databricks_models_artifact_repo.py +216 -0
- mlflow/store/artifact/databricks_sdk_artifact_repo.py +134 -0
- mlflow/store/artifact/databricks_sdk_models_artifact_repo.py +97 -0
- mlflow/store/artifact/dbfs_artifact_repo.py +240 -0
- mlflow/store/artifact/ftp_artifact_repo.py +132 -0
- mlflow/store/artifact/gcs_artifact_repo.py +296 -0
- mlflow/store/artifact/hdfs_artifact_repo.py +209 -0
- mlflow/store/artifact/http_artifact_repo.py +218 -0
- mlflow/store/artifact/local_artifact_repo.py +142 -0
- mlflow/store/artifact/mlflow_artifacts_repo.py +94 -0
- mlflow/store/artifact/models_artifact_repo.py +259 -0
- mlflow/store/artifact/optimized_s3_artifact_repo.py +356 -0
- mlflow/store/artifact/presigned_url_artifact_repo.py +173 -0
- mlflow/store/artifact/r2_artifact_repo.py +70 -0
- mlflow/store/artifact/runs_artifact_repo.py +265 -0
- mlflow/store/artifact/s3_artifact_repo.py +330 -0
- mlflow/store/artifact/sftp_artifact_repo.py +141 -0
- mlflow/store/artifact/uc_volume_artifact_repo.py +76 -0
- mlflow/store/artifact/unity_catalog_models_artifact_repo.py +168 -0
- mlflow/store/artifact/unity_catalog_oss_models_artifact_repo.py +168 -0
- mlflow/store/artifact/utils/__init__.py +0 -0
- mlflow/store/artifact/utils/models.py +148 -0
- mlflow/store/db/__init__.py +0 -0
- mlflow/store/db/base_sql_model.py +3 -0
- mlflow/store/db/db_types.py +10 -0
- mlflow/store/db/utils.py +314 -0
- mlflow/store/db_migrations/__init__.py +0 -0
- mlflow/store/db_migrations/alembic.ini +74 -0
- mlflow/store/db_migrations/env.py +84 -0
- mlflow/store/db_migrations/versions/0584bdc529eb_add_cascading_deletion_to_datasets_from_experiments.py +88 -0
- mlflow/store/db_migrations/versions/0a8213491aaa_drop_duplicate_killed_constraint.py +49 -0
- mlflow/store/db_migrations/versions/0c779009ac13_add_deleted_time_field_to_runs_table.py +24 -0
- mlflow/store/db_migrations/versions/181f10493468_allow_nulls_for_metric_values.py +35 -0
- mlflow/store/db_migrations/versions/27a6a02d2cf1_add_model_version_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/2b4d017a5e9b_add_model_registry_tables_to_db.py +77 -0
- mlflow/store/db_migrations/versions/2d6e25af4d3e_increase_max_param_val_length.py +33 -0
- mlflow/store/db_migrations/versions/3500859a5d39_add_model_aliases_table.py +50 -0
- mlflow/store/db_migrations/versions/39d1c3be5f05_add_is_nan_constraint_for_metrics_tables_if_necessary.py +41 -0
- mlflow/store/db_migrations/versions/400f98739977_add_logged_model_tables.py +123 -0
- mlflow/store/db_migrations/versions/4465047574b1_increase_max_dataset_schema_size.py +38 -0
- mlflow/store/db_migrations/versions/451aebb31d03_add_metric_step.py +35 -0
- mlflow/store/db_migrations/versions/5b0e9adcef9c_add_cascade_deletion_to_trace_tables_fk.py +40 -0
- mlflow/store/db_migrations/versions/6953534de441_add_step_to_inputs_table.py +25 -0
- mlflow/store/db_migrations/versions/728d730b5ebd_add_registered_model_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/7ac759974ad8_update_run_tags_with_larger_limit.py +36 -0
- mlflow/store/db_migrations/versions/7f2a7d5fae7d_add_datasets_inputs_input_tags_tables.py +82 -0
- mlflow/store/db_migrations/versions/84291f40a231_add_run_link_to_model_version.py +26 -0
- mlflow/store/db_migrations/versions/867495a8f9d4_add_trace_tables.py +90 -0
- mlflow/store/db_migrations/versions/89d4b8295536_create_latest_metrics_table.py +169 -0
- mlflow/store/db_migrations/versions/90e64c465722_migrate_user_column_to_tags.py +64 -0
- mlflow/store/db_migrations/versions/97727af70f4d_creation_time_last_update_time_experiments.py +25 -0
- mlflow/store/db_migrations/versions/__init__.py +0 -0
- mlflow/store/db_migrations/versions/a8c4a736bde6_allow_nulls_for_run_id.py +27 -0
- mlflow/store/db_migrations/versions/acf3f17fdcc7_add_storage_location_field_to_model_.py +29 -0
- mlflow/store/db_migrations/versions/bd07f7e963c5_create_index_on_run_uuid.py +26 -0
- mlflow/store/db_migrations/versions/bda7b8c39065_increase_model_version_tag_value_limit.py +38 -0
- mlflow/store/db_migrations/versions/c48cb773bb87_reset_default_value_for_is_nan_in_metrics_table_for_mysql.py +41 -0
- mlflow/store/db_migrations/versions/cbc13b556ace_add_v3_trace_schema_columns.py +31 -0
- mlflow/store/db_migrations/versions/cc1f77228345_change_param_value_length_to_500.py +34 -0
- mlflow/store/db_migrations/versions/cfd24bdc0731_update_run_status_constraint_with_killed.py +78 -0
- mlflow/store/db_migrations/versions/df50e92ffc5e_add_experiment_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/f5a4f2784254_increase_run_tag_value_limit.py +36 -0
- mlflow/store/entities/__init__.py +3 -0
- mlflow/store/entities/paged_list.py +18 -0
- mlflow/store/model_registry/__init__.py +10 -0
- mlflow/store/model_registry/abstract_store.py +1081 -0
- mlflow/store/model_registry/base_rest_store.py +44 -0
- mlflow/store/model_registry/databricks_workspace_model_registry_rest_store.py +37 -0
- mlflow/store/model_registry/dbmodels/__init__.py +0 -0
- mlflow/store/model_registry/dbmodels/models.py +206 -0
- mlflow/store/model_registry/file_store.py +1091 -0
- mlflow/store/model_registry/rest_store.py +481 -0
- mlflow/store/model_registry/sqlalchemy_store.py +1286 -0
- mlflow/store/tracking/__init__.py +23 -0
- mlflow/store/tracking/abstract_store.py +816 -0
- mlflow/store/tracking/dbmodels/__init__.py +0 -0
- mlflow/store/tracking/dbmodels/initial_models.py +243 -0
- mlflow/store/tracking/dbmodels/models.py +1073 -0
- mlflow/store/tracking/file_store.py +2438 -0
- mlflow/store/tracking/postgres_managed_identity.py +146 -0
- mlflow/store/tracking/rest_store.py +1131 -0
- mlflow/store/tracking/sqlalchemy_store.py +2785 -0
- mlflow/system_metrics/__init__.py +61 -0
- mlflow/system_metrics/metrics/__init__.py +0 -0
- mlflow/system_metrics/metrics/base_metrics_monitor.py +32 -0
- mlflow/system_metrics/metrics/cpu_monitor.py +23 -0
- mlflow/system_metrics/metrics/disk_monitor.py +21 -0
- mlflow/system_metrics/metrics/gpu_monitor.py +71 -0
- mlflow/system_metrics/metrics/network_monitor.py +34 -0
- mlflow/system_metrics/metrics/rocm_monitor.py +123 -0
- mlflow/system_metrics/system_metrics_monitor.py +198 -0
- mlflow/tracing/__init__.py +16 -0
- mlflow/tracing/assessment.py +356 -0
- mlflow/tracing/client.py +531 -0
- mlflow/tracing/config.py +125 -0
- mlflow/tracing/constant.py +105 -0
- mlflow/tracing/destination.py +81 -0
- mlflow/tracing/display/__init__.py +40 -0
- mlflow/tracing/display/display_handler.py +196 -0
- mlflow/tracing/export/async_export_queue.py +186 -0
- mlflow/tracing/export/inference_table.py +138 -0
- mlflow/tracing/export/mlflow_v3.py +137 -0
- mlflow/tracing/export/utils.py +70 -0
- mlflow/tracing/fluent.py +1417 -0
- mlflow/tracing/processor/base_mlflow.py +199 -0
- mlflow/tracing/processor/inference_table.py +175 -0
- mlflow/tracing/processor/mlflow_v3.py +47 -0
- mlflow/tracing/processor/otel.py +73 -0
- mlflow/tracing/provider.py +487 -0
- mlflow/tracing/trace_manager.py +200 -0
- mlflow/tracing/utils/__init__.py +616 -0
- mlflow/tracing/utils/artifact_utils.py +28 -0
- mlflow/tracing/utils/copy.py +55 -0
- mlflow/tracing/utils/environment.py +55 -0
- mlflow/tracing/utils/exception.py +21 -0
- mlflow/tracing/utils/once.py +35 -0
- mlflow/tracing/utils/otlp.py +63 -0
- mlflow/tracing/utils/processor.py +54 -0
- mlflow/tracing/utils/search.py +292 -0
- mlflow/tracing/utils/timeout.py +250 -0
- mlflow/tracing/utils/token.py +19 -0
- mlflow/tracing/utils/truncation.py +124 -0
- mlflow/tracing/utils/warning.py +76 -0
- mlflow/tracking/__init__.py +39 -0
- mlflow/tracking/_model_registry/__init__.py +1 -0
- mlflow/tracking/_model_registry/client.py +764 -0
- mlflow/tracking/_model_registry/fluent.py +853 -0
- mlflow/tracking/_model_registry/registry.py +67 -0
- mlflow/tracking/_model_registry/utils.py +251 -0
- mlflow/tracking/_tracking_service/__init__.py +0 -0
- mlflow/tracking/_tracking_service/client.py +883 -0
- mlflow/tracking/_tracking_service/registry.py +56 -0
- mlflow/tracking/_tracking_service/utils.py +275 -0
- mlflow/tracking/artifact_utils.py +179 -0
- mlflow/tracking/client.py +5900 -0
- mlflow/tracking/context/__init__.py +0 -0
- mlflow/tracking/context/abstract_context.py +35 -0
- mlflow/tracking/context/databricks_cluster_context.py +15 -0
- mlflow/tracking/context/databricks_command_context.py +15 -0
- mlflow/tracking/context/databricks_job_context.py +49 -0
- mlflow/tracking/context/databricks_notebook_context.py +41 -0
- mlflow/tracking/context/databricks_repo_context.py +43 -0
- mlflow/tracking/context/default_context.py +51 -0
- mlflow/tracking/context/git_context.py +32 -0
- mlflow/tracking/context/registry.py +98 -0
- mlflow/tracking/context/system_environment_context.py +15 -0
- mlflow/tracking/default_experiment/__init__.py +1 -0
- mlflow/tracking/default_experiment/abstract_context.py +43 -0
- mlflow/tracking/default_experiment/databricks_notebook_experiment_provider.py +44 -0
- mlflow/tracking/default_experiment/registry.py +75 -0
- mlflow/tracking/fluent.py +3595 -0
- mlflow/tracking/metric_value_conversion_utils.py +93 -0
- mlflow/tracking/multimedia.py +206 -0
- mlflow/tracking/registry.py +86 -0
- mlflow/tracking/request_auth/__init__.py +0 -0
- mlflow/tracking/request_auth/abstract_request_auth_provider.py +34 -0
- mlflow/tracking/request_auth/registry.py +60 -0
- mlflow/tracking/request_header/__init__.py +0 -0
- mlflow/tracking/request_header/abstract_request_header_provider.py +36 -0
- mlflow/tracking/request_header/databricks_request_header_provider.py +38 -0
- mlflow/tracking/request_header/default_request_header_provider.py +17 -0
- mlflow/tracking/request_header/registry.py +79 -0
- mlflow/transformers/__init__.py +2982 -0
- mlflow/transformers/flavor_config.py +258 -0
- mlflow/transformers/hub_utils.py +83 -0
- mlflow/transformers/llm_inference_utils.py +468 -0
- mlflow/transformers/model_io.py +301 -0
- mlflow/transformers/peft.py +51 -0
- mlflow/transformers/signature.py +183 -0
- mlflow/transformers/torch_utils.py +55 -0
- mlflow/types/__init__.py +21 -0
- mlflow/types/agent.py +270 -0
- mlflow/types/chat.py +240 -0
- mlflow/types/llm.py +935 -0
- mlflow/types/responses.py +139 -0
- mlflow/types/responses_helpers.py +416 -0
- mlflow/types/schema.py +1505 -0
- mlflow/types/type_hints.py +647 -0
- mlflow/types/utils.py +753 -0
- mlflow/utils/__init__.py +283 -0
- mlflow/utils/_capture_modules.py +256 -0
- mlflow/utils/_capture_transformers_modules.py +75 -0
- mlflow/utils/_spark_utils.py +201 -0
- mlflow/utils/_unity_catalog_oss_utils.py +97 -0
- mlflow/utils/_unity_catalog_utils.py +479 -0
- mlflow/utils/annotations.py +218 -0
- mlflow/utils/arguments_utils.py +16 -0
- mlflow/utils/async_logging/__init__.py +1 -0
- mlflow/utils/async_logging/async_artifacts_logging_queue.py +258 -0
- mlflow/utils/async_logging/async_logging_queue.py +366 -0
- mlflow/utils/async_logging/run_artifact.py +38 -0
- mlflow/utils/async_logging/run_batch.py +58 -0
- mlflow/utils/async_logging/run_operations.py +49 -0
- mlflow/utils/autologging_utils/__init__.py +737 -0
- mlflow/utils/autologging_utils/client.py +432 -0
- mlflow/utils/autologging_utils/config.py +33 -0
- mlflow/utils/autologging_utils/events.py +294 -0
- mlflow/utils/autologging_utils/logging_and_warnings.py +328 -0
- mlflow/utils/autologging_utils/metrics_queue.py +71 -0
- mlflow/utils/autologging_utils/safety.py +1104 -0
- mlflow/utils/autologging_utils/versioning.py +95 -0
- mlflow/utils/checkpoint_utils.py +206 -0
- mlflow/utils/class_utils.py +6 -0
- mlflow/utils/cli_args.py +257 -0
- mlflow/utils/conda.py +354 -0
- mlflow/utils/credentials.py +231 -0
- mlflow/utils/data_utils.py +17 -0
- mlflow/utils/databricks_utils.py +1436 -0
- mlflow/utils/docstring_utils.py +477 -0
- mlflow/utils/doctor.py +133 -0
- mlflow/utils/download_cloud_file_chunk.py +43 -0
- mlflow/utils/env_manager.py +16 -0
- mlflow/utils/env_pack.py +131 -0
- mlflow/utils/environment.py +1009 -0
- mlflow/utils/exception_utils.py +14 -0
- mlflow/utils/file_utils.py +978 -0
- mlflow/utils/git_utils.py +77 -0
- mlflow/utils/gorilla.py +797 -0
- mlflow/utils/import_hooks/__init__.py +363 -0
- mlflow/utils/lazy_load.py +51 -0
- mlflow/utils/logging_utils.py +168 -0
- mlflow/utils/mime_type_utils.py +58 -0
- mlflow/utils/mlflow_tags.py +103 -0
- mlflow/utils/model_utils.py +486 -0
- mlflow/utils/name_utils.py +346 -0
- mlflow/utils/nfs_on_spark.py +62 -0
- mlflow/utils/openai_utils.py +164 -0
- mlflow/utils/os.py +12 -0
- mlflow/utils/oss_registry_utils.py +29 -0
- mlflow/utils/plugins.py +17 -0
- mlflow/utils/process.py +182 -0
- mlflow/utils/promptlab_utils.py +146 -0
- mlflow/utils/proto_json_utils.py +743 -0
- mlflow/utils/pydantic_utils.py +54 -0
- mlflow/utils/request_utils.py +279 -0
- mlflow/utils/requirements_utils.py +704 -0
- mlflow/utils/rest_utils.py +673 -0
- mlflow/utils/search_logged_model_utils.py +127 -0
- mlflow/utils/search_utils.py +2111 -0
- mlflow/utils/secure_loading.py +221 -0
- mlflow/utils/security_validation.py +384 -0
- mlflow/utils/server_cli_utils.py +61 -0
- mlflow/utils/spark_utils.py +15 -0
- mlflow/utils/string_utils.py +138 -0
- mlflow/utils/thread_utils.py +63 -0
- mlflow/utils/time.py +54 -0
- mlflow/utils/timeout.py +42 -0
- mlflow/utils/uri.py +572 -0
- mlflow/utils/validation.py +662 -0
- mlflow/utils/virtualenv.py +458 -0
- mlflow/utils/warnings_utils.py +25 -0
- mlflow/utils/yaml_utils.py +179 -0
- mlflow/version.py +24 -0
@@ -0,0 +1,145 @@
|
|
1
|
+
import os
|
2
|
+
import re
|
3
|
+
from typing import Any
|
4
|
+
from urllib.parse import urlparse
|
5
|
+
|
6
|
+
from mlflow.data.dataset_source import DatasetSource
|
7
|
+
from mlflow.exceptions import MlflowException
|
8
|
+
from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE
|
9
|
+
from mlflow.utils.file_utils import create_tmp_dir
|
10
|
+
from mlflow.utils.rest_utils import augmented_raise_for_status, cloud_storage_http_request
|
11
|
+
|
12
|
+
|
13
|
+
def _is_path(filename: str) -> bool:
|
14
|
+
"""
|
15
|
+
Return True if `filename` is a path, False otherwise. For example,
|
16
|
+
"foo/bar" is a path, but "bar" is not.
|
17
|
+
"""
|
18
|
+
return os.path.basename(filename) != filename
|
19
|
+
|
20
|
+
|
21
|
+
class HTTPDatasetSource(DatasetSource):
    """
    Represents the source of a dataset stored at a web location and referred to
    by an HTTP or HTTPS URL.
    """

    def __init__(self, url):
        """
        Args:
            url: The HTTP/S URL referring to the dataset source location.
        """
        self._url = url

    @property
    def url(self):
        """The HTTP/S URL referring to the dataset source location.

        Returns:
            The HTTP/S URL referring to the dataset source location.

        """
        return self._url

    @staticmethod
    def _get_source_type() -> str:
        """Returns the string identifying this kind of dataset source."""
        return "http"

    def _extract_filename(self, response) -> str:
        """
        Extracts a filename from the Content-Disposition header or the URL's path.

        Args:
            response: The HTTP response; only its ``headers`` mapping is read.

        Returns:
            The filename from the first ``filename=`` parameter of the Content-Disposition
            header if there is one, otherwise the basename of the URL path. The result may be
            an empty string.

        Raises:
            MlflowException: If the filename in the header contains a directory component.
        """
        if content_disposition := response.headers.get("Content-Disposition"):
            # A parameter value is either a double-quoted string or a bare token that ends
            # at the next ";". Stopping there keeps trailing parameters (for example
            # `; size=12` or `; filename*=UTF-8''...`) out of the extracted name.
            for match in re.finditer(r'filename=\s*(?:"([^"]*)"|([^;]+))', content_disposition):
                quoted, bare = match.groups()
                raw_value = quoted if quoted is not None else bare.strip()
                filename = raw_value.strip("'\"")
                if _is_path(filename):
                    raise MlflowException.invalid_parameter_value(
                        f"Invalid filename in Content-Disposition header: {filename}. "
                        "It must be a file name, not a path."
                    )
                return filename

        # Extract basename from URL if no valid filename in Content-Disposition
        return os.path.basename(urlparse(self.url).path)

    def load(self, dst_path=None) -> str:
        """Downloads the dataset source to the local filesystem.

        Args:
            dst_path: Path of the local filesystem destination directory to which to download the
                dataset source. If the directory does not exist, it is created. If
                unspecified, the dataset source is downloaded to a new uniquely-named
                directory on the local filesystem.

        Returns:
            The path to the downloaded dataset source on the local filesystem.

        """
        resp = cloud_storage_http_request(
            method="GET",
            url=self.url,
            stream=True,
        )
        augmented_raise_for_status(resp)

        # Fall back to a fixed name when neither the header nor the URL yields one.
        basename = self._extract_filename(resp) or "dataset_source"

        if dst_path is None:
            dst_path = create_tmp_dir()
        elif dst_path:
            # Honor the documented contract: a missing destination directory is created.
            # An empty string is left alone so it keeps meaning "current directory".
            os.makedirs(dst_path, exist_ok=True)

        dst_path = os.path.join(dst_path, basename)
        with open(dst_path, "wb") as f:
            chunk_size = 1024 * 1024  # 1 MB
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)

        return dst_path

    @staticmethod
    def _can_resolve(raw_source: Any) -> bool:
        """
        Args:
            raw_source: The raw source, e.g. a string like "http://mysite/mydata.tar.gz".

        Returns:
            True if this DatasetSource can resolve the raw source, False otherwise.
        """
        if not isinstance(raw_source, str):
            return False

        try:
            parsed_source = urlparse(raw_source)
        except Exception:
            # Deliberately best-effort: an unparsable string is simply not resolvable here.
            return False
        return parsed_source.scheme in ["http", "https"]

    @classmethod
    def _resolve(cls, raw_source: Any) -> "HTTPDatasetSource":
        """
        Args:
            raw_source: The raw source, e.g. a string like "http://mysite/mydata.tar.gz".

        Returns:
            A new instance of this class wrapping ``raw_source`` as its URL.
        """
        return cls(raw_source)

    def to_dict(self) -> dict[Any, Any]:
        """
        Returns:
            A JSON-compatible dictionary representation of the HTTPDatasetSource.
        """
        return {
            "url": self.url,
        }

    @classmethod
    def from_dict(cls, source_dict: dict[Any, Any]) -> "HTTPDatasetSource":
        """
        Args:
            source_dict: A dictionary representation of the HTTPDatasetSource.

        Returns:
            An instance built from the "url" entry of ``source_dict``.

        Raises:
            MlflowException: If ``source_dict`` has no "url" entry (or it is None).
        """
        url = source_dict.get("url")
        if url is None:
            raise MlflowException(
                'Failed to parse HTTPDatasetSource. Missing expected key: "url"',
                INVALID_PARAMETER_VALUE,
            )

        return cls(url=url)
|
@@ -0,0 +1,258 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from functools import cached_property
|
4
|
+
from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Union
|
5
|
+
|
6
|
+
from mlflow.data.dataset import Dataset
|
7
|
+
from mlflow.data.dataset_source import DatasetSource
|
8
|
+
from mlflow.data.digest_utils import compute_pandas_digest
|
9
|
+
from mlflow.data.evaluation_dataset import EvaluationDataset
|
10
|
+
from mlflow.data.huggingface_dataset_source import HuggingFaceDatasetSource
|
11
|
+
from mlflow.data.pyfunc_dataset_mixin import PyFuncConvertibleDatasetMixin, PyFuncInputsOutputs
|
12
|
+
from mlflow.exceptions import MlflowException
|
13
|
+
from mlflow.protos.databricks_pb2 import INTERNAL_ERROR, INVALID_PARAMETER_VALUE
|
14
|
+
from mlflow.types import Schema
|
15
|
+
from mlflow.types.utils import _infer_schema
|
16
|
+
|
17
|
+
# Module-level logger; used below to warn when schema inference fails and when both
# `source` and `path` are passed to `from_huggingface`.
_logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
# Passed as the pandas `batch_size` when sampling the dataset for digest computation and
# schema inference; only the first batch of this many rows is read.
_MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE = 10000
|
20
|
+
|
21
|
+
if TYPE_CHECKING:
|
22
|
+
import datasets
|
23
|
+
|
24
|
+
|
25
|
+
class HuggingFaceDataset(Dataset, PyFuncConvertibleDatasetMixin):
    """
    Represents a HuggingFace dataset for use with MLflow Tracking.
    """

    def __init__(  # noqa: D417
        self,
        ds: "datasets.Dataset",
        source: HuggingFaceDatasetSource,
        targets: Optional[str] = None,
        name: Optional[str] = None,
        digest: Optional[str] = None,
    ):
        """
        Args:
            ds: A Hugging Face dataset. Must be an instance of `datasets.Dataset`.
                Other types, such as :py:class:`datasets.DatasetDict`, are not supported.
            source: The source of the Hugging Face dataset.
            targets: The name of the column in ``ds`` containing targets (labels). If
                specified, it must be one of ``ds.column_names``.
            name: The name of the dataset. E.g. "wiki_train". If unspecified, a name is
                automatically generated.
            digest: The digest (hash, fingerprint) of the dataset. If unspecified, a digest
                is automatically computed.

        Raises:
            MlflowException: If ``targets`` is given but is not a column of ``ds``.
        """
        if targets is not None and targets not in ds.column_names:
            raise MlflowException(
                "The specified Hugging Face dataset does not contain the specified targets column"
                f" '{targets}'.",
                INVALID_PARAMETER_VALUE,
            )

        # These must be set before the superclass constructor runs, since `_compute_digest`
        # reads `self._ds`.
        self._ds = ds
        self._targets = targets
        super().__init__(source=source, name=name, digest=digest)

    def _first_batch_as_pandas(self):
        """
        Returns a pandas DataFrame holding the first batch of the dataset, where a batch is at
        most ``_MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE`` rows.
        """
        batches = self._ds.to_pandas(
            batch_size=_MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE, batched=True
        )
        first_batch = next(batches, None)
        if first_batch is None:
            # The batched iterator produced nothing (e.g. a dataset without rows). Fall back
            # to the unbatched conversion instead of leaking StopIteration to the caller.
            return self._ds.to_pandas()
        return first_batch

    def _compute_digest(self) -> str:
        """
        Computes a digest for the dataset. Called if the user doesn't supply
        a digest when constructing the dataset.
        """
        return compute_pandas_digest(self._first_batch_as_pandas())

    def to_dict(self) -> dict[str, str]:
        """Create config dictionary for the dataset.

        Returns a string dictionary containing the following fields: name, digest, source, source
        type, schema, and profile. The schema entry is None when no schema could be inferred.
        """
        schema = json.dumps({"mlflow_colspec": self.schema.to_dict()}) if self.schema else None
        config = super().to_dict()
        config.update(
            {
                "schema": schema,
                "profile": json.dumps(self.profile),
            }
        )
        return config

    @property
    def ds(self) -> "datasets.Dataset":
        """The Hugging Face ``datasets.Dataset`` instance.

        Returns:
            The Hugging Face ``datasets.Dataset`` instance.

        """
        return self._ds

    @property
    def targets(self) -> Optional[str]:
        """
        The name of the Hugging Face dataset column containing targets (labels) for supervised
        learning.

        Returns:
            The string name of the Hugging Face dataset column containing targets.
        """
        return self._targets

    @property
    def source(self) -> HuggingFaceDatasetSource:
        """Hugging Face dataset source information.

        Returns:
            A :py:class:`mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource`
        """
        return self._source

    @property
    def profile(self) -> Optional[Any]:
        """
        Summary statistics for the Hugging Face dataset, including the number of rows,
        size, and size in bytes.
        """
        return {
            "num_rows": self._ds.num_rows,
            "dataset_size": self._ds.dataset_size,
            "size_in_bytes": self._ds.size_in_bytes,
        }

    @cached_property
    def schema(self) -> Optional[Schema]:
        """
        The MLflow ColSpec schema of the Hugging Face dataset.

        The schema is inferred from the first batch of rows only. If inference fails for any
        reason, a warning is logged and None is returned.
        """
        try:
            return _infer_schema(self._first_batch_as_pandas())
        except Exception as e:
            _logger.warning("Failed to infer schema for Hugging Face dataset. Exception: %s", e)
            return None

    def to_pyfunc(self) -> PyFuncInputsOutputs:
        """
        Converts the whole dataset to pandas and splits it into pyfunc inputs and outputs.

        Returns:
            A ``PyFuncInputsOutputs`` whose outputs are the targets column (and whose inputs
            are all remaining columns) when targets are set, otherwise the full frame as
            inputs with ``outputs=None``.

        Raises:
            MlflowException: If targets are set but missing from the pandas representation.
        """
        df = self._ds.to_pandas()
        if self._targets is None:
            return PyFuncInputsOutputs(inputs=df, outputs=None)

        if self._targets not in df.columns:
            raise MlflowException(
                "Failed to convert Hugging Face dataset to pyfunc inputs and outputs because"
                " the pandas representation of the Hugging Face dataset does not contain the"
                f" specified targets column '{self._targets}'.",
                # This is an internal error because we should have validated the presence of
                # the target column in the Hugging Face dataset at construction time
                INTERNAL_ERROR,
            )
        inputs = df.drop(columns=self._targets)
        outputs = df[self._targets]
        return PyFuncInputsOutputs(inputs=inputs, outputs=outputs)

    def to_evaluation_dataset(self, path=None, feature_names=None) -> EvaluationDataset:
        """
        Converts the dataset to an EvaluationDataset for model evaluation. Required
        for use with mlflow.evaluate().

        Args:
            path: Forwarded unchanged to ``EvaluationDataset``.
            feature_names: Forwarded unchanged to ``EvaluationDataset``.
        """
        return EvaluationDataset(
            data=self._ds.to_pandas(),
            targets=self._targets,
            path=path,
            feature_names=feature_names,
            name=self.name,
            digest=self.digest,
        )
|
176
|
+
|
177
|
+
|
178
|
+
def from_huggingface(
    ds,
    path: Optional[str] = None,
    targets: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    revision=None,
    name: Optional[str] = None,
    digest: Optional[str] = None,
    trust_remote_code: Optional[bool] = None,
    source: Optional[Union[str, DatasetSource]] = None,
) -> HuggingFaceDataset:
    """
    Create a `mlflow.data.huggingface_dataset.HuggingFaceDataset` from a Hugging Face dataset.

    Args:
        ds:
            A Hugging Face dataset. Must be an instance of `datasets.Dataset`. Other types, such as
            `datasets.DatasetDict`, are not supported.
        path: The path of the Hugging Face dataset used to construct the source. This is the same
            argument as `path` in `datasets.load_dataset()` function. To be able to reload the
            dataset via MLflow, `path` must match the path of the dataset on the hub, e.g.,
            "databricks/databricks-dolly-15k". If neither `path` nor `source` is specified, a
            `CodeDatasetSource` is used, which will source information from the run context.
        targets: The name of the Hugging Face `dataset.Dataset` column containing targets (labels)
            for supervised learning.
        data_dir: The `data_dir` of the Hugging Face dataset configuration. This is used by the
            `datasets.load_dataset()` function to reload the dataset upon request via
            :py:func:`HuggingFaceDataset.source.load()
            <mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource.load>`.
        data_files: Paths to source data file(s) for the Hugging Face dataset configuration.
            This is used by the `datasets.load_dataset()` function to reload the
            dataset upon request via :py:func:`HuggingFaceDataset.source.load()
            <mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource.load>`.
        revision: Version of the dataset script to load. This is used by the
            `datasets.load_dataset()` function to reload the dataset upon request via
            :py:func:`HuggingFaceDataset.source.load()
            <mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource.load>`.
        name: The name of the dataset. E.g. "wiki_train". If unspecified, a name is automatically
            generated.
        digest: The digest (hash, fingerprint) of the dataset. If unspecified, a digest is
            automatically computed.
        trust_remote_code: Whether to trust remote code from the dataset repo.
        source: The source of the dataset, e.g. a S3 URI, an HTTPS URL etc. When given, it takes
            precedence over `path`.

    Raises:
        MlflowException: If `ds` is not an instance of `datasets.Dataset`.
    """
    import datasets

    from mlflow.data.code_dataset_source import CodeDatasetSource
    from mlflow.data.dataset_source_registry import resolve_dataset_source
    from mlflow.tracking.context import registry

    if not isinstance(ds, datasets.Dataset):
        raise MlflowException(
            "The specified Hugging Face dataset must be an instance of `datasets.Dataset`."
            f" Instead, found an instance of: {type(ds)}",
            INVALID_PARAMETER_VALUE,
        )

    # Source selection, in order of precedence: an explicit `source`, then a
    # `HuggingFaceDatasetSource` built from `path`, then a `CodeDatasetSource`.
    if source is not None and path is not None:
        _logger.warning(
            "Both 'source' and 'path' are provided. "
            "'source' will take precedence, and 'path' will be ignored."
        )
    if source is not None:
        # Anything that is not already a DatasetSource goes through the source registry.
        source = source if isinstance(source, DatasetSource) else resolve_dataset_source(source)
    elif path is not None:
        source = HuggingFaceDatasetSource(
            path=path,
            config_name=ds.config_name,
            data_dir=data_dir,
            data_files=data_files,
            split=ds.split,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
    else:
        context_tags = registry.resolve_tags()
        source = CodeDatasetSource(tags=context_tags)
    return HuggingFaceDataset(ds=ds, targets=targets, source=source, name=name, digest=digest)
|
@@ -0,0 +1,118 @@
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Union
|
2
|
+
|
3
|
+
from mlflow.data.dataset_source import DatasetSource
|
4
|
+
|
5
|
+
if TYPE_CHECKING:
|
6
|
+
import datasets
|
7
|
+
|
8
|
+
|
9
|
+
class HuggingFaceDatasetSource(DatasetSource):
    """Represents the source of a Hugging Face dataset used in MLflow Tracking."""

    def __init__(
        self,
        path: str,
        config_name: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[
            Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
        ] = None,
        split: Optional[Union[str, "datasets.Split"]] = None,
        revision: Optional[Union[str, "datasets.Version"]] = None,
        trust_remote_code: Optional[bool] = None,
    ):
        """Create a `HuggingFaceDatasetSource` instance.

        Arguments in `__init__` match arguments of the same name in
        `datasets.load_dataset() <https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/loading_methods#datasets.load_dataset>`_.
        The only exception is `config_name` matches `name` in `datasets.load_dataset()`, because
        we need to differentiate from `mlflow.data.Dataset` `name` attribute.

        Args:
            path: The path of the Hugging Face dataset, if it is a dataset from HuggingFace hub,
                `path` must match the hub path, e.g., "databricks/databricks-dolly-15k".
            config_name: The name of the Hugging Face dataset configuration.
            data_dir: The `data_dir` of the Hugging Face dataset configuration.
            data_files: Paths to source data file(s) for the Hugging Face dataset configuration.
            split: Which split of the data to load.
            revision: Version of the dataset script to load.
            trust_remote_code: Whether to trust remote code from the dataset repo.
        """
        self.path = path
        self.config_name = config_name
        self.data_dir = data_dir
        self.data_files = data_files
        self.split = split
        self.revision = revision
        self.trust_remote_code = trust_remote_code

    @staticmethod
    def _get_source_type() -> str:
        """Returns the string identifying this kind of dataset source."""
        return "hugging_face"

    def load(self, **kwargs):
        """Load the Hugging Face dataset based on `HuggingFaceDatasetSource`.

        Args:
            kwargs: Additional keyword arguments used for loading the dataset with the Hugging Face
                `datasets.load_dataset()` method.

        Returns:
            An instance of `datasets.Dataset`.

        Raises:
            KeyError: If `kwargs` repeats an argument already supplied by this source.
        """
        import datasets
        from packaging.version import Version

        load_kwargs = {
            "path": self.path,
            "name": self.config_name,
            "data_dir": self.data_dir,
            "data_files": self.data_files,
            "split": self.split,
            "revision": self.revision,
        }

        # this argument only exists in >= 2.16.0
        if Version(datasets.__version__) >= Version("2.16.0"):
            load_kwargs["trust_remote_code"] = self.trust_remote_code

        # Refuse ambiguous calls rather than silently letting one side win.
        intersecting_keys = set(load_kwargs) & set(kwargs)
        if intersecting_keys:
            raise KeyError(
                f"Found duplicated arguments in `HuggingFaceDatasetSource` and "
                f"`kwargs`: {intersecting_keys}. Please remove them from `kwargs`."
            )
        load_kwargs.update(kwargs)
        return datasets.load_dataset(**load_kwargs)

    @staticmethod
    def _can_resolve(raw_source: Any):
        # NB: Initially, we expect that Hugging Face dataset sources will only be used with
        # Hugging Face datasets constructed by from_huggingface, which can create
        # an instance of HuggingFaceDatasetSource directly without the need for resolution
        return False

    @classmethod
    def _resolve(cls, raw_source: str) -> "HuggingFaceDatasetSource":
        """Not supported: `_can_resolve` always returns False for this source type."""
        raise NotImplementedError

    def to_dict(self) -> dict[Any, Any]:
        """
        Returns:
            A dictionary representation of the source. `split` is rendered as a string when
            set and left as None otherwise. `trust_remote_code` is not included.
        """
        return {
            "path": self.path,
            "config_name": self.config_name,
            "data_dir": self.data_dir,
            "data_files": self.data_files,
            # `str(None)` would produce the literal "None", which `from_dict` would then hand
            # back to `load_dataset` as a split name.
            "split": str(self.split) if self.split is not None else None,
            "revision": self.revision,
        }

    @classmethod
    def from_dict(cls, source_dict: dict[Any, Any]) -> "HuggingFaceDatasetSource":
        """
        Args:
            source_dict: A dictionary representation of the source, as produced by `to_dict`.
                Missing keys are treated as None.

        Returns:
            A `HuggingFaceDatasetSource` built from the dictionary entries.
        """
        return cls(
            path=source_dict.get("path"),
            config_name=source_dict.get("config_name"),
            data_dir=source_dict.get("data_dir"),
            data_files=source_dict.get("data_files"),
            split=source_dict.get("split"),
            revision=source_dict.get("revision"),
        )
|
@@ -0,0 +1,104 @@
|
|
1
|
+
import hashlib
|
2
|
+
import json
|
3
|
+
from typing import Any, Optional
|
4
|
+
|
5
|
+
from mlflow.data.dataset import Dataset
|
6
|
+
from mlflow.data.dataset_source import DatasetSource
|
7
|
+
from mlflow.types import Schema
|
8
|
+
|
9
|
+
|
10
|
+
class MetaDataset(Dataset):
    """Dataset that only contains metadata.

    This class is used to represent a dataset that only contains metadata, which is useful when
    users only want to log metadata to MLflow without logging the actual data. For example, users
    build a custom dataset from a text file publicly hosted in the Internet, and they want to log
    the text file's URL to MLflow for future tracking instead of the dataset itself.

    Args:
        source: dataset source of type `DatasetSource`, indicates where the data is from.
        name: name of the dataset. If not specified, a name is automatically generated.
        digest: digest (hash, fingerprint) of the dataset. If not specified, a digest is
            automatically computed.
        schema: schema of the dataset.

    .. code-block:: python
        :caption: Create a MetaDataset

        import mlflow

        mlflow.set_experiment("/test-mlflow-meta-dataset")

        source = mlflow.data.http_dataset_source.HTTPDatasetSource(
            url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        )
        ds = mlflow.data.meta_dataset.MetaDataset(source)

        with mlflow.start_run() as run:
            mlflow.log_input(ds)

    .. code-block:: python
        :caption: Create a MetaDataset with schema

        import mlflow
        from mlflow.types import ColSpec, Schema

        mlflow.set_experiment("/test-mlflow-meta-dataset")

        source = mlflow.data.http_dataset_source.HTTPDatasetSource(
            url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        )
        schema = Schema(
            [
                ColSpec(type=mlflow.types.DataType.string, name="text"),
                ColSpec(type=mlflow.types.DataType.integer, name="label"),
            ]
        )
        ds = mlflow.data.meta_dataset.MetaDataset(source, schema=schema)

        with mlflow.start_run() as run:
            mlflow.log_input(ds)
    """

    def __init__(
        self,
        source: DatasetSource,
        name: Optional[str] = None,
        digest: Optional[str] = None,
        schema: Optional[Schema] = None,
    ):
        # Set `self._schema` before calling the superclass constructor because
        # `self._compute_digest` depends on `self._schema`.
        self._schema = schema
        super().__init__(source=source, name=name, digest=digest)

    def _compute_digest(self) -> str:
        """Computes a digest for the dataset.

        The digest computation of `MetaDataset` is based on the dataset's name, source, source type,
        and schema instead of the actual data. Basically we compute the sha256 hash of the config
        dict and keep the first 8 hexadecimal characters.
        """
        config = {
            "name": self.name,
            "source": self.source.to_json(),
            "source_type": self.source._get_source_type(),
            # An unset schema contributes an empty string so the config stays serializable.
            "schema": self.schema.to_dict() if self.schema else "",
        }
        return hashlib.sha256(json.dumps(config).encode("utf-8")).hexdigest()[:8]

    @property
    def schema(self) -> Optional[Schema]:
        """Returns the schema of the dataset."""
        return self._schema

    def to_dict(self) -> dict[str, str]:
        """Create config dictionary for the MetaDataset.

        Returns the config dictionary produced by the superclass, with a "schema" entry added
        (as a JSON string) only when a schema is set.
        """
        config = super().to_dict()
        if self.schema:
            config["schema"] = json.dumps({"mlflow_colspec": self.schema.to_dict()})
        return config
|