genesis-flow 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genesis_flow-1.0.0.dist-info/METADATA +822 -0
- genesis_flow-1.0.0.dist-info/RECORD +645 -0
- genesis_flow-1.0.0.dist-info/WHEEL +5 -0
- genesis_flow-1.0.0.dist-info/entry_points.txt +19 -0
- genesis_flow-1.0.0.dist-info/licenses/LICENSE.txt +202 -0
- genesis_flow-1.0.0.dist-info/top_level.txt +1 -0
- mlflow/__init__.py +367 -0
- mlflow/__main__.py +3 -0
- mlflow/ag2/__init__.py +56 -0
- mlflow/ag2/ag2_logger.py +294 -0
- mlflow/anthropic/__init__.py +40 -0
- mlflow/anthropic/autolog.py +129 -0
- mlflow/anthropic/chat.py +144 -0
- mlflow/artifacts/__init__.py +268 -0
- mlflow/autogen/__init__.py +144 -0
- mlflow/autogen/chat.py +142 -0
- mlflow/azure/__init__.py +26 -0
- mlflow/azure/auth_handler.py +257 -0
- mlflow/azure/client.py +319 -0
- mlflow/azure/config.py +120 -0
- mlflow/azure/connection_factory.py +340 -0
- mlflow/azure/exceptions.py +27 -0
- mlflow/azure/stores.py +327 -0
- mlflow/azure/utils.py +183 -0
- mlflow/bedrock/__init__.py +45 -0
- mlflow/bedrock/_autolog.py +202 -0
- mlflow/bedrock/chat.py +122 -0
- mlflow/bedrock/stream.py +160 -0
- mlflow/bedrock/utils.py +43 -0
- mlflow/cli.py +707 -0
- mlflow/client.py +12 -0
- mlflow/config/__init__.py +56 -0
- mlflow/crewai/__init__.py +79 -0
- mlflow/crewai/autolog.py +253 -0
- mlflow/crewai/chat.py +29 -0
- mlflow/data/__init__.py +75 -0
- mlflow/data/artifact_dataset_sources.py +170 -0
- mlflow/data/code_dataset_source.py +40 -0
- mlflow/data/dataset.py +123 -0
- mlflow/data/dataset_registry.py +168 -0
- mlflow/data/dataset_source.py +110 -0
- mlflow/data/dataset_source_registry.py +219 -0
- mlflow/data/delta_dataset_source.py +167 -0
- mlflow/data/digest_utils.py +108 -0
- mlflow/data/evaluation_dataset.py +562 -0
- mlflow/data/filesystem_dataset_source.py +81 -0
- mlflow/data/http_dataset_source.py +145 -0
- mlflow/data/huggingface_dataset.py +258 -0
- mlflow/data/huggingface_dataset_source.py +118 -0
- mlflow/data/meta_dataset.py +104 -0
- mlflow/data/numpy_dataset.py +223 -0
- mlflow/data/pandas_dataset.py +231 -0
- mlflow/data/polars_dataset.py +352 -0
- mlflow/data/pyfunc_dataset_mixin.py +31 -0
- mlflow/data/schema.py +76 -0
- mlflow/data/sources.py +1 -0
- mlflow/data/spark_dataset.py +406 -0
- mlflow/data/spark_dataset_source.py +74 -0
- mlflow/data/spark_delta_utils.py +118 -0
- mlflow/data/tensorflow_dataset.py +350 -0
- mlflow/data/uc_volume_dataset_source.py +81 -0
- mlflow/db.py +27 -0
- mlflow/dspy/__init__.py +17 -0
- mlflow/dspy/autolog.py +197 -0
- mlflow/dspy/callback.py +398 -0
- mlflow/dspy/constant.py +1 -0
- mlflow/dspy/load.py +93 -0
- mlflow/dspy/save.py +393 -0
- mlflow/dspy/util.py +109 -0
- mlflow/dspy/wrapper.py +226 -0
- mlflow/entities/__init__.py +104 -0
- mlflow/entities/_mlflow_object.py +52 -0
- mlflow/entities/assessment.py +545 -0
- mlflow/entities/assessment_error.py +80 -0
- mlflow/entities/assessment_source.py +141 -0
- mlflow/entities/dataset.py +92 -0
- mlflow/entities/dataset_input.py +51 -0
- mlflow/entities/dataset_summary.py +62 -0
- mlflow/entities/document.py +48 -0
- mlflow/entities/experiment.py +109 -0
- mlflow/entities/experiment_tag.py +35 -0
- mlflow/entities/file_info.py +45 -0
- mlflow/entities/input_tag.py +35 -0
- mlflow/entities/lifecycle_stage.py +35 -0
- mlflow/entities/logged_model.py +228 -0
- mlflow/entities/logged_model_input.py +26 -0
- mlflow/entities/logged_model_output.py +32 -0
- mlflow/entities/logged_model_parameter.py +46 -0
- mlflow/entities/logged_model_status.py +74 -0
- mlflow/entities/logged_model_tag.py +33 -0
- mlflow/entities/metric.py +200 -0
- mlflow/entities/model_registry/__init__.py +29 -0
- mlflow/entities/model_registry/_model_registry_entity.py +13 -0
- mlflow/entities/model_registry/model_version.py +243 -0
- mlflow/entities/model_registry/model_version_deployment_job_run_state.py +44 -0
- mlflow/entities/model_registry/model_version_deployment_job_state.py +70 -0
- mlflow/entities/model_registry/model_version_search.py +25 -0
- mlflow/entities/model_registry/model_version_stages.py +25 -0
- mlflow/entities/model_registry/model_version_status.py +35 -0
- mlflow/entities/model_registry/model_version_tag.py +35 -0
- mlflow/entities/model_registry/prompt.py +73 -0
- mlflow/entities/model_registry/prompt_version.py +244 -0
- mlflow/entities/model_registry/registered_model.py +175 -0
- mlflow/entities/model_registry/registered_model_alias.py +35 -0
- mlflow/entities/model_registry/registered_model_deployment_job_state.py +39 -0
- mlflow/entities/model_registry/registered_model_search.py +25 -0
- mlflow/entities/model_registry/registered_model_tag.py +35 -0
- mlflow/entities/multipart_upload.py +74 -0
- mlflow/entities/param.py +49 -0
- mlflow/entities/run.py +97 -0
- mlflow/entities/run_data.py +84 -0
- mlflow/entities/run_info.py +188 -0
- mlflow/entities/run_inputs.py +59 -0
- mlflow/entities/run_outputs.py +43 -0
- mlflow/entities/run_status.py +41 -0
- mlflow/entities/run_tag.py +36 -0
- mlflow/entities/source_type.py +31 -0
- mlflow/entities/span.py +774 -0
- mlflow/entities/span_event.py +96 -0
- mlflow/entities/span_status.py +102 -0
- mlflow/entities/trace.py +317 -0
- mlflow/entities/trace_data.py +71 -0
- mlflow/entities/trace_info.py +220 -0
- mlflow/entities/trace_info_v2.py +162 -0
- mlflow/entities/trace_location.py +173 -0
- mlflow/entities/trace_state.py +39 -0
- mlflow/entities/trace_status.py +68 -0
- mlflow/entities/view_type.py +51 -0
- mlflow/environment_variables.py +866 -0
- mlflow/evaluation/__init__.py +16 -0
- mlflow/evaluation/assessment.py +369 -0
- mlflow/evaluation/evaluation.py +411 -0
- mlflow/evaluation/evaluation_tag.py +61 -0
- mlflow/evaluation/fluent.py +48 -0
- mlflow/evaluation/utils.py +201 -0
- mlflow/exceptions.py +213 -0
- mlflow/experiments.py +140 -0
- mlflow/gemini/__init__.py +81 -0
- mlflow/gemini/autolog.py +186 -0
- mlflow/gemini/chat.py +261 -0
- mlflow/genai/__init__.py +71 -0
- mlflow/genai/datasets/__init__.py +67 -0
- mlflow/genai/datasets/evaluation_dataset.py +131 -0
- mlflow/genai/evaluation/__init__.py +3 -0
- mlflow/genai/evaluation/base.py +411 -0
- mlflow/genai/evaluation/constant.py +23 -0
- mlflow/genai/evaluation/utils.py +244 -0
- mlflow/genai/judges/__init__.py +21 -0
- mlflow/genai/judges/databricks.py +404 -0
- mlflow/genai/label_schemas/__init__.py +153 -0
- mlflow/genai/label_schemas/label_schemas.py +209 -0
- mlflow/genai/labeling/__init__.py +159 -0
- mlflow/genai/labeling/labeling.py +250 -0
- mlflow/genai/optimize/__init__.py +13 -0
- mlflow/genai/optimize/base.py +198 -0
- mlflow/genai/optimize/optimizers/__init__.py +4 -0
- mlflow/genai/optimize/optimizers/base_optimizer.py +38 -0
- mlflow/genai/optimize/optimizers/dspy_mipro_optimizer.py +221 -0
- mlflow/genai/optimize/optimizers/dspy_optimizer.py +91 -0
- mlflow/genai/optimize/optimizers/utils/dspy_mipro_callback.py +76 -0
- mlflow/genai/optimize/optimizers/utils/dspy_mipro_utils.py +18 -0
- mlflow/genai/optimize/types.py +75 -0
- mlflow/genai/optimize/util.py +30 -0
- mlflow/genai/prompts/__init__.py +206 -0
- mlflow/genai/scheduled_scorers.py +431 -0
- mlflow/genai/scorers/__init__.py +26 -0
- mlflow/genai/scorers/base.py +492 -0
- mlflow/genai/scorers/builtin_scorers.py +765 -0
- mlflow/genai/scorers/scorer_utils.py +138 -0
- mlflow/genai/scorers/validation.py +165 -0
- mlflow/genai/utils/data_validation.py +146 -0
- mlflow/genai/utils/enum_utils.py +23 -0
- mlflow/genai/utils/trace_utils.py +211 -0
- mlflow/groq/__init__.py +42 -0
- mlflow/groq/_groq_autolog.py +74 -0
- mlflow/johnsnowlabs/__init__.py +888 -0
- mlflow/langchain/__init__.py +24 -0
- mlflow/langchain/api_request_parallel_processor.py +330 -0
- mlflow/langchain/autolog.py +147 -0
- mlflow/langchain/chat_agent_langgraph.py +340 -0
- mlflow/langchain/constant.py +1 -0
- mlflow/langchain/constants.py +1 -0
- mlflow/langchain/databricks_dependencies.py +444 -0
- mlflow/langchain/langchain_tracer.py +597 -0
- mlflow/langchain/model.py +919 -0
- mlflow/langchain/output_parsers.py +142 -0
- mlflow/langchain/retriever_chain.py +153 -0
- mlflow/langchain/runnables.py +527 -0
- mlflow/langchain/utils/chat.py +402 -0
- mlflow/langchain/utils/logging.py +671 -0
- mlflow/langchain/utils/serialization.py +36 -0
- mlflow/legacy_databricks_cli/__init__.py +0 -0
- mlflow/legacy_databricks_cli/configure/__init__.py +0 -0
- mlflow/legacy_databricks_cli/configure/provider.py +482 -0
- mlflow/litellm/__init__.py +175 -0
- mlflow/llama_index/__init__.py +22 -0
- mlflow/llama_index/autolog.py +55 -0
- mlflow/llama_index/chat.py +43 -0
- mlflow/llama_index/constant.py +1 -0
- mlflow/llama_index/model.py +577 -0
- mlflow/llama_index/pyfunc_wrapper.py +332 -0
- mlflow/llama_index/serialize_objects.py +188 -0
- mlflow/llama_index/tracer.py +561 -0
- mlflow/metrics/__init__.py +479 -0
- mlflow/metrics/base.py +39 -0
- mlflow/metrics/genai/__init__.py +25 -0
- mlflow/metrics/genai/base.py +101 -0
- mlflow/metrics/genai/genai_metric.py +771 -0
- mlflow/metrics/genai/metric_definitions.py +450 -0
- mlflow/metrics/genai/model_utils.py +371 -0
- mlflow/metrics/genai/prompt_template.py +68 -0
- mlflow/metrics/genai/prompts/__init__.py +0 -0
- mlflow/metrics/genai/prompts/v1.py +422 -0
- mlflow/metrics/genai/utils.py +6 -0
- mlflow/metrics/metric_definitions.py +619 -0
- mlflow/mismatch.py +34 -0
- mlflow/mistral/__init__.py +34 -0
- mlflow/mistral/autolog.py +71 -0
- mlflow/mistral/chat.py +135 -0
- mlflow/ml_package_versions.py +452 -0
- mlflow/models/__init__.py +97 -0
- mlflow/models/auth_policy.py +83 -0
- mlflow/models/cli.py +354 -0
- mlflow/models/container/__init__.py +294 -0
- mlflow/models/container/scoring_server/__init__.py +0 -0
- mlflow/models/container/scoring_server/nginx.conf +39 -0
- mlflow/models/dependencies_schemas.py +287 -0
- mlflow/models/display_utils.py +158 -0
- mlflow/models/docker_utils.py +211 -0
- mlflow/models/evaluation/__init__.py +23 -0
- mlflow/models/evaluation/_shap_patch.py +64 -0
- mlflow/models/evaluation/artifacts.py +194 -0
- mlflow/models/evaluation/base.py +1811 -0
- mlflow/models/evaluation/calibration_curve.py +109 -0
- mlflow/models/evaluation/default_evaluator.py +996 -0
- mlflow/models/evaluation/deprecated.py +23 -0
- mlflow/models/evaluation/evaluator_registry.py +80 -0
- mlflow/models/evaluation/evaluators/classifier.py +704 -0
- mlflow/models/evaluation/evaluators/default.py +233 -0
- mlflow/models/evaluation/evaluators/regressor.py +96 -0
- mlflow/models/evaluation/evaluators/shap.py +296 -0
- mlflow/models/evaluation/lift_curve.py +178 -0
- mlflow/models/evaluation/utils/metric.py +123 -0
- mlflow/models/evaluation/utils/trace.py +179 -0
- mlflow/models/evaluation/validation.py +434 -0
- mlflow/models/flavor_backend.py +93 -0
- mlflow/models/flavor_backend_registry.py +53 -0
- mlflow/models/model.py +1639 -0
- mlflow/models/model_config.py +150 -0
- mlflow/models/notebook_resources/agent_evaluation_template.html +235 -0
- mlflow/models/notebook_resources/eval_with_dataset_example.py +22 -0
- mlflow/models/notebook_resources/eval_with_synthetic_example.py +22 -0
- mlflow/models/python_api.py +369 -0
- mlflow/models/rag_signatures.py +128 -0
- mlflow/models/resources.py +321 -0
- mlflow/models/signature.py +662 -0
- mlflow/models/utils.py +2054 -0
- mlflow/models/wheeled_model.py +280 -0
- mlflow/openai/__init__.py +57 -0
- mlflow/openai/_agent_tracer.py +364 -0
- mlflow/openai/api_request_parallel_processor.py +131 -0
- mlflow/openai/autolog.py +509 -0
- mlflow/openai/constant.py +1 -0
- mlflow/openai/model.py +824 -0
- mlflow/openai/utils/chat_schema.py +367 -0
- mlflow/optuna/__init__.py +3 -0
- mlflow/optuna/storage.py +646 -0
- mlflow/plugins/__init__.py +72 -0
- mlflow/plugins/base.py +358 -0
- mlflow/plugins/builtin/__init__.py +24 -0
- mlflow/plugins/builtin/pytorch_plugin.py +150 -0
- mlflow/plugins/builtin/sklearn_plugin.py +158 -0
- mlflow/plugins/builtin/transformers_plugin.py +187 -0
- mlflow/plugins/cli.py +321 -0
- mlflow/plugins/discovery.py +340 -0
- mlflow/plugins/manager.py +465 -0
- mlflow/plugins/registry.py +316 -0
- mlflow/plugins/templates/framework_plugin_template.py +329 -0
- mlflow/prompt/constants.py +20 -0
- mlflow/prompt/promptlab_model.py +197 -0
- mlflow/prompt/registry_utils.py +248 -0
- mlflow/promptflow/__init__.py +495 -0
- mlflow/protos/__init__.py +0 -0
- mlflow/protos/assessments_pb2.py +174 -0
- mlflow/protos/databricks_artifacts_pb2.py +489 -0
- mlflow/protos/databricks_filesystem_service_pb2.py +196 -0
- mlflow/protos/databricks_managed_catalog_messages_pb2.py +95 -0
- mlflow/protos/databricks_managed_catalog_service_pb2.py +86 -0
- mlflow/protos/databricks_pb2.py +267 -0
- mlflow/protos/databricks_trace_server_pb2.py +374 -0
- mlflow/protos/databricks_uc_registry_messages_pb2.py +1249 -0
- mlflow/protos/databricks_uc_registry_service_pb2.py +170 -0
- mlflow/protos/facet_feature_statistics_pb2.py +296 -0
- mlflow/protos/internal_pb2.py +77 -0
- mlflow/protos/mlflow_artifacts_pb2.py +336 -0
- mlflow/protos/model_registry_pb2.py +1073 -0
- mlflow/protos/scalapb/__init__.py +0 -0
- mlflow/protos/scalapb/scalapb_pb2.py +104 -0
- mlflow/protos/service_pb2.py +2600 -0
- mlflow/protos/unity_catalog_oss_messages_pb2.py +457 -0
- mlflow/protos/unity_catalog_oss_service_pb2.py +130 -0
- mlflow/protos/unity_catalog_prompt_messages_pb2.py +447 -0
- mlflow/protos/unity_catalog_prompt_messages_pb2_grpc.py +24 -0
- mlflow/protos/unity_catalog_prompt_service_pb2.py +164 -0
- mlflow/protos/unity_catalog_prompt_service_pb2_grpc.py +785 -0
- mlflow/py.typed +0 -0
- mlflow/pydantic_ai/__init__.py +57 -0
- mlflow/pydantic_ai/autolog.py +173 -0
- mlflow/pyfunc/__init__.py +3844 -0
- mlflow/pyfunc/_mlflow_pyfunc_backend_predict.py +61 -0
- mlflow/pyfunc/backend.py +523 -0
- mlflow/pyfunc/context.py +78 -0
- mlflow/pyfunc/dbconnect_artifact_cache.py +144 -0
- mlflow/pyfunc/loaders/__init__.py +7 -0
- mlflow/pyfunc/loaders/chat_agent.py +117 -0
- mlflow/pyfunc/loaders/chat_model.py +125 -0
- mlflow/pyfunc/loaders/code_model.py +31 -0
- mlflow/pyfunc/loaders/responses_agent.py +112 -0
- mlflow/pyfunc/mlserver.py +46 -0
- mlflow/pyfunc/model.py +1473 -0
- mlflow/pyfunc/scoring_server/__init__.py +604 -0
- mlflow/pyfunc/scoring_server/app.py +7 -0
- mlflow/pyfunc/scoring_server/client.py +146 -0
- mlflow/pyfunc/spark_model_cache.py +48 -0
- mlflow/pyfunc/stdin_server.py +44 -0
- mlflow/pyfunc/utils/__init__.py +3 -0
- mlflow/pyfunc/utils/data_validation.py +224 -0
- mlflow/pyfunc/utils/environment.py +22 -0
- mlflow/pyfunc/utils/input_converter.py +47 -0
- mlflow/pyfunc/utils/serving_data_parser.py +11 -0
- mlflow/pytorch/__init__.py +1171 -0
- mlflow/pytorch/_lightning_autolog.py +580 -0
- mlflow/pytorch/_pytorch_autolog.py +50 -0
- mlflow/pytorch/pickle_module.py +35 -0
- mlflow/rfunc/__init__.py +42 -0
- mlflow/rfunc/backend.py +134 -0
- mlflow/runs.py +89 -0
- mlflow/server/__init__.py +302 -0
- mlflow/server/auth/__init__.py +1224 -0
- mlflow/server/auth/__main__.py +4 -0
- mlflow/server/auth/basic_auth.ini +6 -0
- mlflow/server/auth/cli.py +11 -0
- mlflow/server/auth/client.py +537 -0
- mlflow/server/auth/config.py +34 -0
- mlflow/server/auth/db/__init__.py +0 -0
- mlflow/server/auth/db/cli.py +18 -0
- mlflow/server/auth/db/migrations/__init__.py +0 -0
- mlflow/server/auth/db/migrations/alembic.ini +110 -0
- mlflow/server/auth/db/migrations/env.py +76 -0
- mlflow/server/auth/db/migrations/versions/8606fa83a998_initial_migration.py +51 -0
- mlflow/server/auth/db/migrations/versions/__init__.py +0 -0
- mlflow/server/auth/db/models.py +67 -0
- mlflow/server/auth/db/utils.py +37 -0
- mlflow/server/auth/entities.py +165 -0
- mlflow/server/auth/logo.py +14 -0
- mlflow/server/auth/permissions.py +65 -0
- mlflow/server/auth/routes.py +18 -0
- mlflow/server/auth/sqlalchemy_store.py +263 -0
- mlflow/server/graphql/__init__.py +0 -0
- mlflow/server/graphql/autogenerated_graphql_schema.py +353 -0
- mlflow/server/graphql/graphql_custom_scalars.py +24 -0
- mlflow/server/graphql/graphql_errors.py +15 -0
- mlflow/server/graphql/graphql_no_batching.py +89 -0
- mlflow/server/graphql/graphql_schema_extensions.py +74 -0
- mlflow/server/handlers.py +3217 -0
- mlflow/server/prometheus_exporter.py +17 -0
- mlflow/server/validation.py +30 -0
- mlflow/shap/__init__.py +691 -0
- mlflow/sklearn/__init__.py +1994 -0
- mlflow/sklearn/utils.py +1041 -0
- mlflow/smolagents/__init__.py +66 -0
- mlflow/smolagents/autolog.py +139 -0
- mlflow/smolagents/chat.py +29 -0
- mlflow/store/__init__.py +10 -0
- mlflow/store/_unity_catalog/__init__.py +1 -0
- mlflow/store/_unity_catalog/lineage/__init__.py +1 -0
- mlflow/store/_unity_catalog/lineage/constants.py +2 -0
- mlflow/store/_unity_catalog/registry/__init__.py +6 -0
- mlflow/store/_unity_catalog/registry/prompt_info.py +75 -0
- mlflow/store/_unity_catalog/registry/rest_store.py +1740 -0
- mlflow/store/_unity_catalog/registry/uc_oss_rest_store.py +507 -0
- mlflow/store/_unity_catalog/registry/utils.py +121 -0
- mlflow/store/artifact/__init__.py +0 -0
- mlflow/store/artifact/artifact_repo.py +472 -0
- mlflow/store/artifact/artifact_repository_registry.py +154 -0
- mlflow/store/artifact/azure_blob_artifact_repo.py +275 -0
- mlflow/store/artifact/azure_data_lake_artifact_repo.py +295 -0
- mlflow/store/artifact/cli.py +141 -0
- mlflow/store/artifact/cloud_artifact_repo.py +332 -0
- mlflow/store/artifact/databricks_artifact_repo.py +729 -0
- mlflow/store/artifact/databricks_artifact_repo_resources.py +301 -0
- mlflow/store/artifact/databricks_logged_model_artifact_repo.py +93 -0
- mlflow/store/artifact/databricks_models_artifact_repo.py +216 -0
- mlflow/store/artifact/databricks_sdk_artifact_repo.py +134 -0
- mlflow/store/artifact/databricks_sdk_models_artifact_repo.py +97 -0
- mlflow/store/artifact/dbfs_artifact_repo.py +240 -0
- mlflow/store/artifact/ftp_artifact_repo.py +132 -0
- mlflow/store/artifact/gcs_artifact_repo.py +296 -0
- mlflow/store/artifact/hdfs_artifact_repo.py +209 -0
- mlflow/store/artifact/http_artifact_repo.py +218 -0
- mlflow/store/artifact/local_artifact_repo.py +142 -0
- mlflow/store/artifact/mlflow_artifacts_repo.py +94 -0
- mlflow/store/artifact/models_artifact_repo.py +259 -0
- mlflow/store/artifact/optimized_s3_artifact_repo.py +356 -0
- mlflow/store/artifact/presigned_url_artifact_repo.py +173 -0
- mlflow/store/artifact/r2_artifact_repo.py +70 -0
- mlflow/store/artifact/runs_artifact_repo.py +265 -0
- mlflow/store/artifact/s3_artifact_repo.py +330 -0
- mlflow/store/artifact/sftp_artifact_repo.py +141 -0
- mlflow/store/artifact/uc_volume_artifact_repo.py +76 -0
- mlflow/store/artifact/unity_catalog_models_artifact_repo.py +168 -0
- mlflow/store/artifact/unity_catalog_oss_models_artifact_repo.py +168 -0
- mlflow/store/artifact/utils/__init__.py +0 -0
- mlflow/store/artifact/utils/models.py +148 -0
- mlflow/store/db/__init__.py +0 -0
- mlflow/store/db/base_sql_model.py +3 -0
- mlflow/store/db/db_types.py +10 -0
- mlflow/store/db/utils.py +314 -0
- mlflow/store/db_migrations/__init__.py +0 -0
- mlflow/store/db_migrations/alembic.ini +74 -0
- mlflow/store/db_migrations/env.py +84 -0
- mlflow/store/db_migrations/versions/0584bdc529eb_add_cascading_deletion_to_datasets_from_experiments.py +88 -0
- mlflow/store/db_migrations/versions/0a8213491aaa_drop_duplicate_killed_constraint.py +49 -0
- mlflow/store/db_migrations/versions/0c779009ac13_add_deleted_time_field_to_runs_table.py +24 -0
- mlflow/store/db_migrations/versions/181f10493468_allow_nulls_for_metric_values.py +35 -0
- mlflow/store/db_migrations/versions/27a6a02d2cf1_add_model_version_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/2b4d017a5e9b_add_model_registry_tables_to_db.py +77 -0
- mlflow/store/db_migrations/versions/2d6e25af4d3e_increase_max_param_val_length.py +33 -0
- mlflow/store/db_migrations/versions/3500859a5d39_add_model_aliases_table.py +50 -0
- mlflow/store/db_migrations/versions/39d1c3be5f05_add_is_nan_constraint_for_metrics_tables_if_necessary.py +41 -0
- mlflow/store/db_migrations/versions/400f98739977_add_logged_model_tables.py +123 -0
- mlflow/store/db_migrations/versions/4465047574b1_increase_max_dataset_schema_size.py +38 -0
- mlflow/store/db_migrations/versions/451aebb31d03_add_metric_step.py +35 -0
- mlflow/store/db_migrations/versions/5b0e9adcef9c_add_cascade_deletion_to_trace_tables_fk.py +40 -0
- mlflow/store/db_migrations/versions/6953534de441_add_step_to_inputs_table.py +25 -0
- mlflow/store/db_migrations/versions/728d730b5ebd_add_registered_model_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/7ac759974ad8_update_run_tags_with_larger_limit.py +36 -0
- mlflow/store/db_migrations/versions/7f2a7d5fae7d_add_datasets_inputs_input_tags_tables.py +82 -0
- mlflow/store/db_migrations/versions/84291f40a231_add_run_link_to_model_version.py +26 -0
- mlflow/store/db_migrations/versions/867495a8f9d4_add_trace_tables.py +90 -0
- mlflow/store/db_migrations/versions/89d4b8295536_create_latest_metrics_table.py +169 -0
- mlflow/store/db_migrations/versions/90e64c465722_migrate_user_column_to_tags.py +64 -0
- mlflow/store/db_migrations/versions/97727af70f4d_creation_time_last_update_time_experiments.py +25 -0
- mlflow/store/db_migrations/versions/__init__.py +0 -0
- mlflow/store/db_migrations/versions/a8c4a736bde6_allow_nulls_for_run_id.py +27 -0
- mlflow/store/db_migrations/versions/acf3f17fdcc7_add_storage_location_field_to_model_.py +29 -0
- mlflow/store/db_migrations/versions/bd07f7e963c5_create_index_on_run_uuid.py +26 -0
- mlflow/store/db_migrations/versions/bda7b8c39065_increase_model_version_tag_value_limit.py +38 -0
- mlflow/store/db_migrations/versions/c48cb773bb87_reset_default_value_for_is_nan_in_metrics_table_for_mysql.py +41 -0
- mlflow/store/db_migrations/versions/cbc13b556ace_add_v3_trace_schema_columns.py +31 -0
- mlflow/store/db_migrations/versions/cc1f77228345_change_param_value_length_to_500.py +34 -0
- mlflow/store/db_migrations/versions/cfd24bdc0731_update_run_status_constraint_with_killed.py +78 -0
- mlflow/store/db_migrations/versions/df50e92ffc5e_add_experiment_tags_table.py +38 -0
- mlflow/store/db_migrations/versions/f5a4f2784254_increase_run_tag_value_limit.py +36 -0
- mlflow/store/entities/__init__.py +3 -0
- mlflow/store/entities/paged_list.py +18 -0
- mlflow/store/model_registry/__init__.py +10 -0
- mlflow/store/model_registry/abstract_store.py +1081 -0
- mlflow/store/model_registry/base_rest_store.py +44 -0
- mlflow/store/model_registry/databricks_workspace_model_registry_rest_store.py +37 -0
- mlflow/store/model_registry/dbmodels/__init__.py +0 -0
- mlflow/store/model_registry/dbmodels/models.py +206 -0
- mlflow/store/model_registry/file_store.py +1091 -0
- mlflow/store/model_registry/rest_store.py +481 -0
- mlflow/store/model_registry/sqlalchemy_store.py +1286 -0
- mlflow/store/tracking/__init__.py +23 -0
- mlflow/store/tracking/abstract_store.py +816 -0
- mlflow/store/tracking/dbmodels/__init__.py +0 -0
- mlflow/store/tracking/dbmodels/initial_models.py +243 -0
- mlflow/store/tracking/dbmodels/models.py +1073 -0
- mlflow/store/tracking/file_store.py +2438 -0
- mlflow/store/tracking/postgres_managed_identity.py +146 -0
- mlflow/store/tracking/rest_store.py +1131 -0
- mlflow/store/tracking/sqlalchemy_store.py +2785 -0
- mlflow/system_metrics/__init__.py +61 -0
- mlflow/system_metrics/metrics/__init__.py +0 -0
- mlflow/system_metrics/metrics/base_metrics_monitor.py +32 -0
- mlflow/system_metrics/metrics/cpu_monitor.py +23 -0
- mlflow/system_metrics/metrics/disk_monitor.py +21 -0
- mlflow/system_metrics/metrics/gpu_monitor.py +71 -0
- mlflow/system_metrics/metrics/network_monitor.py +34 -0
- mlflow/system_metrics/metrics/rocm_monitor.py +123 -0
- mlflow/system_metrics/system_metrics_monitor.py +198 -0
- mlflow/tracing/__init__.py +16 -0
- mlflow/tracing/assessment.py +356 -0
- mlflow/tracing/client.py +531 -0
- mlflow/tracing/config.py +125 -0
- mlflow/tracing/constant.py +105 -0
- mlflow/tracing/destination.py +81 -0
- mlflow/tracing/display/__init__.py +40 -0
- mlflow/tracing/display/display_handler.py +196 -0
- mlflow/tracing/export/async_export_queue.py +186 -0
- mlflow/tracing/export/inference_table.py +138 -0
- mlflow/tracing/export/mlflow_v3.py +137 -0
- mlflow/tracing/export/utils.py +70 -0
- mlflow/tracing/fluent.py +1417 -0
- mlflow/tracing/processor/base_mlflow.py +199 -0
- mlflow/tracing/processor/inference_table.py +175 -0
- mlflow/tracing/processor/mlflow_v3.py +47 -0
- mlflow/tracing/processor/otel.py +73 -0
- mlflow/tracing/provider.py +487 -0
- mlflow/tracing/trace_manager.py +200 -0
- mlflow/tracing/utils/__init__.py +616 -0
- mlflow/tracing/utils/artifact_utils.py +28 -0
- mlflow/tracing/utils/copy.py +55 -0
- mlflow/tracing/utils/environment.py +55 -0
- mlflow/tracing/utils/exception.py +21 -0
- mlflow/tracing/utils/once.py +35 -0
- mlflow/tracing/utils/otlp.py +63 -0
- mlflow/tracing/utils/processor.py +54 -0
- mlflow/tracing/utils/search.py +292 -0
- mlflow/tracing/utils/timeout.py +250 -0
- mlflow/tracing/utils/token.py +19 -0
- mlflow/tracing/utils/truncation.py +124 -0
- mlflow/tracing/utils/warning.py +76 -0
- mlflow/tracking/__init__.py +39 -0
- mlflow/tracking/_model_registry/__init__.py +1 -0
- mlflow/tracking/_model_registry/client.py +764 -0
- mlflow/tracking/_model_registry/fluent.py +853 -0
- mlflow/tracking/_model_registry/registry.py +67 -0
- mlflow/tracking/_model_registry/utils.py +251 -0
- mlflow/tracking/_tracking_service/__init__.py +0 -0
- mlflow/tracking/_tracking_service/client.py +883 -0
- mlflow/tracking/_tracking_service/registry.py +56 -0
- mlflow/tracking/_tracking_service/utils.py +275 -0
- mlflow/tracking/artifact_utils.py +179 -0
- mlflow/tracking/client.py +5900 -0
- mlflow/tracking/context/__init__.py +0 -0
- mlflow/tracking/context/abstract_context.py +35 -0
- mlflow/tracking/context/databricks_cluster_context.py +15 -0
- mlflow/tracking/context/databricks_command_context.py +15 -0
- mlflow/tracking/context/databricks_job_context.py +49 -0
- mlflow/tracking/context/databricks_notebook_context.py +41 -0
- mlflow/tracking/context/databricks_repo_context.py +43 -0
- mlflow/tracking/context/default_context.py +51 -0
- mlflow/tracking/context/git_context.py +32 -0
- mlflow/tracking/context/registry.py +98 -0
- mlflow/tracking/context/system_environment_context.py +15 -0
- mlflow/tracking/default_experiment/__init__.py +1 -0
- mlflow/tracking/default_experiment/abstract_context.py +43 -0
- mlflow/tracking/default_experiment/databricks_notebook_experiment_provider.py +44 -0
- mlflow/tracking/default_experiment/registry.py +75 -0
- mlflow/tracking/fluent.py +3595 -0
- mlflow/tracking/metric_value_conversion_utils.py +93 -0
- mlflow/tracking/multimedia.py +206 -0
- mlflow/tracking/registry.py +86 -0
- mlflow/tracking/request_auth/__init__.py +0 -0
- mlflow/tracking/request_auth/abstract_request_auth_provider.py +34 -0
- mlflow/tracking/request_auth/registry.py +60 -0
- mlflow/tracking/request_header/__init__.py +0 -0
- mlflow/tracking/request_header/abstract_request_header_provider.py +36 -0
- mlflow/tracking/request_header/databricks_request_header_provider.py +38 -0
- mlflow/tracking/request_header/default_request_header_provider.py +17 -0
- mlflow/tracking/request_header/registry.py +79 -0
- mlflow/transformers/__init__.py +2982 -0
- mlflow/transformers/flavor_config.py +258 -0
- mlflow/transformers/hub_utils.py +83 -0
- mlflow/transformers/llm_inference_utils.py +468 -0
- mlflow/transformers/model_io.py +301 -0
- mlflow/transformers/peft.py +51 -0
- mlflow/transformers/signature.py +183 -0
- mlflow/transformers/torch_utils.py +55 -0
- mlflow/types/__init__.py +21 -0
- mlflow/types/agent.py +270 -0
- mlflow/types/chat.py +240 -0
- mlflow/types/llm.py +935 -0
- mlflow/types/responses.py +139 -0
- mlflow/types/responses_helpers.py +416 -0
- mlflow/types/schema.py +1505 -0
- mlflow/types/type_hints.py +647 -0
- mlflow/types/utils.py +753 -0
- mlflow/utils/__init__.py +283 -0
- mlflow/utils/_capture_modules.py +256 -0
- mlflow/utils/_capture_transformers_modules.py +75 -0
- mlflow/utils/_spark_utils.py +201 -0
- mlflow/utils/_unity_catalog_oss_utils.py +97 -0
- mlflow/utils/_unity_catalog_utils.py +479 -0
- mlflow/utils/annotations.py +218 -0
- mlflow/utils/arguments_utils.py +16 -0
- mlflow/utils/async_logging/__init__.py +1 -0
- mlflow/utils/async_logging/async_artifacts_logging_queue.py +258 -0
- mlflow/utils/async_logging/async_logging_queue.py +366 -0
- mlflow/utils/async_logging/run_artifact.py +38 -0
- mlflow/utils/async_logging/run_batch.py +58 -0
- mlflow/utils/async_logging/run_operations.py +49 -0
- mlflow/utils/autologging_utils/__init__.py +737 -0
- mlflow/utils/autologging_utils/client.py +432 -0
- mlflow/utils/autologging_utils/config.py +33 -0
- mlflow/utils/autologging_utils/events.py +294 -0
- mlflow/utils/autologging_utils/logging_and_warnings.py +328 -0
- mlflow/utils/autologging_utils/metrics_queue.py +71 -0
- mlflow/utils/autologging_utils/safety.py +1104 -0
- mlflow/utils/autologging_utils/versioning.py +95 -0
- mlflow/utils/checkpoint_utils.py +206 -0
- mlflow/utils/class_utils.py +6 -0
- mlflow/utils/cli_args.py +257 -0
- mlflow/utils/conda.py +354 -0
- mlflow/utils/credentials.py +231 -0
- mlflow/utils/data_utils.py +17 -0
- mlflow/utils/databricks_utils.py +1436 -0
- mlflow/utils/docstring_utils.py +477 -0
- mlflow/utils/doctor.py +133 -0
- mlflow/utils/download_cloud_file_chunk.py +43 -0
- mlflow/utils/env_manager.py +16 -0
- mlflow/utils/env_pack.py +131 -0
- mlflow/utils/environment.py +1009 -0
- mlflow/utils/exception_utils.py +14 -0
- mlflow/utils/file_utils.py +978 -0
- mlflow/utils/git_utils.py +77 -0
- mlflow/utils/gorilla.py +797 -0
- mlflow/utils/import_hooks/__init__.py +363 -0
- mlflow/utils/lazy_load.py +51 -0
- mlflow/utils/logging_utils.py +168 -0
- mlflow/utils/mime_type_utils.py +58 -0
- mlflow/utils/mlflow_tags.py +103 -0
- mlflow/utils/model_utils.py +486 -0
- mlflow/utils/name_utils.py +346 -0
- mlflow/utils/nfs_on_spark.py +62 -0
- mlflow/utils/openai_utils.py +164 -0
- mlflow/utils/os.py +12 -0
- mlflow/utils/oss_registry_utils.py +29 -0
- mlflow/utils/plugins.py +17 -0
- mlflow/utils/process.py +182 -0
- mlflow/utils/promptlab_utils.py +146 -0
- mlflow/utils/proto_json_utils.py +743 -0
- mlflow/utils/pydantic_utils.py +54 -0
- mlflow/utils/request_utils.py +279 -0
- mlflow/utils/requirements_utils.py +704 -0
- mlflow/utils/rest_utils.py +673 -0
- mlflow/utils/search_logged_model_utils.py +127 -0
- mlflow/utils/search_utils.py +2111 -0
- mlflow/utils/secure_loading.py +221 -0
- mlflow/utils/security_validation.py +384 -0
- mlflow/utils/server_cli_utils.py +61 -0
- mlflow/utils/spark_utils.py +15 -0
- mlflow/utils/string_utils.py +138 -0
- mlflow/utils/thread_utils.py +63 -0
- mlflow/utils/time.py +54 -0
- mlflow/utils/timeout.py +42 -0
- mlflow/utils/uri.py +572 -0
- mlflow/utils/validation.py +662 -0
- mlflow/utils/virtualenv.py +458 -0
- mlflow/utils/warnings_utils.py +25 -0
- mlflow/utils/yaml_utils.py +179 -0
- mlflow/version.py +24 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
"""System metrics logging module."""
|
2
|
+
|
3
|
+
from mlflow.environment_variables import (
|
4
|
+
MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING,
|
5
|
+
MLFLOW_SYSTEM_METRICS_NODE_ID,
|
6
|
+
MLFLOW_SYSTEM_METRICS_SAMPLES_BEFORE_LOGGING,
|
7
|
+
MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
def disable_system_metrics_logging():
    """Turn off system metrics logging as the global setting.

    Only the global setting is changed here. A single run can still opt in to
    system metrics logging via `mlflow.start_run(log_system_metrics=True)`.
    """
    enabled = False
    MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING.set(enabled)
|
18
|
+
|
19
|
+
|
20
|
+
def enable_system_metrics_logging():
    """Turn system metrics logging on globally.

    Only the global switch is changed: an individual run can still opt out of system metrics
    logging with `mlflow.start_run(log_system_metrics=False)`.
    """
    MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING.set(True)
|
27
|
+
|
28
|
+
|
29
|
+
def set_system_metrics_sampling_interval(interval):
    """Configure how often system metrics are sampled.

    Args:
        interval: Number of seconds between two consecutive collections of system metrics
            (documented default: `interval=10`). Passing `None` unsets the configured value.
    """
    setting = MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL
    if interval is not None:
        setting.set(interval)
        return
    # `None` means "forget any explicit value".
    setting.unset()
|
38
|
+
|
39
|
+
|
40
|
+
def set_system_metrics_samples_before_logging(samples):
    """Configure how many samples are collected before system metrics are logged.

    Args:
        samples: Number of collected samples after which the system metrics are logged to
            mlflow (documented default: `samples=1`). Passing `None` unsets the configured
            value.
    """
    setting = MLFLOW_SYSTEM_METRICS_SAMPLES_BEFORE_LOGGING
    if samples is not None:
        setting.set(samples)
        return
    # `None` means "forget any explicit value".
    setting.unset()
|
50
|
+
|
51
|
+
|
52
|
+
def set_system_metrics_node_id(node_id):
    """Configure the node id attached to system metrics.

    Args:
        node_id: Identifier of the machine on which the metrics are collected, which is useful
            in a multi-node (distributed training) setup. Passing `None` unsets the configured
            value.
    """
    setting = MLFLOW_SYSTEM_METRICS_NODE_ID
    if node_id is not None:
        setting.set(node_id)
        return
    # `None` means "forget any explicit value".
    setting.unset()
|
File without changes
|
@@ -0,0 +1,32 @@
|
|
1
|
+
"""Base class of system metrics monitor."""
|
2
|
+
|
3
|
+
import abc
|
4
|
+
from collections import defaultdict
|
5
|
+
|
6
|
+
|
7
|
+
class BaseMetricsMonitor(abc.ABC):
    """Abstract interface shared by the system metrics monitors.

    Collected samples live in `self._metrics`, a `defaultdict(list)` keyed by metric name.
    Concrete monitors must implement `collect_metrics` and `aggregate_metrics`.
    """

    def __init__(self):
        # Looking up a metric name that has not been seen yet yields a fresh empty list.
        self._metrics = defaultdict(list)

    @property
    def metrics(self):
        """The underlying metrics mapping (the live object, not a copy)."""
        return self._metrics

    def clear_metrics(self):
        """Remove every collected metric from the mapping."""
        self._metrics.clear()

    @abc.abstractmethod
    def collect_metrics(self):
        """Collect metrics.

        Implementations are expected to gather metrics and store them in `self._metrics`.
        """

    @abc.abstractmethod
    def aggregate_metrics(self):
        """Aggregate metrics.

        Implementations are expected to aggregate the collected metrics and return them as a
        dict.
        """
|
@@ -0,0 +1,23 @@
|
|
1
|
+
"""Class for monitoring CPU stats."""
|
2
|
+
|
3
|
+
import psutil
|
4
|
+
|
5
|
+
from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor
|
6
|
+
|
7
|
+
|
8
|
+
class CPUMonitor(BaseMetricsMonitor):
    """Monitor that samples CPU utilization and system memory usage through `psutil`."""

    def collect_metrics(self):
        """Take one sample of each metric and append it to `self._metrics`."""
        utilization = psutil.cpu_percent()
        self._metrics["cpu_utilization_percentage"].append(utilization)

        vmem = psutil.virtual_memory()
        # Memory usage is recorded both in megabytes (bytes / 1e6) and as a share of the total.
        self._metrics["system_memory_usage_megabytes"].append(vmem.used / 1e6)
        self._metrics["system_memory_usage_percentage"].append(vmem.used / vmem.total * 100)

    def aggregate_metrics(self):
        """Return the mean of every collected metric, rounded to one decimal place."""
        averages = {}
        for name, samples in self._metrics.items():
            averages[name] = round(sum(samples) / len(samples), 1)
        return averages
|
@@ -0,0 +1,21 @@
|
|
1
|
+
"""Class for monitoring disk stats."""
|
2
|
+
|
3
|
+
import os
|
4
|
+
|
5
|
+
import psutil
|
6
|
+
|
7
|
+
from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor
|
8
|
+
|
9
|
+
|
10
|
+
class DiskMonitor(BaseMetricsMonitor):
    """Monitor that samples disk usage of the path `os.sep` through `psutil`."""

    def collect_metrics(self):
        """Take one disk usage sample and append it to `self._metrics`."""
        usage = psutil.disk_usage(os.sep)
        # Byte counts are converted to megabytes (bytes / 1e6) before being stored.
        sample = (
            ("disk_usage_percentage", usage.percent),
            ("disk_usage_megabytes", usage.used / 1e6),
            ("disk_available_megabytes", usage.free / 1e6),
        )
        for name, value in sample:
            self._metrics[name].append(value)

    def aggregate_metrics(self):
        """Return the mean of every collected metric, rounded to one decimal place."""
        averages = {}
        for name, samples in self._metrics.items():
            averages[name] = round(sum(samples) / len(samples), 1)
        return averages
|
@@ -0,0 +1,71 @@
|
|
1
|
+
"""Class for monitoring GPU stats."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import sys
|
5
|
+
|
6
|
+
from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor
|
7
|
+
|
8
|
+
_logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
try:
|
11
|
+
import pynvml
|
12
|
+
except ImportError:
|
13
|
+
# If `pynvml` is not installed, a warning will be logged at monitor instantiation.
|
14
|
+
# We don't log a warning here to avoid spamming warning at every import.
|
15
|
+
pass
|
16
|
+
|
17
|
+
|
18
|
+
class GPUMonitor(BaseMetricsMonitor):
    """Class for monitoring GPU stats through NVML (`pynvml`).

    Raises:
        ImportError: If `pynvml` could not be imported.
        RuntimeError: If NVML fails to initialize (`nvmlInit()` fails if no GPU is found). The
            underlying `NVMLError` is attached as the cause.
    """

    def __init__(self):
        if "pynvml" not in sys.modules:
            # Only instantiate if `pynvml` is installed.
            raise ImportError(
                "`pynvml` is not installed, to log GPU metrics please run `pip install pynvml` "
                "to install it."
            )
        try:
            # `nvmlInit()` will fail if no GPU is found.
            pynvml.nvmlInit()
        except pynvml.NVMLError as e:
            # Chain the NVML error explicitly so the root cause shows up in tracebacks.
            raise RuntimeError(f"Failed to initialize NVML, skip logging GPU metrics: {e}") from e

        super().__init__()
        self.num_gpus = pynvml.nvmlDeviceGetCount()
        self.gpu_handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(self.num_gpus)]

    def collect_metrics(self):
        """Sample memory, utilization and power metrics for every GPU handle.

        The three metric groups are queried independently: an `NVMLError` raised for one group
        is logged as a warning and the remaining groups are still collected. A percentage is
        only recorded when its denominator (total memory / enforced power limit) is non-zero,
        so a zero reading cannot raise `ZeroDivisionError` out of this method or leave an empty
        sample list behind for `aggregate_metrics`.
        """
        for i, handle in enumerate(self.gpu_handles):
            try:
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                if memory.total:
                    self._metrics[f"gpu_{i}_memory_usage_percentage"].append(
                        round(memory.used / memory.total * 100, 1)
                    )
                self._metrics[f"gpu_{i}_memory_usage_megabytes"].append(memory.used / 1e6)
            except pynvml.NVMLError as e:
                _logger.warning(f"Encountered error {e} when trying to collect GPU memory metrics.")

            try:
                device_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self._metrics[f"gpu_{i}_utilization_percentage"].append(device_utilization.gpu)
            except pynvml.NVMLError as e:
                _logger.warning(
                    f"Encountered error {e} when trying to collect GPU utilization metrics."
                )

            try:
                power_milliwatts = pynvml.nvmlDeviceGetPowerUsage(handle)
                power_capacity_milliwatts = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
                self._metrics[f"gpu_{i}_power_usage_watts"].append(power_milliwatts / 1000)
                if power_capacity_milliwatts:
                    self._metrics[f"gpu_{i}_power_usage_percentage"].append(
                        (power_milliwatts / power_capacity_milliwatts) * 100
                    )
            except pynvml.NVMLError as e:
                _logger.warning(
                    f"Encountered error {e} when trying to collect GPU power usage metrics."
                )

    def aggregate_metrics(self):
        """Return the mean of every collected metric, rounded to one decimal place."""
        return {k: round(sum(v) / len(v), 1) for k, v in self._metrics.items()}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
"""Class for monitoring network stats."""
|
2
|
+
|
3
|
+
import psutil
|
4
|
+
|
5
|
+
from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor
|
6
|
+
|
7
|
+
|
8
|
+
class NetworkMonitor(BaseMetricsMonitor):
    """Monitor that reports network traffic relative to the moment the monitor was created."""

    def __init__(self):
        super().__init__()
        self._set_initial_metrics()

    def _set_initial_metrics(self):
        """Record the baseline counters that later samples are measured against."""
        # `psutil.net_io_counters()` counts traffic since system boot. Keeping the values seen
        # at start-up lets the reported usage begin at 0 when logging starts.
        counters = psutil.net_io_counters()
        self._initial_receive_megabytes = counters.bytes_recv / 1e6
        self._initial_transmit_megabytes = counters.bytes_sent / 1e6

    def collect_metrics(self):
        """Store the megabytes received/transmitted since the baseline was taken.

        Unlike the averaging monitors, each metric holds a single value that is overwritten on
        every call rather than appended to a list.
        """
        counters = psutil.net_io_counters()
        received = counters.bytes_recv / 1e6 - self._initial_receive_megabytes
        transmitted = counters.bytes_sent / 1e6 - self._initial_transmit_megabytes
        self._metrics["network_receive_megabytes"] = received
        self._metrics["network_transmit_megabytes"] = transmitted

    def aggregate_metrics(self):
        """Return the latest values as a plain dict; network metrics are not averaged."""
        return {name: value for name, value in self._metrics.items()}
|
@@ -0,0 +1,123 @@
|
|
1
|
+
"""Class for monitoring GPU stats on HIP devices.
|
2
|
+
Inspired by GPUMonitor, but with the pynvml method
|
3
|
+
named replaced by pyrsmi method names
|
4
|
+
"""
|
5
|
+
|
6
|
+
import contextlib
|
7
|
+
import io
|
8
|
+
import logging
|
9
|
+
import sys
|
10
|
+
|
11
|
+
from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor
|
12
|
+
|
13
|
+
_logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
is_rocml_available = False
|
16
|
+
try:
|
17
|
+
from pyrsmi import rocml
|
18
|
+
|
19
|
+
is_rocml_available = True
|
20
|
+
except ImportError:
|
21
|
+
# If `pyrsmi` is not installed, a warning will be logged at monitor instantiation.
|
22
|
+
# We don't log a warning here to avoid spamming warning at every import.
|
23
|
+
pass
|
24
|
+
|
25
|
+
|
26
|
+
class ROCMMonitor(BaseMetricsMonitor):
    """Class for monitoring AMD GPU stats.

    This class is modeled on the original GPUMonitor class written by MLflow, with the pynvml
    calls replaced by pyrsmi calls. pyrsmi is an official ROCm Python package for tracking and
    monitoring AMD GPUs. This monitor has been tested on AMD MI250x 128GB GPUs.

    For more information see:
    https://github.com/ROCm/pyrsmi

    PyPi package:
    https://pypi.org/project/pyrsmi/

    Raises:
        ImportError: If `pyrsmi` could not be imported.
        RuntimeError: If RSMI fails to initialize, or if the very first device reports an error
            when queried for power (so no physical GPU is known to fall back on).
    """

    def __init__(self):
        if "pyrsmi" not in sys.modules:
            # Only instantiate if `pyrsmi` is installed.
            raise ImportError(
                "`pyrsmi` is not installed, to log GPU metrics please run `pip install pyrsmi` "
                "to install it."
            )

        try:
            rocml.smi_initialize()
        except RuntimeError as e:
            raise RuntimeError("Failed to initialize RSMI, skip logging GPU metrics") from e

        # Remember that RSMI is up, so `__del__` only shuts down what was actually initialized.
        self._rsmi_initialized = True

        super().__init__()

        # Check if GPU is virtual. If so, collect power information from physical GPU
        self.physical_idx = []
        for i in range(rocml.smi_get_device_count()):
            try:
                self.raise_error(rocml.smi_get_device_average_power, i)
            except SystemError as e:
                # virtual if error is raised
                # all virtual GPUs must share physical GPU with previous virtual/physical GPU
                if not self.physical_idx:
                    # Explicit check instead of `assert`, which is stripped under `python -O`.
                    raise RuntimeError(
                        "Failed to read power from the first GPU, so no physical GPU is "
                        "available to map virtual GPUs to."
                    ) from e
                self.physical_idx.append(self.physical_idx[-1])
            else:
                # physical GPU if no error is raised
                self.physical_idx.append(i)

    @staticmethod
    def raise_error(func, *args, **kwargs):
        """Raise error if message containing 'error' is printed out to stdout or stderr.

        Args:
            func: Callable to invoke with stdout and stderr captured.
            *args: Positional arguments forwarded to `func`.
            **kwargs: Keyword arguments forwarded to `func`.

        Raises:
            SystemError: If the captured stdout or stderr contains "error" (case-insensitive).
                The captured text is used as the exception message.
        """
        stdout = io.StringIO()
        stderr = io.StringIO()

        with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
            func(*args, **kwargs)

        out = stdout.getvalue()
        err = stderr.getvalue()

        # Check if there is an error message in either stdout or stderr
        if "error" in out.lower():
            raise SystemError(out)
        if "error" in err.lower():
            raise SystemError(err)

    def collect_metrics(self):
        """Sample memory, utilization and power metrics for every device reported by RSMI."""
        # Get GPU metrics.
        self.num_gpus = rocml.smi_get_device_count()

        for i in range(self.num_gpus):
            memory_used = rocml.smi_get_device_memory_used(i)
            memory_total = rocml.smi_get_device_memory_total(i)
            self._metrics[f"gpu_{i}_memory_usage_percentage"].append(
                round(memory_used / memory_total * 100, 1)
            )
            self._metrics[f"gpu_{i}_memory_usage_gigabytes"].append(memory_used / 1e9)

            device_utilization = rocml.smi_get_device_utilization(i)
            self._metrics[f"gpu_{i}_utilization_percentage"].append(device_utilization)

            # Power is read from the physical GPU backing device `i` (see `__init__`).
            power_watts = rocml.smi_get_device_average_power(self.physical_idx[i])
            power_capacity_watts = 500  # hard coded for now, should get this from rocm-smi
            self._metrics[f"gpu_{i}_power_usage_watts"].append(power_watts)
            self._metrics[f"gpu_{i}_power_usage_percentage"].append(
                (power_watts / power_capacity_watts) * 100
            )

            # TODO:
            # memory_busy (and other useful metrics) are available in pyrsmi>1.1.0.
            # We are currently on pyrsmi==1.0.1, so these are not available
            # memory_busy = rocml.smi_get_device_memory_busy(i)
            # self._metrics[f"gpu_{i}_memory_busy_time_percent"].append(memory_busy)

    def aggregate_metrics(self):
        """Return the mean of every collected metric, rounded to one decimal place."""
        return {k: round(sum(v) / len(v), 1) for k, v in self._metrics.items()}

    def __del__(self):
        # `__del__` also runs for instances whose `__init__` raised before RSMI was initialized;
        # in that case the attribute is missing and there is nothing to shut down.
        if is_rocml_available and getattr(self, "_rsmi_initialized", False):
            rocml.smi_shutdown()
|
@@ -0,0 +1,198 @@
|
|
1
|
+
"""Class for monitoring system stats."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import threading
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
from mlflow.environment_variables import (
|
8
|
+
MLFLOW_SYSTEM_METRICS_NODE_ID,
|
9
|
+
MLFLOW_SYSTEM_METRICS_SAMPLES_BEFORE_LOGGING,
|
10
|
+
MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL,
|
11
|
+
)
|
12
|
+
from mlflow.exceptions import MlflowException
|
13
|
+
from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor
|
14
|
+
from mlflow.system_metrics.metrics.cpu_monitor import CPUMonitor
|
15
|
+
from mlflow.system_metrics.metrics.disk_monitor import DiskMonitor
|
16
|
+
from mlflow.system_metrics.metrics.gpu_monitor import GPUMonitor
|
17
|
+
from mlflow.system_metrics.metrics.network_monitor import NetworkMonitor
|
18
|
+
from mlflow.system_metrics.metrics.rocm_monitor import ROCMMonitor
|
19
|
+
|
20
|
+
_logger = logging.getLogger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
class SystemMetricsMonitor:
    """Class for monitoring system stats.

    This class is used for pulling system metrics and logging them to MLflow. Calling `start()` will
    spawn a thread that logs system metrics periodically. Calling `finish()` will stop the thread.
    Logging is done on a different frequency from pulling metrics, so that the metrics are
    aggregated over the period. Users can change the logging frequency by setting
    `MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL` and `MLFLOW_SYSTEM_METRICS_SAMPLES_BEFORE_LOGGING`
    environment variables, e.g., run `export MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL=10` in terminal
    will set the sampling interval to 10 seconds.

    System metrics are logged with a prefix "system/", e.g., "system/cpu_utilization_percentage".

    Args:
        run_id: string, the MLflow run ID.
        sampling_interval: float, default to 10. The interval (in seconds) at which to pull system
            metrics. Will be overridden by `MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL` environment
            variable.
        samples_before_logging: int, default to 1. The number of samples to aggregate before
            logging. Will be overridden by `MLFLOW_SYSTEM_METRICS_SAMPLES_BEFORE_LOGGING`
            environment variable.
        resume_logging: bool, default to False. If True, we will resume the system metrics logging
            from the `run_id`, and the first step to log will be the last step of `run_id` + 1, if
            False, system metrics logging will start from step 0.
        node_id: string, default to None. The node ID of the machine where the metrics are
            collected. Will be overridden by `MLFLOW_SYSTEM_METRICS_NODE_ID`
            environment variable. This is useful in multi-node training to distinguish the metrics
            from different nodes. For example, if you set node_id to "node_0", the system metrics
            getting logged will be of format "system/node_0/cpu_utilization_percentage".
    """

    def __init__(
        self,
        run_id,
        sampling_interval=10,
        samples_before_logging=1,
        resume_logging=False,
        node_id=None,
    ):
        from mlflow.utils.autologging_utils import BatchMetricsLogger

        # Instantiate default monitors.
        self.monitors = [CPUMonitor(), DiskMonitor(), NetworkMonitor()]

        if gpu_monitor := self._initialize_gpu_monitor():
            self.monitors.append(gpu_monitor)

        # A truthy environment-variable value takes precedence over the constructor argument.
        self.sampling_interval = MLFLOW_SYSTEM_METRICS_SAMPLING_INTERVAL.get() or sampling_interval
        self.samples_before_logging = (
            MLFLOW_SYSTEM_METRICS_SAMPLES_BEFORE_LOGGING.get() or samples_before_logging
        )

        self._run_id = run_id
        self.mlflow_logger = BatchMetricsLogger(self._run_id)
        self._shutdown_event = threading.Event()
        self._process = None
        self._metrics_prefix = "system/"
        self.node_id = MLFLOW_SYSTEM_METRICS_NODE_ID.get() or node_id
        self._logging_step = self._get_next_logging_step(run_id) if resume_logging else 0

    def _get_next_logging_step(self, run_id):
        """Return the step at which system metrics logging should resume for `run_id`.

        Looks up the first metric of the run whose name starts with the system prefix and
        returns the step of the last entry of its history plus one. Returns 0 when the run
        cannot be fetched, when it has no system metric, or when the history is empty.
        """
        from mlflow.tracking.client import MlflowClient

        client = MlflowClient()
        try:
            run = client.get_run(run_id)
        except MlflowException:
            return 0
        system_metric_name = None
        for metric_name in run.data.metrics.keys():
            if metric_name.startswith(self._metrics_prefix):
                system_metric_name = metric_name
                break
        if system_metric_name is None:
            return 0
        metric_history = client.get_metric_history(run_id, system_metric_name)
        if not metric_history:
            # Nothing recorded for this metric: start from scratch instead of raising IndexError.
            return 0
        return metric_history[-1].step + 1

    def start(self):
        """Start monitoring system metrics."""
        try:
            self._process = threading.Thread(
                target=self.monitor,
                daemon=True,
                name="SystemMetricsMonitor",
            )
            self._process.start()
            _logger.info("Started monitoring system metrics.")
        except Exception as e:
            _logger.warning(f"Failed to start monitoring system metrics: {e}")
            self._process = None

    def monitor(self):
        """Main monitoring loop, which continuously collects and logs system metrics.

        The loop exits when the shutdown event is set, when the run cannot be fetched or is no
        longer RUNNING, or when publishing the metrics fails.
        """
        from mlflow.tracking.fluent import get_run

        while not self._shutdown_event.is_set():
            for _ in range(self.samples_before_logging):
                self.collect_metrics()
                # `Event.wait` returns True once the event is set. Stop right away in that
                # case: samples collected after shutdown are never published, so there is no
                # point in taking the remaining ones or querying the run.
                if self._shutdown_event.wait(self.sampling_interval):
                    return
            try:
                # Get the MLflow run to check if the run is not RUNNING.
                run = get_run(self._run_id)
            except Exception as e:
                _logger.warning(f"Failed to get mlflow run: {e}.")
                return
            if run.info.status != "RUNNING" or self._shutdown_event.is_set():
                # If the mlflow run is terminated or receives the shutdown signal, stop
                # monitoring.
                return
            metrics = self.aggregate_metrics()
            try:
                self.publish_metrics(metrics)
            except Exception as e:
                _logger.warning(
                    f"Failed to log system metrics: {e}, this is expected if the experiment/run is "
                    "already terminated."
                )
                return

    def collect_metrics(self):
        """Collect system metrics.

        Returns:
            A dict merging the raw (not yet aggregated) metrics of all monitors.
        """
        metrics = {}
        for monitor in self.monitors:
            monitor.collect_metrics()
            metrics.update(monitor.metrics)
        return metrics

    def aggregate_metrics(self):
        """Aggregate collected metrics.

        Returns:
            A dict merging the aggregated metrics of all monitors.
        """
        metrics = {}
        for monitor in self.monitors:
            metrics.update(monitor.aggregate_metrics())
        return metrics

    def publish_metrics(self, metrics):
        """Log collected metrics to MLflow.

        Records `metrics` at the current logging step, advances the step and clears the
        collected samples of every monitor.
        """
        # Add prefix "system/" to the metrics name for grouping. If `self.node_id` is not None, also
        # add it to the metrics name.
        prefix = self._metrics_prefix + (self.node_id + "/" if self.node_id else "")
        metrics = {prefix + k: v for k, v in metrics.items()}

        self.mlflow_logger.record_metrics(metrics, self._logging_step)
        self._logging_step += 1
        for monitor in self.monitors:
            monitor.clear_metrics()

    def finish(self):
        """Stop monitoring system metrics."""
        if self._process is None:
            return
        _logger.info("Stopping system metrics monitoring...")
        self._shutdown_event.set()
        try:
            self._process.join()
            self.mlflow_logger.flush()
            _logger.info("Successfully terminated system metrics monitoring!")
        except Exception as e:
            _logger.error(f"Error terminating system metrics monitoring process: {e}.")
        self._process = None

    def _initialize_gpu_monitor(self) -> Optional[BaseMetricsMonitor]:
        """Return a GPU monitor if one can be created, trying NVIDIA first and then AMD/HIP.

        Initialization failures are logged at DEBUG level; `None` is returned when neither
        monitor can be instantiated.
        """
        # NVIDIA GPU
        try:
            return GPUMonitor()
        except Exception:
            _logger.debug("Failed to initialize GPU monitor for NVIDIA GPU.", exc_info=True)

        # Falling back to pyrocml (AMD/HIP GPU)
        try:
            return ROCMMonitor()
        except Exception:
            _logger.debug("Failed to initialize GPU monitor for AMD/HIP GPU.", exc_info=True)

        _logger.info("Skip logging GPU metrics. Set logger level to DEBUG for more details.")
        return None
|
@@ -0,0 +1,16 @@
|
|
1
|
+
from mlflow.tracing.config import configure
|
2
|
+
from mlflow.tracing.display import disable_notebook_display, enable_notebook_display
|
3
|
+
from mlflow.tracing.provider import disable, enable, reset, set_destination
|
4
|
+
from mlflow.tracing.utils import set_span_chat_messages, set_span_chat_tools
|
5
|
+
|
6
|
+
# Names exported by `from <this package> import *`; all of them come from the imports above.
__all__ = [
    "configure", "disable", "enable",
    "disable_notebook_display", "enable_notebook_display",
    "set_span_chat_messages", "set_span_chat_tools",
    "set_destination", "reset",
]
|