genesis-flow 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (645) hide show
  1. genesis_flow-1.0.0.dist-info/METADATA +822 -0
  2. genesis_flow-1.0.0.dist-info/RECORD +645 -0
  3. genesis_flow-1.0.0.dist-info/WHEEL +5 -0
  4. genesis_flow-1.0.0.dist-info/entry_points.txt +19 -0
  5. genesis_flow-1.0.0.dist-info/licenses/LICENSE.txt +202 -0
  6. genesis_flow-1.0.0.dist-info/top_level.txt +1 -0
  7. mlflow/__init__.py +367 -0
  8. mlflow/__main__.py +3 -0
  9. mlflow/ag2/__init__.py +56 -0
  10. mlflow/ag2/ag2_logger.py +294 -0
  11. mlflow/anthropic/__init__.py +40 -0
  12. mlflow/anthropic/autolog.py +129 -0
  13. mlflow/anthropic/chat.py +144 -0
  14. mlflow/artifacts/__init__.py +268 -0
  15. mlflow/autogen/__init__.py +144 -0
  16. mlflow/autogen/chat.py +142 -0
  17. mlflow/azure/__init__.py +26 -0
  18. mlflow/azure/auth_handler.py +257 -0
  19. mlflow/azure/client.py +319 -0
  20. mlflow/azure/config.py +120 -0
  21. mlflow/azure/connection_factory.py +340 -0
  22. mlflow/azure/exceptions.py +27 -0
  23. mlflow/azure/stores.py +327 -0
  24. mlflow/azure/utils.py +183 -0
  25. mlflow/bedrock/__init__.py +45 -0
  26. mlflow/bedrock/_autolog.py +202 -0
  27. mlflow/bedrock/chat.py +122 -0
  28. mlflow/bedrock/stream.py +160 -0
  29. mlflow/bedrock/utils.py +43 -0
  30. mlflow/cli.py +707 -0
  31. mlflow/client.py +12 -0
  32. mlflow/config/__init__.py +56 -0
  33. mlflow/crewai/__init__.py +79 -0
  34. mlflow/crewai/autolog.py +253 -0
  35. mlflow/crewai/chat.py +29 -0
  36. mlflow/data/__init__.py +75 -0
  37. mlflow/data/artifact_dataset_sources.py +170 -0
  38. mlflow/data/code_dataset_source.py +40 -0
  39. mlflow/data/dataset.py +123 -0
  40. mlflow/data/dataset_registry.py +168 -0
  41. mlflow/data/dataset_source.py +110 -0
  42. mlflow/data/dataset_source_registry.py +219 -0
  43. mlflow/data/delta_dataset_source.py +167 -0
  44. mlflow/data/digest_utils.py +108 -0
  45. mlflow/data/evaluation_dataset.py +562 -0
  46. mlflow/data/filesystem_dataset_source.py +81 -0
  47. mlflow/data/http_dataset_source.py +145 -0
  48. mlflow/data/huggingface_dataset.py +258 -0
  49. mlflow/data/huggingface_dataset_source.py +118 -0
  50. mlflow/data/meta_dataset.py +104 -0
  51. mlflow/data/numpy_dataset.py +223 -0
  52. mlflow/data/pandas_dataset.py +231 -0
  53. mlflow/data/polars_dataset.py +352 -0
  54. mlflow/data/pyfunc_dataset_mixin.py +31 -0
  55. mlflow/data/schema.py +76 -0
  56. mlflow/data/sources.py +1 -0
  57. mlflow/data/spark_dataset.py +406 -0
  58. mlflow/data/spark_dataset_source.py +74 -0
  59. mlflow/data/spark_delta_utils.py +118 -0
  60. mlflow/data/tensorflow_dataset.py +350 -0
  61. mlflow/data/uc_volume_dataset_source.py +81 -0
  62. mlflow/db.py +27 -0
  63. mlflow/dspy/__init__.py +17 -0
  64. mlflow/dspy/autolog.py +197 -0
  65. mlflow/dspy/callback.py +398 -0
  66. mlflow/dspy/constant.py +1 -0
  67. mlflow/dspy/load.py +93 -0
  68. mlflow/dspy/save.py +393 -0
  69. mlflow/dspy/util.py +109 -0
  70. mlflow/dspy/wrapper.py +226 -0
  71. mlflow/entities/__init__.py +104 -0
  72. mlflow/entities/_mlflow_object.py +52 -0
  73. mlflow/entities/assessment.py +545 -0
  74. mlflow/entities/assessment_error.py +80 -0
  75. mlflow/entities/assessment_source.py +141 -0
  76. mlflow/entities/dataset.py +92 -0
  77. mlflow/entities/dataset_input.py +51 -0
  78. mlflow/entities/dataset_summary.py +62 -0
  79. mlflow/entities/document.py +48 -0
  80. mlflow/entities/experiment.py +109 -0
  81. mlflow/entities/experiment_tag.py +35 -0
  82. mlflow/entities/file_info.py +45 -0
  83. mlflow/entities/input_tag.py +35 -0
  84. mlflow/entities/lifecycle_stage.py +35 -0
  85. mlflow/entities/logged_model.py +228 -0
  86. mlflow/entities/logged_model_input.py +26 -0
  87. mlflow/entities/logged_model_output.py +32 -0
  88. mlflow/entities/logged_model_parameter.py +46 -0
  89. mlflow/entities/logged_model_status.py +74 -0
  90. mlflow/entities/logged_model_tag.py +33 -0
  91. mlflow/entities/metric.py +200 -0
  92. mlflow/entities/model_registry/__init__.py +29 -0
  93. mlflow/entities/model_registry/_model_registry_entity.py +13 -0
  94. mlflow/entities/model_registry/model_version.py +243 -0
  95. mlflow/entities/model_registry/model_version_deployment_job_run_state.py +44 -0
  96. mlflow/entities/model_registry/model_version_deployment_job_state.py +70 -0
  97. mlflow/entities/model_registry/model_version_search.py +25 -0
  98. mlflow/entities/model_registry/model_version_stages.py +25 -0
  99. mlflow/entities/model_registry/model_version_status.py +35 -0
  100. mlflow/entities/model_registry/model_version_tag.py +35 -0
  101. mlflow/entities/model_registry/prompt.py +73 -0
  102. mlflow/entities/model_registry/prompt_version.py +244 -0
  103. mlflow/entities/model_registry/registered_model.py +175 -0
  104. mlflow/entities/model_registry/registered_model_alias.py +35 -0
  105. mlflow/entities/model_registry/registered_model_deployment_job_state.py +39 -0
  106. mlflow/entities/model_registry/registered_model_search.py +25 -0
  107. mlflow/entities/model_registry/registered_model_tag.py +35 -0
  108. mlflow/entities/multipart_upload.py +74 -0
  109. mlflow/entities/param.py +49 -0
  110. mlflow/entities/run.py +97 -0
  111. mlflow/entities/run_data.py +84 -0
  112. mlflow/entities/run_info.py +188 -0
  113. mlflow/entities/run_inputs.py +59 -0
  114. mlflow/entities/run_outputs.py +43 -0
  115. mlflow/entities/run_status.py +41 -0
  116. mlflow/entities/run_tag.py +36 -0
  117. mlflow/entities/source_type.py +31 -0
  118. mlflow/entities/span.py +774 -0
  119. mlflow/entities/span_event.py +96 -0
  120. mlflow/entities/span_status.py +102 -0
  121. mlflow/entities/trace.py +317 -0
  122. mlflow/entities/trace_data.py +71 -0
  123. mlflow/entities/trace_info.py +220 -0
  124. mlflow/entities/trace_info_v2.py +162 -0
  125. mlflow/entities/trace_location.py +173 -0
  126. mlflow/entities/trace_state.py +39 -0
  127. mlflow/entities/trace_status.py +68 -0
  128. mlflow/entities/view_type.py +51 -0
  129. mlflow/environment_variables.py +866 -0
  130. mlflow/evaluation/__init__.py +16 -0
  131. mlflow/evaluation/assessment.py +369 -0
  132. mlflow/evaluation/evaluation.py +411 -0
  133. mlflow/evaluation/evaluation_tag.py +61 -0
  134. mlflow/evaluation/fluent.py +48 -0
  135. mlflow/evaluation/utils.py +201 -0
  136. mlflow/exceptions.py +213 -0
  137. mlflow/experiments.py +140 -0
  138. mlflow/gemini/__init__.py +81 -0
  139. mlflow/gemini/autolog.py +186 -0
  140. mlflow/gemini/chat.py +261 -0
  141. mlflow/genai/__init__.py +71 -0
  142. mlflow/genai/datasets/__init__.py +67 -0
  143. mlflow/genai/datasets/evaluation_dataset.py +131 -0
  144. mlflow/genai/evaluation/__init__.py +3 -0
  145. mlflow/genai/evaluation/base.py +411 -0
  146. mlflow/genai/evaluation/constant.py +23 -0
  147. mlflow/genai/evaluation/utils.py +244 -0
  148. mlflow/genai/judges/__init__.py +21 -0
  149. mlflow/genai/judges/databricks.py +404 -0
  150. mlflow/genai/label_schemas/__init__.py +153 -0
  151. mlflow/genai/label_schemas/label_schemas.py +209 -0
  152. mlflow/genai/labeling/__init__.py +159 -0
  153. mlflow/genai/labeling/labeling.py +250 -0
  154. mlflow/genai/optimize/__init__.py +13 -0
  155. mlflow/genai/optimize/base.py +198 -0
  156. mlflow/genai/optimize/optimizers/__init__.py +4 -0
  157. mlflow/genai/optimize/optimizers/base_optimizer.py +38 -0
  158. mlflow/genai/optimize/optimizers/dspy_mipro_optimizer.py +221 -0
  159. mlflow/genai/optimize/optimizers/dspy_optimizer.py +91 -0
  160. mlflow/genai/optimize/optimizers/utils/dspy_mipro_callback.py +76 -0
  161. mlflow/genai/optimize/optimizers/utils/dspy_mipro_utils.py +18 -0
  162. mlflow/genai/optimize/types.py +75 -0
  163. mlflow/genai/optimize/util.py +30 -0
  164. mlflow/genai/prompts/__init__.py +206 -0
  165. mlflow/genai/scheduled_scorers.py +431 -0
  166. mlflow/genai/scorers/__init__.py +26 -0
  167. mlflow/genai/scorers/base.py +492 -0
  168. mlflow/genai/scorers/builtin_scorers.py +765 -0
  169. mlflow/genai/scorers/scorer_utils.py +138 -0
  170. mlflow/genai/scorers/validation.py +165 -0
  171. mlflow/genai/utils/data_validation.py +146 -0
  172. mlflow/genai/utils/enum_utils.py +23 -0
  173. mlflow/genai/utils/trace_utils.py +211 -0
  174. mlflow/groq/__init__.py +42 -0
  175. mlflow/groq/_groq_autolog.py +74 -0
  176. mlflow/johnsnowlabs/__init__.py +888 -0
  177. mlflow/langchain/__init__.py +24 -0
  178. mlflow/langchain/api_request_parallel_processor.py +330 -0
  179. mlflow/langchain/autolog.py +147 -0
  180. mlflow/langchain/chat_agent_langgraph.py +340 -0
  181. mlflow/langchain/constant.py +1 -0
  182. mlflow/langchain/constants.py +1 -0
  183. mlflow/langchain/databricks_dependencies.py +444 -0
  184. mlflow/langchain/langchain_tracer.py +597 -0
  185. mlflow/langchain/model.py +919 -0
  186. mlflow/langchain/output_parsers.py +142 -0
  187. mlflow/langchain/retriever_chain.py +153 -0
  188. mlflow/langchain/runnables.py +527 -0
  189. mlflow/langchain/utils/chat.py +402 -0
  190. mlflow/langchain/utils/logging.py +671 -0
  191. mlflow/langchain/utils/serialization.py +36 -0
  192. mlflow/legacy_databricks_cli/__init__.py +0 -0
  193. mlflow/legacy_databricks_cli/configure/__init__.py +0 -0
  194. mlflow/legacy_databricks_cli/configure/provider.py +482 -0
  195. mlflow/litellm/__init__.py +175 -0
  196. mlflow/llama_index/__init__.py +22 -0
  197. mlflow/llama_index/autolog.py +55 -0
  198. mlflow/llama_index/chat.py +43 -0
  199. mlflow/llama_index/constant.py +1 -0
  200. mlflow/llama_index/model.py +577 -0
  201. mlflow/llama_index/pyfunc_wrapper.py +332 -0
  202. mlflow/llama_index/serialize_objects.py +188 -0
  203. mlflow/llama_index/tracer.py +561 -0
  204. mlflow/metrics/__init__.py +479 -0
  205. mlflow/metrics/base.py +39 -0
  206. mlflow/metrics/genai/__init__.py +25 -0
  207. mlflow/metrics/genai/base.py +101 -0
  208. mlflow/metrics/genai/genai_metric.py +771 -0
  209. mlflow/metrics/genai/metric_definitions.py +450 -0
  210. mlflow/metrics/genai/model_utils.py +371 -0
  211. mlflow/metrics/genai/prompt_template.py +68 -0
  212. mlflow/metrics/genai/prompts/__init__.py +0 -0
  213. mlflow/metrics/genai/prompts/v1.py +422 -0
  214. mlflow/metrics/genai/utils.py +6 -0
  215. mlflow/metrics/metric_definitions.py +619 -0
  216. mlflow/mismatch.py +34 -0
  217. mlflow/mistral/__init__.py +34 -0
  218. mlflow/mistral/autolog.py +71 -0
  219. mlflow/mistral/chat.py +135 -0
  220. mlflow/ml_package_versions.py +452 -0
  221. mlflow/models/__init__.py +97 -0
  222. mlflow/models/auth_policy.py +83 -0
  223. mlflow/models/cli.py +354 -0
  224. mlflow/models/container/__init__.py +294 -0
  225. mlflow/models/container/scoring_server/__init__.py +0 -0
  226. mlflow/models/container/scoring_server/nginx.conf +39 -0
  227. mlflow/models/dependencies_schemas.py +287 -0
  228. mlflow/models/display_utils.py +158 -0
  229. mlflow/models/docker_utils.py +211 -0
  230. mlflow/models/evaluation/__init__.py +23 -0
  231. mlflow/models/evaluation/_shap_patch.py +64 -0
  232. mlflow/models/evaluation/artifacts.py +194 -0
  233. mlflow/models/evaluation/base.py +1811 -0
  234. mlflow/models/evaluation/calibration_curve.py +109 -0
  235. mlflow/models/evaluation/default_evaluator.py +996 -0
  236. mlflow/models/evaluation/deprecated.py +23 -0
  237. mlflow/models/evaluation/evaluator_registry.py +80 -0
  238. mlflow/models/evaluation/evaluators/classifier.py +704 -0
  239. mlflow/models/evaluation/evaluators/default.py +233 -0
  240. mlflow/models/evaluation/evaluators/regressor.py +96 -0
  241. mlflow/models/evaluation/evaluators/shap.py +296 -0
  242. mlflow/models/evaluation/lift_curve.py +178 -0
  243. mlflow/models/evaluation/utils/metric.py +123 -0
  244. mlflow/models/evaluation/utils/trace.py +179 -0
  245. mlflow/models/evaluation/validation.py +434 -0
  246. mlflow/models/flavor_backend.py +93 -0
  247. mlflow/models/flavor_backend_registry.py +53 -0
  248. mlflow/models/model.py +1639 -0
  249. mlflow/models/model_config.py +150 -0
  250. mlflow/models/notebook_resources/agent_evaluation_template.html +235 -0
  251. mlflow/models/notebook_resources/eval_with_dataset_example.py +22 -0
  252. mlflow/models/notebook_resources/eval_with_synthetic_example.py +22 -0
  253. mlflow/models/python_api.py +369 -0
  254. mlflow/models/rag_signatures.py +128 -0
  255. mlflow/models/resources.py +321 -0
  256. mlflow/models/signature.py +662 -0
  257. mlflow/models/utils.py +2054 -0
  258. mlflow/models/wheeled_model.py +280 -0
  259. mlflow/openai/__init__.py +57 -0
  260. mlflow/openai/_agent_tracer.py +364 -0
  261. mlflow/openai/api_request_parallel_processor.py +131 -0
  262. mlflow/openai/autolog.py +509 -0
  263. mlflow/openai/constant.py +1 -0
  264. mlflow/openai/model.py +824 -0
  265. mlflow/openai/utils/chat_schema.py +367 -0
  266. mlflow/optuna/__init__.py +3 -0
  267. mlflow/optuna/storage.py +646 -0
  268. mlflow/plugins/__init__.py +72 -0
  269. mlflow/plugins/base.py +358 -0
  270. mlflow/plugins/builtin/__init__.py +24 -0
  271. mlflow/plugins/builtin/pytorch_plugin.py +150 -0
  272. mlflow/plugins/builtin/sklearn_plugin.py +158 -0
  273. mlflow/plugins/builtin/transformers_plugin.py +187 -0
  274. mlflow/plugins/cli.py +321 -0
  275. mlflow/plugins/discovery.py +340 -0
  276. mlflow/plugins/manager.py +465 -0
  277. mlflow/plugins/registry.py +316 -0
  278. mlflow/plugins/templates/framework_plugin_template.py +329 -0
  279. mlflow/prompt/constants.py +20 -0
  280. mlflow/prompt/promptlab_model.py +197 -0
  281. mlflow/prompt/registry_utils.py +248 -0
  282. mlflow/promptflow/__init__.py +495 -0
  283. mlflow/protos/__init__.py +0 -0
  284. mlflow/protos/assessments_pb2.py +174 -0
  285. mlflow/protos/databricks_artifacts_pb2.py +489 -0
  286. mlflow/protos/databricks_filesystem_service_pb2.py +196 -0
  287. mlflow/protos/databricks_managed_catalog_messages_pb2.py +95 -0
  288. mlflow/protos/databricks_managed_catalog_service_pb2.py +86 -0
  289. mlflow/protos/databricks_pb2.py +267 -0
  290. mlflow/protos/databricks_trace_server_pb2.py +374 -0
  291. mlflow/protos/databricks_uc_registry_messages_pb2.py +1249 -0
  292. mlflow/protos/databricks_uc_registry_service_pb2.py +170 -0
  293. mlflow/protos/facet_feature_statistics_pb2.py +296 -0
  294. mlflow/protos/internal_pb2.py +77 -0
  295. mlflow/protos/mlflow_artifacts_pb2.py +336 -0
  296. mlflow/protos/model_registry_pb2.py +1073 -0
  297. mlflow/protos/scalapb/__init__.py +0 -0
  298. mlflow/protos/scalapb/scalapb_pb2.py +104 -0
  299. mlflow/protos/service_pb2.py +2600 -0
  300. mlflow/protos/unity_catalog_oss_messages_pb2.py +457 -0
  301. mlflow/protos/unity_catalog_oss_service_pb2.py +130 -0
  302. mlflow/protos/unity_catalog_prompt_messages_pb2.py +447 -0
  303. mlflow/protos/unity_catalog_prompt_messages_pb2_grpc.py +24 -0
  304. mlflow/protos/unity_catalog_prompt_service_pb2.py +164 -0
  305. mlflow/protos/unity_catalog_prompt_service_pb2_grpc.py +785 -0
  306. mlflow/py.typed +0 -0
  307. mlflow/pydantic_ai/__init__.py +57 -0
  308. mlflow/pydantic_ai/autolog.py +173 -0
  309. mlflow/pyfunc/__init__.py +3844 -0
  310. mlflow/pyfunc/_mlflow_pyfunc_backend_predict.py +61 -0
  311. mlflow/pyfunc/backend.py +523 -0
  312. mlflow/pyfunc/context.py +78 -0
  313. mlflow/pyfunc/dbconnect_artifact_cache.py +144 -0
  314. mlflow/pyfunc/loaders/__init__.py +7 -0
  315. mlflow/pyfunc/loaders/chat_agent.py +117 -0
  316. mlflow/pyfunc/loaders/chat_model.py +125 -0
  317. mlflow/pyfunc/loaders/code_model.py +31 -0
  318. mlflow/pyfunc/loaders/responses_agent.py +112 -0
  319. mlflow/pyfunc/mlserver.py +46 -0
  320. mlflow/pyfunc/model.py +1473 -0
  321. mlflow/pyfunc/scoring_server/__init__.py +604 -0
  322. mlflow/pyfunc/scoring_server/app.py +7 -0
  323. mlflow/pyfunc/scoring_server/client.py +146 -0
  324. mlflow/pyfunc/spark_model_cache.py +48 -0
  325. mlflow/pyfunc/stdin_server.py +44 -0
  326. mlflow/pyfunc/utils/__init__.py +3 -0
  327. mlflow/pyfunc/utils/data_validation.py +224 -0
  328. mlflow/pyfunc/utils/environment.py +22 -0
  329. mlflow/pyfunc/utils/input_converter.py +47 -0
  330. mlflow/pyfunc/utils/serving_data_parser.py +11 -0
  331. mlflow/pytorch/__init__.py +1171 -0
  332. mlflow/pytorch/_lightning_autolog.py +580 -0
  333. mlflow/pytorch/_pytorch_autolog.py +50 -0
  334. mlflow/pytorch/pickle_module.py +35 -0
  335. mlflow/rfunc/__init__.py +42 -0
  336. mlflow/rfunc/backend.py +134 -0
  337. mlflow/runs.py +89 -0
  338. mlflow/server/__init__.py +302 -0
  339. mlflow/server/auth/__init__.py +1224 -0
  340. mlflow/server/auth/__main__.py +4 -0
  341. mlflow/server/auth/basic_auth.ini +6 -0
  342. mlflow/server/auth/cli.py +11 -0
  343. mlflow/server/auth/client.py +537 -0
  344. mlflow/server/auth/config.py +34 -0
  345. mlflow/server/auth/db/__init__.py +0 -0
  346. mlflow/server/auth/db/cli.py +18 -0
  347. mlflow/server/auth/db/migrations/__init__.py +0 -0
  348. mlflow/server/auth/db/migrations/alembic.ini +110 -0
  349. mlflow/server/auth/db/migrations/env.py +76 -0
  350. mlflow/server/auth/db/migrations/versions/8606fa83a998_initial_migration.py +51 -0
  351. mlflow/server/auth/db/migrations/versions/__init__.py +0 -0
  352. mlflow/server/auth/db/models.py +67 -0
  353. mlflow/server/auth/db/utils.py +37 -0
  354. mlflow/server/auth/entities.py +165 -0
  355. mlflow/server/auth/logo.py +14 -0
  356. mlflow/server/auth/permissions.py +65 -0
  357. mlflow/server/auth/routes.py +18 -0
  358. mlflow/server/auth/sqlalchemy_store.py +263 -0
  359. mlflow/server/graphql/__init__.py +0 -0
  360. mlflow/server/graphql/autogenerated_graphql_schema.py +353 -0
  361. mlflow/server/graphql/graphql_custom_scalars.py +24 -0
  362. mlflow/server/graphql/graphql_errors.py +15 -0
  363. mlflow/server/graphql/graphql_no_batching.py +89 -0
  364. mlflow/server/graphql/graphql_schema_extensions.py +74 -0
  365. mlflow/server/handlers.py +3217 -0
  366. mlflow/server/prometheus_exporter.py +17 -0
  367. mlflow/server/validation.py +30 -0
  368. mlflow/shap/__init__.py +691 -0
  369. mlflow/sklearn/__init__.py +1994 -0
  370. mlflow/sklearn/utils.py +1041 -0
  371. mlflow/smolagents/__init__.py +66 -0
  372. mlflow/smolagents/autolog.py +139 -0
  373. mlflow/smolagents/chat.py +29 -0
  374. mlflow/store/__init__.py +10 -0
  375. mlflow/store/_unity_catalog/__init__.py +1 -0
  376. mlflow/store/_unity_catalog/lineage/__init__.py +1 -0
  377. mlflow/store/_unity_catalog/lineage/constants.py +2 -0
  378. mlflow/store/_unity_catalog/registry/__init__.py +6 -0
  379. mlflow/store/_unity_catalog/registry/prompt_info.py +75 -0
  380. mlflow/store/_unity_catalog/registry/rest_store.py +1740 -0
  381. mlflow/store/_unity_catalog/registry/uc_oss_rest_store.py +507 -0
  382. mlflow/store/_unity_catalog/registry/utils.py +121 -0
  383. mlflow/store/artifact/__init__.py +0 -0
  384. mlflow/store/artifact/artifact_repo.py +472 -0
  385. mlflow/store/artifact/artifact_repository_registry.py +154 -0
  386. mlflow/store/artifact/azure_blob_artifact_repo.py +275 -0
  387. mlflow/store/artifact/azure_data_lake_artifact_repo.py +295 -0
  388. mlflow/store/artifact/cli.py +141 -0
  389. mlflow/store/artifact/cloud_artifact_repo.py +332 -0
  390. mlflow/store/artifact/databricks_artifact_repo.py +729 -0
  391. mlflow/store/artifact/databricks_artifact_repo_resources.py +301 -0
  392. mlflow/store/artifact/databricks_logged_model_artifact_repo.py +93 -0
  393. mlflow/store/artifact/databricks_models_artifact_repo.py +216 -0
  394. mlflow/store/artifact/databricks_sdk_artifact_repo.py +134 -0
  395. mlflow/store/artifact/databricks_sdk_models_artifact_repo.py +97 -0
  396. mlflow/store/artifact/dbfs_artifact_repo.py +240 -0
  397. mlflow/store/artifact/ftp_artifact_repo.py +132 -0
  398. mlflow/store/artifact/gcs_artifact_repo.py +296 -0
  399. mlflow/store/artifact/hdfs_artifact_repo.py +209 -0
  400. mlflow/store/artifact/http_artifact_repo.py +218 -0
  401. mlflow/store/artifact/local_artifact_repo.py +142 -0
  402. mlflow/store/artifact/mlflow_artifacts_repo.py +94 -0
  403. mlflow/store/artifact/models_artifact_repo.py +259 -0
  404. mlflow/store/artifact/optimized_s3_artifact_repo.py +356 -0
  405. mlflow/store/artifact/presigned_url_artifact_repo.py +173 -0
  406. mlflow/store/artifact/r2_artifact_repo.py +70 -0
  407. mlflow/store/artifact/runs_artifact_repo.py +265 -0
  408. mlflow/store/artifact/s3_artifact_repo.py +330 -0
  409. mlflow/store/artifact/sftp_artifact_repo.py +141 -0
  410. mlflow/store/artifact/uc_volume_artifact_repo.py +76 -0
  411. mlflow/store/artifact/unity_catalog_models_artifact_repo.py +168 -0
  412. mlflow/store/artifact/unity_catalog_oss_models_artifact_repo.py +168 -0
  413. mlflow/store/artifact/utils/__init__.py +0 -0
  414. mlflow/store/artifact/utils/models.py +148 -0
  415. mlflow/store/db/__init__.py +0 -0
  416. mlflow/store/db/base_sql_model.py +3 -0
  417. mlflow/store/db/db_types.py +10 -0
  418. mlflow/store/db/utils.py +314 -0
  419. mlflow/store/db_migrations/__init__.py +0 -0
  420. mlflow/store/db_migrations/alembic.ini +74 -0
  421. mlflow/store/db_migrations/env.py +84 -0
  422. mlflow/store/db_migrations/versions/0584bdc529eb_add_cascading_deletion_to_datasets_from_experiments.py +88 -0
  423. mlflow/store/db_migrations/versions/0a8213491aaa_drop_duplicate_killed_constraint.py +49 -0
  424. mlflow/store/db_migrations/versions/0c779009ac13_add_deleted_time_field_to_runs_table.py +24 -0
  425. mlflow/store/db_migrations/versions/181f10493468_allow_nulls_for_metric_values.py +35 -0
  426. mlflow/store/db_migrations/versions/27a6a02d2cf1_add_model_version_tags_table.py +38 -0
  427. mlflow/store/db_migrations/versions/2b4d017a5e9b_add_model_registry_tables_to_db.py +77 -0
  428. mlflow/store/db_migrations/versions/2d6e25af4d3e_increase_max_param_val_length.py +33 -0
  429. mlflow/store/db_migrations/versions/3500859a5d39_add_model_aliases_table.py +50 -0
  430. mlflow/store/db_migrations/versions/39d1c3be5f05_add_is_nan_constraint_for_metrics_tables_if_necessary.py +41 -0
  431. mlflow/store/db_migrations/versions/400f98739977_add_logged_model_tables.py +123 -0
  432. mlflow/store/db_migrations/versions/4465047574b1_increase_max_dataset_schema_size.py +38 -0
  433. mlflow/store/db_migrations/versions/451aebb31d03_add_metric_step.py +35 -0
  434. mlflow/store/db_migrations/versions/5b0e9adcef9c_add_cascade_deletion_to_trace_tables_fk.py +40 -0
  435. mlflow/store/db_migrations/versions/6953534de441_add_step_to_inputs_table.py +25 -0
  436. mlflow/store/db_migrations/versions/728d730b5ebd_add_registered_model_tags_table.py +38 -0
  437. mlflow/store/db_migrations/versions/7ac759974ad8_update_run_tags_with_larger_limit.py +36 -0
  438. mlflow/store/db_migrations/versions/7f2a7d5fae7d_add_datasets_inputs_input_tags_tables.py +82 -0
  439. mlflow/store/db_migrations/versions/84291f40a231_add_run_link_to_model_version.py +26 -0
  440. mlflow/store/db_migrations/versions/867495a8f9d4_add_trace_tables.py +90 -0
  441. mlflow/store/db_migrations/versions/89d4b8295536_create_latest_metrics_table.py +169 -0
  442. mlflow/store/db_migrations/versions/90e64c465722_migrate_user_column_to_tags.py +64 -0
  443. mlflow/store/db_migrations/versions/97727af70f4d_creation_time_last_update_time_experiments.py +25 -0
  444. mlflow/store/db_migrations/versions/__init__.py +0 -0
  445. mlflow/store/db_migrations/versions/a8c4a736bde6_allow_nulls_for_run_id.py +27 -0
  446. mlflow/store/db_migrations/versions/acf3f17fdcc7_add_storage_location_field_to_model_.py +29 -0
  447. mlflow/store/db_migrations/versions/bd07f7e963c5_create_index_on_run_uuid.py +26 -0
  448. mlflow/store/db_migrations/versions/bda7b8c39065_increase_model_version_tag_value_limit.py +38 -0
  449. mlflow/store/db_migrations/versions/c48cb773bb87_reset_default_value_for_is_nan_in_metrics_table_for_mysql.py +41 -0
  450. mlflow/store/db_migrations/versions/cbc13b556ace_add_v3_trace_schema_columns.py +31 -0
  451. mlflow/store/db_migrations/versions/cc1f77228345_change_param_value_length_to_500.py +34 -0
  452. mlflow/store/db_migrations/versions/cfd24bdc0731_update_run_status_constraint_with_killed.py +78 -0
  453. mlflow/store/db_migrations/versions/df50e92ffc5e_add_experiment_tags_table.py +38 -0
  454. mlflow/store/db_migrations/versions/f5a4f2784254_increase_run_tag_value_limit.py +36 -0
  455. mlflow/store/entities/__init__.py +3 -0
  456. mlflow/store/entities/paged_list.py +18 -0
  457. mlflow/store/model_registry/__init__.py +10 -0
  458. mlflow/store/model_registry/abstract_store.py +1081 -0
  459. mlflow/store/model_registry/base_rest_store.py +44 -0
  460. mlflow/store/model_registry/databricks_workspace_model_registry_rest_store.py +37 -0
  461. mlflow/store/model_registry/dbmodels/__init__.py +0 -0
  462. mlflow/store/model_registry/dbmodels/models.py +206 -0
  463. mlflow/store/model_registry/file_store.py +1091 -0
  464. mlflow/store/model_registry/rest_store.py +481 -0
  465. mlflow/store/model_registry/sqlalchemy_store.py +1286 -0
  466. mlflow/store/tracking/__init__.py +23 -0
  467. mlflow/store/tracking/abstract_store.py +816 -0
  468. mlflow/store/tracking/dbmodels/__init__.py +0 -0
  469. mlflow/store/tracking/dbmodels/initial_models.py +243 -0
  470. mlflow/store/tracking/dbmodels/models.py +1073 -0
  471. mlflow/store/tracking/file_store.py +2438 -0
  472. mlflow/store/tracking/postgres_managed_identity.py +146 -0
  473. mlflow/store/tracking/rest_store.py +1131 -0
  474. mlflow/store/tracking/sqlalchemy_store.py +2785 -0
  475. mlflow/system_metrics/__init__.py +61 -0
  476. mlflow/system_metrics/metrics/__init__.py +0 -0
  477. mlflow/system_metrics/metrics/base_metrics_monitor.py +32 -0
  478. mlflow/system_metrics/metrics/cpu_monitor.py +23 -0
  479. mlflow/system_metrics/metrics/disk_monitor.py +21 -0
  480. mlflow/system_metrics/metrics/gpu_monitor.py +71 -0
  481. mlflow/system_metrics/metrics/network_monitor.py +34 -0
  482. mlflow/system_metrics/metrics/rocm_monitor.py +123 -0
  483. mlflow/system_metrics/system_metrics_monitor.py +198 -0
  484. mlflow/tracing/__init__.py +16 -0
  485. mlflow/tracing/assessment.py +356 -0
  486. mlflow/tracing/client.py +531 -0
  487. mlflow/tracing/config.py +125 -0
  488. mlflow/tracing/constant.py +105 -0
  489. mlflow/tracing/destination.py +81 -0
  490. mlflow/tracing/display/__init__.py +40 -0
  491. mlflow/tracing/display/display_handler.py +196 -0
  492. mlflow/tracing/export/async_export_queue.py +186 -0
  493. mlflow/tracing/export/inference_table.py +138 -0
  494. mlflow/tracing/export/mlflow_v3.py +137 -0
  495. mlflow/tracing/export/utils.py +70 -0
  496. mlflow/tracing/fluent.py +1417 -0
  497. mlflow/tracing/processor/base_mlflow.py +199 -0
  498. mlflow/tracing/processor/inference_table.py +175 -0
  499. mlflow/tracing/processor/mlflow_v3.py +47 -0
  500. mlflow/tracing/processor/otel.py +73 -0
  501. mlflow/tracing/provider.py +487 -0
  502. mlflow/tracing/trace_manager.py +200 -0
  503. mlflow/tracing/utils/__init__.py +616 -0
  504. mlflow/tracing/utils/artifact_utils.py +28 -0
  505. mlflow/tracing/utils/copy.py +55 -0
  506. mlflow/tracing/utils/environment.py +55 -0
  507. mlflow/tracing/utils/exception.py +21 -0
  508. mlflow/tracing/utils/once.py +35 -0
  509. mlflow/tracing/utils/otlp.py +63 -0
  510. mlflow/tracing/utils/processor.py +54 -0
  511. mlflow/tracing/utils/search.py +292 -0
  512. mlflow/tracing/utils/timeout.py +250 -0
  513. mlflow/tracing/utils/token.py +19 -0
  514. mlflow/tracing/utils/truncation.py +124 -0
  515. mlflow/tracing/utils/warning.py +76 -0
  516. mlflow/tracking/__init__.py +39 -0
  517. mlflow/tracking/_model_registry/__init__.py +1 -0
  518. mlflow/tracking/_model_registry/client.py +764 -0
  519. mlflow/tracking/_model_registry/fluent.py +853 -0
  520. mlflow/tracking/_model_registry/registry.py +67 -0
  521. mlflow/tracking/_model_registry/utils.py +251 -0
  522. mlflow/tracking/_tracking_service/__init__.py +0 -0
  523. mlflow/tracking/_tracking_service/client.py +883 -0
  524. mlflow/tracking/_tracking_service/registry.py +56 -0
  525. mlflow/tracking/_tracking_service/utils.py +275 -0
  526. mlflow/tracking/artifact_utils.py +179 -0
  527. mlflow/tracking/client.py +5900 -0
  528. mlflow/tracking/context/__init__.py +0 -0
  529. mlflow/tracking/context/abstract_context.py +35 -0
  530. mlflow/tracking/context/databricks_cluster_context.py +15 -0
  531. mlflow/tracking/context/databricks_command_context.py +15 -0
  532. mlflow/tracking/context/databricks_job_context.py +49 -0
  533. mlflow/tracking/context/databricks_notebook_context.py +41 -0
  534. mlflow/tracking/context/databricks_repo_context.py +43 -0
  535. mlflow/tracking/context/default_context.py +51 -0
  536. mlflow/tracking/context/git_context.py +32 -0
  537. mlflow/tracking/context/registry.py +98 -0
  538. mlflow/tracking/context/system_environment_context.py +15 -0
  539. mlflow/tracking/default_experiment/__init__.py +1 -0
  540. mlflow/tracking/default_experiment/abstract_context.py +43 -0
  541. mlflow/tracking/default_experiment/databricks_notebook_experiment_provider.py +44 -0
  542. mlflow/tracking/default_experiment/registry.py +75 -0
  543. mlflow/tracking/fluent.py +3595 -0
  544. mlflow/tracking/metric_value_conversion_utils.py +93 -0
  545. mlflow/tracking/multimedia.py +206 -0
  546. mlflow/tracking/registry.py +86 -0
  547. mlflow/tracking/request_auth/__init__.py +0 -0
  548. mlflow/tracking/request_auth/abstract_request_auth_provider.py +34 -0
  549. mlflow/tracking/request_auth/registry.py +60 -0
  550. mlflow/tracking/request_header/__init__.py +0 -0
  551. mlflow/tracking/request_header/abstract_request_header_provider.py +36 -0
  552. mlflow/tracking/request_header/databricks_request_header_provider.py +38 -0
  553. mlflow/tracking/request_header/default_request_header_provider.py +17 -0
  554. mlflow/tracking/request_header/registry.py +79 -0
  555. mlflow/transformers/__init__.py +2982 -0
  556. mlflow/transformers/flavor_config.py +258 -0
  557. mlflow/transformers/hub_utils.py +83 -0
  558. mlflow/transformers/llm_inference_utils.py +468 -0
  559. mlflow/transformers/model_io.py +301 -0
  560. mlflow/transformers/peft.py +51 -0
  561. mlflow/transformers/signature.py +183 -0
  562. mlflow/transformers/torch_utils.py +55 -0
  563. mlflow/types/__init__.py +21 -0
  564. mlflow/types/agent.py +270 -0
  565. mlflow/types/chat.py +240 -0
  566. mlflow/types/llm.py +935 -0
  567. mlflow/types/responses.py +139 -0
  568. mlflow/types/responses_helpers.py +416 -0
  569. mlflow/types/schema.py +1505 -0
  570. mlflow/types/type_hints.py +647 -0
  571. mlflow/types/utils.py +753 -0
  572. mlflow/utils/__init__.py +283 -0
  573. mlflow/utils/_capture_modules.py +256 -0
  574. mlflow/utils/_capture_transformers_modules.py +75 -0
  575. mlflow/utils/_spark_utils.py +201 -0
  576. mlflow/utils/_unity_catalog_oss_utils.py +97 -0
  577. mlflow/utils/_unity_catalog_utils.py +479 -0
  578. mlflow/utils/annotations.py +218 -0
  579. mlflow/utils/arguments_utils.py +16 -0
  580. mlflow/utils/async_logging/__init__.py +1 -0
  581. mlflow/utils/async_logging/async_artifacts_logging_queue.py +258 -0
  582. mlflow/utils/async_logging/async_logging_queue.py +366 -0
  583. mlflow/utils/async_logging/run_artifact.py +38 -0
  584. mlflow/utils/async_logging/run_batch.py +58 -0
  585. mlflow/utils/async_logging/run_operations.py +49 -0
  586. mlflow/utils/autologging_utils/__init__.py +737 -0
  587. mlflow/utils/autologging_utils/client.py +432 -0
  588. mlflow/utils/autologging_utils/config.py +33 -0
  589. mlflow/utils/autologging_utils/events.py +294 -0
  590. mlflow/utils/autologging_utils/logging_and_warnings.py +328 -0
  591. mlflow/utils/autologging_utils/metrics_queue.py +71 -0
  592. mlflow/utils/autologging_utils/safety.py +1104 -0
  593. mlflow/utils/autologging_utils/versioning.py +95 -0
  594. mlflow/utils/checkpoint_utils.py +206 -0
  595. mlflow/utils/class_utils.py +6 -0
  596. mlflow/utils/cli_args.py +257 -0
  597. mlflow/utils/conda.py +354 -0
  598. mlflow/utils/credentials.py +231 -0
  599. mlflow/utils/data_utils.py +17 -0
  600. mlflow/utils/databricks_utils.py +1436 -0
  601. mlflow/utils/docstring_utils.py +477 -0
  602. mlflow/utils/doctor.py +133 -0
  603. mlflow/utils/download_cloud_file_chunk.py +43 -0
  604. mlflow/utils/env_manager.py +16 -0
  605. mlflow/utils/env_pack.py +131 -0
  606. mlflow/utils/environment.py +1009 -0
  607. mlflow/utils/exception_utils.py +14 -0
  608. mlflow/utils/file_utils.py +978 -0
  609. mlflow/utils/git_utils.py +77 -0
  610. mlflow/utils/gorilla.py +797 -0
  611. mlflow/utils/import_hooks/__init__.py +363 -0
  612. mlflow/utils/lazy_load.py +51 -0
  613. mlflow/utils/logging_utils.py +168 -0
  614. mlflow/utils/mime_type_utils.py +58 -0
  615. mlflow/utils/mlflow_tags.py +103 -0
  616. mlflow/utils/model_utils.py +486 -0
  617. mlflow/utils/name_utils.py +346 -0
  618. mlflow/utils/nfs_on_spark.py +62 -0
  619. mlflow/utils/openai_utils.py +164 -0
  620. mlflow/utils/os.py +12 -0
  621. mlflow/utils/oss_registry_utils.py +29 -0
  622. mlflow/utils/plugins.py +17 -0
  623. mlflow/utils/process.py +182 -0
  624. mlflow/utils/promptlab_utils.py +146 -0
  625. mlflow/utils/proto_json_utils.py +743 -0
  626. mlflow/utils/pydantic_utils.py +54 -0
  627. mlflow/utils/request_utils.py +279 -0
  628. mlflow/utils/requirements_utils.py +704 -0
  629. mlflow/utils/rest_utils.py +673 -0
  630. mlflow/utils/search_logged_model_utils.py +127 -0
  631. mlflow/utils/search_utils.py +2111 -0
  632. mlflow/utils/secure_loading.py +221 -0
  633. mlflow/utils/security_validation.py +384 -0
  634. mlflow/utils/server_cli_utils.py +61 -0
  635. mlflow/utils/spark_utils.py +15 -0
  636. mlflow/utils/string_utils.py +138 -0
  637. mlflow/utils/thread_utils.py +63 -0
  638. mlflow/utils/time.py +54 -0
  639. mlflow/utils/timeout.py +42 -0
  640. mlflow/utils/uri.py +572 -0
  641. mlflow/utils/validation.py +662 -0
  642. mlflow/utils/virtualenv.py +458 -0
  643. mlflow/utils/warnings_utils.py +25 -0
  644. mlflow/utils/yaml_utils.py +179 -0
  645. mlflow/version.py +24 -0
@@ -0,0 +1,145 @@
1
+ import os
2
+ import re
3
+ from typing import Any
4
+ from urllib.parse import urlparse
5
+
6
+ from mlflow.data.dataset_source import DatasetSource
7
+ from mlflow.exceptions import MlflowException
8
+ from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE
9
+ from mlflow.utils.file_utils import create_tmp_dir
10
+ from mlflow.utils.rest_utils import augmented_raise_for_status, cloud_storage_http_request
11
+
12
+
13
+ def _is_path(filename: str) -> bool:
14
+ """
15
+ Return True if `filename` is a path, False otherwise. For example,
16
+ "foo/bar" is a path, but "bar" is not.
17
+ """
18
+ return os.path.basename(filename) != filename
19
+
20
+
21
class HTTPDatasetSource(DatasetSource):
    """
    Represents the source of a dataset stored at a web location and referred to
    by an HTTP or HTTPS URL.
    """

    def __init__(self, url):
        """
        Args:
            url: The HTTP/S URL referring to the dataset source location.
        """
        self._url = url

    @property
    def url(self):
        """The HTTP/S URL referring to the dataset source location.

        Returns:
            The HTTP/S URL referring to the dataset source location.

        """
        return self._url

    @staticmethod
    def _get_source_type() -> str:
        """Return the source type identifier for this class ("http")."""
        return "http"

    def _extract_filename(self, response) -> str:
        """
        Extracts a filename from the Content-Disposition header or the URL's path.

        Args:
            response: The HTTP response; only its ``headers`` mapping is read.

        Returns:
            The filename from the first ``filename=`` parameter of the
            Content-Disposition header when present, otherwise the basename of the
            URL's path (which may be an empty string).

        Raises:
            MlflowException: If the header's filename contains a directory component.
        """
        if content_disposition := response.headers.get("Content-Disposition"):
            # Stop at the closing quote (or at the next ";" for unquoted values) so that
            # trailing header parameters such as `; filename*=...` are not absorbed into
            # the filename.
            match = re.search(
                r'filename=\s*("[^"]*"|\'[^\']*\'|[^;]+)', content_disposition
            )
            if match:
                filename = match[1].strip().strip("'\"")
                if _is_path(filename):
                    raise MlflowException.invalid_parameter_value(
                        f"Invalid filename in Content-Disposition header: {filename}. "
                        "It must be a file name, not a path."
                    )
                return filename

        # Extract basename from URL if no valid filename in Content-Disposition
        return os.path.basename(urlparse(self.url).path)

    def load(self, dst_path=None) -> str:
        """Downloads the dataset source to the local filesystem.

        Args:
            dst_path: Path of the local filesystem destination directory to which to download the
                dataset source. If the directory does not exist, it is created. If
                unspecified, the dataset source is downloaded to a new uniquely-named
                directory on the local filesystem.

        Returns:
            The path to the downloaded dataset source on the local filesystem.

        """
        resp = cloud_storage_http_request(
            method="GET",
            url=self.url,
            stream=True,
        )
        augmented_raise_for_status(resp)

        basename = self._extract_filename(resp)

        # Neither the header nor the URL path produced a usable name.
        if not basename:
            basename = "dataset_source"

        if dst_path is None:
            dst_path = create_tmp_dir()
        else:
            # Honor the documented contract: create the destination directory on demand
            # instead of failing when the file is opened below.
            os.makedirs(dst_path, exist_ok=True)

        dst_path = os.path.join(dst_path, basename)
        with open(dst_path, "wb") as f:
            chunk_size = 1024 * 1024  # 1 MB
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)

        return dst_path

    @staticmethod
    def _can_resolve(raw_source: Any) -> bool:
        """
        Args:
            raw_source: The raw source, e.g. a string like "http://mysite/mydata.tar.gz".

        Returns:
            True if this DatasetSource can resolve the raw source, False otherwise.
        """
        if not isinstance(raw_source, str):
            return False

        try:
            parsed_source = urlparse(raw_source)
            return parsed_source.scheme in ["http", "https"]
        except Exception:
            # Any parsing failure means the raw source is not a usable HTTP/S URL.
            return False

    @classmethod
    def _resolve(cls, raw_source: Any) -> "HTTPDatasetSource":
        """
        Args:
            raw_source: The raw source, e.g. a string like "http://mysite/mydata.tar.gz".

        Returns:
            An HTTPDatasetSource wrapping the raw source as its URL.
        """
        return HTTPDatasetSource(raw_source)

    def to_dict(self) -> dict[Any, Any]:
        """
        Returns:
            A JSON-compatible dictionary representation of the HTTPDatasetSource.
        """
        return {
            "url": self.url,
        }

    @classmethod
    def from_dict(cls, source_dict: dict[Any, Any]) -> "HTTPDatasetSource":
        """
        Args:
            source_dict: A dictionary representation of the HTTPDatasetSource.

        Returns:
            An instance built from the dictionary's "url" entry.

        Raises:
            MlflowException: If the "url" key is missing (or maps to None).
        """
        url = source_dict.get("url")
        if url is None:
            raise MlflowException(
                'Failed to parse HTTPDatasetSource. Missing expected key: "url"',
                INVALID_PARAMETER_VALUE,
            )

        return cls(url=url)
@@ -0,0 +1,258 @@
1
+ import json
2
+ import logging
3
+ from functools import cached_property
4
+ from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Union
5
+
6
+ from mlflow.data.dataset import Dataset
7
+ from mlflow.data.dataset_source import DatasetSource
8
+ from mlflow.data.digest_utils import compute_pandas_digest
9
+ from mlflow.data.evaluation_dataset import EvaluationDataset
10
+ from mlflow.data.huggingface_dataset_source import HuggingFaceDatasetSource
11
+ from mlflow.data.pyfunc_dataset_mixin import PyFuncConvertibleDatasetMixin, PyFuncInputsOutputs
12
+ from mlflow.exceptions import MlflowException
13
+ from mlflow.protos.databricks_pb2 import INTERNAL_ERROR, INVALID_PARAMETER_VALUE
14
+ from mlflow.types import Schema
15
+ from mlflow.types.utils import _infer_schema
16
+
17
+ _logger = logging.getLogger(__name__)
18
+
19
+ _MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE = 10000
20
+
21
+ if TYPE_CHECKING:
22
+ import datasets
23
+
24
+
25
class HuggingFaceDataset(Dataset, PyFuncConvertibleDatasetMixin):
    """
    Represents a HuggingFace dataset for use with MLflow Tracking.
    """

    def __init__(  # noqa: D417
        self,
        ds: "datasets.Dataset",
        source: HuggingFaceDatasetSource,
        targets: Optional[str] = None,
        name: Optional[str] = None,
        digest: Optional[str] = None,
    ):
        """
        Args:
            ds: A Hugging Face dataset. Must be an instance of `datasets.Dataset`.
                Other types, such as :py:class:`datasets.DatasetDict`, are not supported.
            source: The source of the Hugging Face dataset.
            targets: Optional name of the column holding targets (labels). When given,
                it must be one of ``ds.column_names``.
            name: The name of the dataset. E.g. "wiki_train". If unspecified, a name is
                automatically generated.
            digest: The digest (hash, fingerprint) of the dataset. If unspecified, a digest
                is automatically computed.
        """
        targets_missing = targets is not None and targets not in ds.column_names
        if targets_missing:
            raise MlflowException(
                f"The specified Hugging Face dataset does not contain the specified targets column"
                f" '{targets}'.",
                INVALID_PARAMETER_VALUE,
            )

        # Both attributes are assigned before the base initializer runs, as
        # `_compute_digest` below reads `self._ds`.
        self._ds = ds
        self._targets = targets
        super().__init__(source=source, name=name, digest=digest)

    def _compute_digest(self) -> str:
        """
        Derive a digest from the first batch of the dataset (at most
        `_MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE` rows). Used when the
        caller did not provide a digest at construction time.
        """
        batches = self._ds.to_pandas(
            batch_size=_MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE, batched=True
        )
        first_batch = next(batches)
        return compute_pandas_digest(first_batch)

    def to_dict(self) -> dict[str, str]:
        """Create config dictionary for the dataset.

        Extends the base class dictionary with a JSON-encoded ``schema`` entry (None when
        no schema could be inferred) and a JSON-encoded ``profile`` entry.
        """
        colspec = self.schema
        schema_json = json.dumps({"mlflow_colspec": colspec.to_dict()}) if colspec else None
        config = super().to_dict()
        config["schema"] = schema_json
        config["profile"] = json.dumps(self.profile)
        return config

    @property
    def ds(self) -> "datasets.Dataset":
        """The wrapped Hugging Face ``datasets.Dataset``.

        Returns:
            The Hugging Face ``datasets.Dataset`` instance given at construction.

        """
        return self._ds

    @property
    def targets(self) -> Optional[str]:
        """
        Name of the column that holds targets (labels) for supervised learning.

        Returns:
            The targets column name, or None when no targets column was specified.
        """
        return self._targets

    @property
    def source(self) -> HuggingFaceDatasetSource:
        """Source information for the Hugging Face dataset.

        Returns:
            A :py:class:`mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource`
        """
        return self._source

    @property
    def profile(self) -> Optional[Any]:
        """
        Summary statistics for the Hugging Face dataset: ``num_rows``, ``dataset_size``
        and ``size_in_bytes``, as reported by the underlying dataset.
        """
        wrapped = self._ds
        stats = {
            "num_rows": wrapped.num_rows,
            "dataset_size": wrapped.dataset_size,
            "size_in_bytes": wrapped.size_in_bytes,
        }
        return stats

    @cached_property
    def schema(self) -> Optional[Schema]:
        """
        The MLflow ColSpec schema of the Hugging Face dataset, inferred from its first
        batch. Evaluates to None (after logging a warning) if inference fails.
        """
        try:
            sample = next(
                self._ds.to_pandas(
                    batch_size=_MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE, batched=True
                )
            )
            inferred = _infer_schema(sample)
        except Exception as e:
            _logger.warning("Failed to infer schema for Hugging Face dataset. Exception: %s", e)
            return None
        return inferred

    def to_pyfunc(self) -> PyFuncInputsOutputs:
        """
        Convert the dataset to pyfunc inputs and outputs. With a targets column, that
        column becomes the outputs and the remaining columns the inputs; otherwise the
        whole frame is the inputs and the outputs are None.
        """
        df = self._ds.to_pandas()
        if self._targets is None:
            return PyFuncInputsOutputs(inputs=df, outputs=None)

        if self._targets not in df.columns:
            raise MlflowException(
                f"Failed to convert Hugging Face dataset to pyfunc inputs and outputs because"
                f" the pandas representation of the Hugging Face dataset does not contain the"
                f" specified targets column '{self._targets}'.",
                # Internal error: the constructor already checks that the targets column
                # exists in the Hugging Face dataset
                INTERNAL_ERROR,
            )

        features = df.drop(columns=self._targets)
        labels = df[self._targets]
        return PyFuncInputsOutputs(inputs=features, outputs=labels)

    def to_evaluation_dataset(self, path=None, feature_names=None) -> EvaluationDataset:
        """
        Build an EvaluationDataset from the pandas form of this dataset, as required
        for use with mlflow.evaluate().
        """
        frame = self._ds.to_pandas()
        return EvaluationDataset(
            data=frame,
            targets=self._targets,
            path=path,
            feature_names=feature_names,
            name=self.name,
            digest=self.digest,
        )
176
+
177
+
178
def from_huggingface(
    ds,
    path: Optional[str] = None,
    targets: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    revision=None,
    name: Optional[str] = None,
    digest: Optional[str] = None,
    trust_remote_code: Optional[bool] = None,
    source: Optional[Union[str, DatasetSource]] = None,
) -> HuggingFaceDataset:
    """
    Create a `mlflow.data.huggingface_dataset.HuggingFaceDataset` from a Hugging Face dataset.

    Args:
        ds:
            A Hugging Face dataset. Must be an instance of `datasets.Dataset`. Other types, such as
            `datasets.DatasetDict`, are not supported.
        path: The path of the Hugging Face dataset used to construct the source. This is the same
            argument as `path` in `datasets.load_dataset()` function. To be able to reload the
            dataset via MLflow, `path` must match the path of the dataset on the hub, e.g.,
            "databricks/databricks-dolly-15k". If neither `path` nor `source` is specified, a
            `CodeDatasetSource` is used, which will source information from the run context.
            Ignored (with a warning) when `source` is also provided.
        targets: The name of the Hugging Face `dataset.Dataset` column containing targets (labels)
            for supervised learning.
        data_dir: The `data_dir` of the Hugging Face dataset configuration. This is used by the
            `datasets.load_dataset()` function to reload the dataset upon request via
            :py:func:`HuggingFaceDataset.source.load()
            <mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource.load>`.
        data_files: Paths to source data file(s) for the Hugging Face dataset configuration.
            This is used by the `datasets.load_dataset()` function to reload the
            dataset upon request via :py:func:`HuggingFaceDataset.source.load()
            <mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource.load>`.
        revision: Version of the dataset script to load. This is used by the
            `datasets.load_dataset()` function to reload the dataset upon request via
            :py:func:`HuggingFaceDataset.source.load()
            <mlflow.data.huggingface_dataset_source.HuggingFaceDatasetSource.load>`.
        name: The name of the dataset. E.g. "wiki_train". If unspecified, a name is automatically
            generated.
        digest: The digest (hash, fingerprint) of the dataset. If unspecified, a digest is
            automatically computed.
        trust_remote_code: Whether to trust remote code from the dataset repo.
        source: The source of the dataset, e.g. a S3 URI, an HTTPS URL etc. Either a
            `DatasetSource` instance or a raw value resolved through the dataset source
            registry. Takes precedence over `path`.

    Returns:
        A `HuggingFaceDataset` wrapping `ds` with the selected source.

    Raises:
        MlflowException: If `ds` is not an instance of `datasets.Dataset`.
    """
    import datasets

    from mlflow.data.code_dataset_source import CodeDatasetSource
    from mlflow.data.dataset_source_registry import resolve_dataset_source
    from mlflow.tracking.context import registry

    if not isinstance(ds, datasets.Dataset):
        raise MlflowException(
            f"The specified Hugging Face dataset must be an instance of `datasets.Dataset`."
            f" Instead, found an instance of: {type(ds)}",
            INVALID_PARAMETER_VALUE,
        )

    # Source selection, in priority order: an explicit `source`, then a
    # `HuggingFaceDatasetSource` built from `path`, and finally a `CodeDatasetSource`
    # built from the run context tags.
    if source is not None and path is not None:
        _logger.warning(
            "Both 'source' and 'path' are provided. "
            "'source' will take precedence, and 'path' will be ignored."
        )
    if source is not None:
        source = source if isinstance(source, DatasetSource) else resolve_dataset_source(source)
    elif path is not None:
        source = HuggingFaceDatasetSource(
            path=path,
            config_name=ds.config_name,
            data_dir=data_dir,
            data_files=data_files,
            split=ds.split,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
    else:
        context_tags = registry.resolve_tags()
        source = CodeDatasetSource(tags=context_tags)
    return HuggingFaceDataset(ds=ds, targets=targets, source=source, name=name, digest=digest)
@@ -0,0 +1,118 @@
1
+ from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Union
2
+
3
+ from mlflow.data.dataset_source import DatasetSource
4
+
5
+ if TYPE_CHECKING:
6
+ import datasets
7
+
8
+
9
class HuggingFaceDatasetSource(DatasetSource):
    """Represents the source of a Hugging Face dataset used in MLflow Tracking."""

    def __init__(
        self,
        path: str,
        config_name: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[
            Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
        ] = None,
        split: Optional[Union[str, "datasets.Split"]] = None,
        revision: Optional[Union[str, "datasets.Version"]] = None,
        trust_remote_code: Optional[bool] = None,
    ):
        """Create a `HuggingFaceDatasetSource` instance.

        Arguments in `__init__` match arguments of the same name in
        `datasets.load_dataset() <https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/loading_methods#datasets.load_dataset>`_.
        The only exception is `config_name` matches `name` in `datasets.load_dataset()`, because
        we need to differentiate from `mlflow.data.Dataset` `name` attribute.

        Args:
            path: The path of the Hugging Face dataset, if it is a dataset from HuggingFace hub,
                `path` must match the hub path, e.g., "databricks/databricks-dolly-15k".
            config_name: The name of the Hugging Face dataset configuration.
            data_dir: The `data_dir` of the Hugging Face dataset configuration.
            data_files: Paths to source data file(s) for the Hugging Face dataset configuration.
            split: Which split of the data to load.
            revision: Version of the dataset script to load.
            trust_remote_code: Whether to trust remote code from the dataset repo.
        """
        self.path = path
        self.config_name = config_name
        self.data_dir = data_dir
        self.data_files = data_files
        self.split = split
        self.revision = revision
        self.trust_remote_code = trust_remote_code

    @staticmethod
    def _get_source_type() -> str:
        """Return the source type identifier for this class ("hugging_face")."""
        return "hugging_face"

    def load(self, **kwargs):
        """Load the Hugging Face dataset based on `HuggingFaceDatasetSource`.

        Args:
            kwargs: Additional keyword arguments used for loading the dataset with the Hugging Face
                `datasets.load_dataset()` method.

        Returns:
            An instance of `datasets.Dataset`.

        Raises:
            KeyError: If `kwargs` repeats an argument that this source already supplies.
        """
        import datasets
        from packaging.version import Version

        load_kwargs = {
            "path": self.path,
            "name": self.config_name,
            "data_dir": self.data_dir,
            "data_files": self.data_files,
            "split": self.split,
            "revision": self.revision,
        }

        # this argument only exists in >= 2.16.0
        if Version(datasets.__version__) >= Version("2.16.0"):
            load_kwargs["trust_remote_code"] = self.trust_remote_code

        # Refuse silent overrides: the caller must not repeat an argument owned by the source.
        intersecting_keys = set(load_kwargs.keys()) & set(kwargs.keys())
        if intersecting_keys:
            raise KeyError(
                f"Found duplicated arguments in `HuggingFaceDatasetSource` and "
                f"`kwargs`: {intersecting_keys}. Please remove them from `kwargs`."
            )
        load_kwargs.update(kwargs)
        return datasets.load_dataset(**load_kwargs)

    @staticmethod
    def _can_resolve(raw_source: Any) -> bool:
        # NB: Initially, we expect that Hugging Face dataset sources will only be used with
        # Hugging Face datasets constructed by from_huggingface, which can create
        # an instance of HuggingFaceDatasetSource directly without the need for resolution
        return False

    @classmethod
    def _resolve(cls, raw_source: str) -> "HuggingFaceDatasetSource":
        """Resolution from a raw source is not supported; always raises NotImplementedError."""
        raise NotImplementedError

    def to_dict(self) -> dict[Any, Any]:
        """
        Returns:
            A dictionary representation of the source. `split` is stringified when set and
            kept as None otherwise; `trust_remote_code` is not included.
        """
        return {
            "path": self.path,
            "config_name": self.config_name,
            "data_dir": self.data_dir,
            "data_files": self.data_files,
            # Keep a missing split as None: str(None) would serialize as the literal "None",
            # which `load()` would then forward to `datasets.load_dataset()` as a split name.
            "split": None if self.split is None else str(self.split),
            "revision": self.revision,
        }

    @classmethod
    def from_dict(cls, source_dict: dict[Any, Any]) -> "HuggingFaceDatasetSource":
        """
        Args:
            source_dict: A dictionary representation of the HuggingFaceDatasetSource.

        Returns:
            An instance built from the dictionary; absent keys default to None.
        """
        split = source_dict.get("split")
        # Tolerate dictionaries in which a missing split was stringified to "None".
        if split == "None":
            split = None
        return cls(
            path=source_dict.get("path"),
            config_name=source_dict.get("config_name"),
            data_dir=source_dict.get("data_dir"),
            data_files=source_dict.get("data_files"),
            split=split,
            revision=source_dict.get("revision"),
        )
@@ -0,0 +1,104 @@
1
+ import hashlib
2
+ import json
3
+ from typing import Any, Optional
4
+
5
+ from mlflow.data.dataset import Dataset
6
+ from mlflow.data.dataset_source import DatasetSource
7
+ from mlflow.types import Schema
8
+
9
+
10
class MetaDataset(Dataset):
    """Dataset that only contains metadata.

    This class is used to represent a dataset that only contains metadata, which is useful when
    users only want to log metadata to MLflow without logging the actual data. For example, users
    build a custom dataset from a text file publicly hosted in the Internet, and they want to log
    the text file's URL to MLflow for future tracking instead of the dataset itself.

    Args:
        source: dataset source of type `DatasetSource`, indicates where the data is from.
        name: name of the dataset. If not specified, a name is automatically generated.
        digest: digest (hash, fingerprint) of the dataset. If not specified, a digest is
            automatically computed.
        schema: schema of the dataset.

    .. code-block:: python
        :caption: Create a MetaDataset

        import mlflow

        mlflow.set_experiment("/test-mlflow-meta-dataset")

        source = mlflow.data.http_dataset_source.HTTPDatasetSource(
            url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        )
        ds = mlflow.data.meta_dataset.MetaDataset(source)

        with mlflow.start_run() as run:
            mlflow.log_input(ds)

    .. code-block:: python
        :caption: Create a MetaDataset with schema

        import mlflow
        from mlflow.types import ColSpec, Schema

        mlflow.set_experiment("/test-mlflow-meta-dataset")

        source = mlflow.data.http_dataset_source.HTTPDatasetSource(
            url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        )
        schema = Schema(
            [
                ColSpec(type=mlflow.types.DataType.string, name="text"),
                ColSpec(type=mlflow.types.DataType.integer, name="label"),
            ]
        )
        ds = mlflow.data.meta_dataset.MetaDataset(source, schema=schema)

        with mlflow.start_run() as run:
            mlflow.log_input(ds)
    """

    def __init__(
        self,
        source: DatasetSource,
        name: Optional[str] = None,
        digest: Optional[str] = None,
        schema: Optional[Schema] = None,
    ):
        # Set `self._schema` before calling the superclass constructor because
        # `self._compute_digest` depends on `self._schema`.
        self._schema = schema
        super().__init__(source=source, name=name, digest=digest)

    def _compute_digest(self) -> str:
        """Computes a digest for the dataset.

        The digest computation of `MetaDataset` is based on the dataset's name, source, source type,
        and schema instead of the actual data. Basically we compute the sha256 hash of the config
        dict and keep the first 8 hexadecimal characters.
        """
        config = {
            "name": self.name,
            "source": self.source.to_json(),
            "source_type": self.source._get_source_type(),
            # An empty string stands in for a missing schema.
            "schema": self.schema.to_dict() if self.schema else "",
        }
        return hashlib.sha256(json.dumps(config).encode("utf-8")).hexdigest()[:8]

    @property
    def schema(self) -> Optional[Schema]:
        """Returns the schema of the dataset, or None if no schema was provided."""
        return self._schema

    def to_dict(self) -> dict[str, str]:
        """Create config dictionary for the MetaDataset.

        Returns the dictionary produced by the base class, with a JSON-encoded ``schema``
        entry added when the dataset has a schema.
        """
        config = super().to_dict()
        if self.schema:
            config["schema"] = json.dumps({"mlflow_colspec": self.schema.to_dict()})
        return config