acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -36,9 +36,10 @@ from datahub.ingestion.api.source_helpers import (
36
36
  )
37
37
  from datahub.ingestion.api.workunit import MetadataWorkUnit
38
38
  from datahub.ingestion.graph.client import get_default_graph
39
- from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
40
- FineGrainedLineageDownstreamType,
41
- FineGrainedLineageUpstreamType,
39
+ from datahub.ingestion.graph.config import ClientMode
40
+ from datahub.metadata.schema_classes import (
41
+ FineGrainedLineageDownstreamTypeClass,
42
+ FineGrainedLineageUpstreamTypeClass,
42
43
  )
43
44
 
44
45
  logger = logging.getLogger(__name__)
@@ -48,7 +49,7 @@ class EntityConfig(EnvConfigMixin):
48
49
  name: str
49
50
  type: str
50
51
  platform: str
51
- platform_instance: Optional[str]
52
+ platform_instance: Optional[str] = None
52
53
 
53
54
  @validator("type")
54
55
  def type_must_be_supported(cls, v: str) -> str:
@@ -79,9 +80,9 @@ class FineGrainedLineageConfig(ConfigModel):
79
80
  @validator("upstreamType")
80
81
  def upstream_type_must_be_supported(cls, v: str) -> str:
81
82
  allowed_types = [
82
- FineGrainedLineageUpstreamType.FIELD_SET,
83
- FineGrainedLineageUpstreamType.DATASET,
84
- FineGrainedLineageUpstreamType.NONE,
83
+ FineGrainedLineageUpstreamTypeClass.FIELD_SET,
84
+ FineGrainedLineageUpstreamTypeClass.DATASET,
85
+ FineGrainedLineageUpstreamTypeClass.NONE,
85
86
  ]
86
87
  if v not in allowed_types:
87
88
  raise ValueError(
@@ -92,8 +93,8 @@ class FineGrainedLineageConfig(ConfigModel):
92
93
  @validator("downstreamType")
93
94
  def downstream_type_must_be_supported(cls, v: str) -> str:
94
95
  allowed_types = [
95
- FineGrainedLineageDownstreamType.FIELD_SET,
96
- FineGrainedLineageDownstreamType.FIELD,
96
+ FineGrainedLineageDownstreamTypeClass.FIELD_SET,
97
+ FineGrainedLineageDownstreamTypeClass.FIELD,
97
98
  ]
98
99
  if v not in allowed_types:
99
100
  raise ValueError(
@@ -210,7 +211,7 @@ def _get_lineage_mcp(
210
211
 
211
212
  # extract the old lineage and save it for the new mcp
212
213
  if preserve_upstream:
213
- client = get_default_graph()
214
+ client = get_default_graph(ClientMode.INGESTION)
214
215
 
215
216
  old_upstream_lineage = get_aspects_for_entity(
216
217
  client._session,
@@ -1,10 +1,13 @@
1
+ import json
2
+ import os
1
3
  import time
2
4
  from dataclasses import dataclass
3
- from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
5
+ from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union
4
6
 
5
7
  from mlflow import MlflowClient
6
- from mlflow.entities import Experiment, Run
8
+ from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
7
9
  from mlflow.entities.model_registry import ModelVersion, RegisteredModel
10
+ from mlflow.exceptions import MlflowException
8
11
  from mlflow.store.entities import PagedList
9
12
  from pydantic.fields import Field
10
13
 
@@ -14,7 +17,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
14
17
  )
15
18
  from datahub.configuration.source_common import EnvConfigMixin
16
19
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
17
- from datahub.emitter.mcp_builder import ContainerKey
20
+ from datahub.emitter.mcp_builder import ExperimentKey
18
21
  from datahub.ingestion.api.common import PipelineContext
19
22
  from datahub.ingestion.api.decorators import (
20
23
  SupportStatus,
@@ -29,10 +32,15 @@ from datahub.ingestion.api.source import (
29
32
  SourceReport,
30
33
  )
31
34
  from datahub.ingestion.api.workunit import MetadataWorkUnit
32
- from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
35
+ from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
36
+ from datahub.ingestion.source.common.subtypes import (
37
+ MLAssetSubTypes,
38
+ SourceCapabilityModifier,
39
+ )
33
40
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
34
41
  StaleEntityRemovalHandler,
35
42
  StaleEntityRemovalSourceReport,
43
+ StatefulStaleMetadataRemovalConfig,
36
44
  )
37
45
  from datahub.ingestion.source.state.stateful_ingestion_base import (
38
46
  StatefulIngestionConfigBase,
@@ -42,6 +50,7 @@ from datahub.metadata.schema_classes import (
42
50
  AuditStampClass,
43
51
  ContainerClass,
44
52
  DataPlatformInstanceClass,
53
+ DataProcessInstanceInputClass,
45
54
  DataProcessInstanceOutputClass,
46
55
  DataProcessInstancePropertiesClass,
47
56
  DataProcessInstanceRunEventClass,
@@ -60,24 +69,19 @@ from datahub.metadata.schema_classes import (
60
69
  TagAssociationClass,
61
70
  TagPropertiesClass,
62
71
  TimeStampClass,
72
+ UpstreamClass,
73
+ UpstreamLineageClass,
63
74
  VersionPropertiesClass,
64
75
  VersionTagClass,
65
76
  _Aspect,
66
77
  )
67
- from datahub.metadata.urns import (
68
- DataPlatformUrn,
69
- MlModelUrn,
70
- VersionSetUrn,
71
- )
78
+ from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
72
79
  from datahub.sdk.container import Container
80
+ from datahub.sdk.dataset import Dataset
73
81
 
74
82
  T = TypeVar("T")
75
83
 
76
84
 
77
- class ContainerKeyWithId(ContainerKey):
78
- id: str
79
-
80
-
81
85
  class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
82
86
  tracking_uri: Optional[str] = Field(
83
87
  default=None,
@@ -105,6 +109,22 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
105
109
  " If neither is set, external URLs are not generated."
106
110
  ),
107
111
  )
112
+ materialize_dataset_inputs: Optional[bool] = Field(
113
+ default=False,
114
+ description="Whether to materialize dataset inputs for each run",
115
+ )
116
+ source_mapping_to_platform: Optional[dict] = Field(
117
+ default=None, description="Mapping of source type to datahub platform"
118
+ )
119
+
120
+ username: Optional[str] = Field(
121
+ default=None, description="Username for MLflow authentication"
122
+ )
123
+ password: Optional[str] = Field(
124
+ default=None, description="Password for MLflow authentication"
125
+ )
126
+
127
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
108
128
 
109
129
 
110
130
  @dataclass
@@ -116,11 +136,18 @@ class MLflowRegisteredModelStageInfo:
116
136
 
117
137
  @platform_name("MLflow")
118
138
  @config_class(MLflowConfig)
119
- @support_status(SupportStatus.TESTING)
139
+ @support_status(SupportStatus.INCUBATING)
120
140
  @capability(
121
141
  SourceCapability.DESCRIPTIONS,
122
142
  "Extract descriptions for MLflow Registered Models and Model Versions",
123
143
  )
144
+ @capability(
145
+ SourceCapability.CONTAINERS,
146
+ "Extract ML experiments",
147
+ subtype_modifier=[
148
+ SourceCapabilityModifier.MLFLOW_EXPERIMENT,
149
+ ],
150
+ )
124
151
  @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
125
152
  class MLflowSource(StatefulIngestionSourceBase):
126
153
  platform = "mlflow"
@@ -152,7 +179,17 @@ class MLflowSource(StatefulIngestionSourceBase):
152
179
  self.ctx = ctx
153
180
  self.config = config
154
181
  self.report = StaleEntityRemovalSourceReport()
155
- self.client = MlflowClient(
182
+ self.client = self._configure_client()
183
+
184
def _configure_client(self) -> MlflowClient:
    """
    Build the MlflowClient used by this source.

    When both a username and a password are configured, they are exported
    through the MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD
    environment variables before the client is constructed.

    Returns:
        An MlflowClient created with the configured tracking and registry URIs.

    Raises:
        ValueError: if only one of username / password is configured.
    """
    user = self.config.username
    secret = self.config.password

    # Exactly one of the two credentials being set is a configuration error.
    only_one_set = bool(user) ^ bool(secret)
    if only_one_set:
        raise ValueError("Both username and password must be set together")

    if user and secret:
        os.environ["MLFLOW_TRACKING_USERNAME"] = user
        os.environ["MLFLOW_TRACKING_PASSWORD"] = secret

    client_kwargs = {
        "tracking_uri": self.config.tracking_uri,
        "registry_uri": self.config.registry_uri,
    }
    return MlflowClient(**client_kwargs)
@@ -213,6 +250,7 @@ class MLflowSource(StatefulIngestionSourceBase):
213
250
  if runs:
214
251
  for run in runs:
215
252
  yield from self._get_run_workunits(experiment, run)
253
+ yield from self._get_dataset_input_workunits(run)
216
254
 
217
255
  def _get_experiment_custom_properties(self, experiment):
218
256
  experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -224,7 +262,7 @@ class MLflowSource(StatefulIngestionSourceBase):
224
262
  self, experiment: Experiment
225
263
  ) -> Iterable[MetadataWorkUnit]:
226
264
  experiment_container = Container(
227
- container_key=ContainerKeyWithId(
265
+ container_key=ExperimentKey(
228
266
  platform=str(DataPlatformUrn(platform_name=self.platform)),
229
267
  id=experiment.name,
230
268
  ),
@@ -262,10 +300,187 @@ class MLflowSource(StatefulIngestionSourceBase):
262
300
  type="SKIPPED", nativeResultType=self.platform
263
301
  )
264
302
 
303
def _get_dataset_schema(
    self, dataset: MlflowDataset
) -> Optional[List[Tuple[str, str]]]:
    """
    Extract (column name, column type) pairs from an MLflow dataset schema.

    The schema is read from ``dataset.schema`` and is expected to be a JSON
    object with a top-level ``mlflow_colspec`` list whose entries carry
    ``name`` and ``type`` keys.

    Args:
        dataset: The MLflow dataset whose ``schema`` string is parsed.

    Returns:
        A list of (name, type) tuples, or None when the dataset has no
        schema, the schema cannot be parsed as JSON (a warning is reported),
        or the parsed document does not follow the layout above.
    """
    raw_schema = dataset.schema
    if raw_schema is None:
        # Nothing to parse; json.loads(None) would raise TypeError.
        return None

    try:
        schema_dict = json.loads(raw_schema)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers schema values that are not str/bytes.
        self.report.warning(
            title="Failed to load dataset schema",
            message="Schema metadata will be missing due to a JSON parsing error.",
            context=f"Dataset: {dataset.name}, Schema: {dataset.schema}",
        )
        return None

    # Valid JSON is not necessarily an object (e.g. a bare number or list).
    if not isinstance(schema_dict, dict) or "mlflow_colspec" not in schema_dict:
        return None

    try:
        return [
            (field["name"], field["type"])
            for field in schema_dict["mlflow_colspec"]
        ]
    except (KeyError, TypeError):
        # Column spec entries are missing keys or are not mappings.
        return None
326
+
327
def _get_external_dataset_urn(self, platform: str, dataset_name: str) -> str:
    """
    Build the URN string of a dataset that lives on an external platform.
    Args:
        platform: The platform of the external dataset (e.g., 's3', 'bigquery')
        dataset_name: The name of the dataset on that platform
    Returns:
        str: The string form of the DatasetUrn built from both arguments
    """
    external_urn = DatasetUrn(platform=platform, name=dataset_name)
    return str(external_urn)
337
+
338
def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
    """
    Generate workunits for dataset inputs in a run.

    For each dataset input:
    1. If source type is 'local' or 'code':
        - Emit a dataset on this source's platform.
    2. Otherwise the source type is mapped to a DataHub platform:
        - If materialization is enabled, a dataset on the mapped platform is
          emitted as well; when no platform can be resolved a failure is
          reported and the input is skipped.
        - A dataset reference on this source's platform is emitted, with a
          COPY upstream pointing at the mapped platform's dataset when the
          mapping resolved (no upstream otherwise).
    3. All dataset references are attached to the run as input edges.

    A run without dataset inputs yields nothing.
    """
    # `run.inputs` may be absent or None; treat that as "no dataset inputs".
    run_inputs = getattr(run, "inputs", None)
    dataset_inputs = getattr(run_inputs, "dataset_inputs", None) or []

    dataset_reference_urns = []

    for dataset_input in dataset_inputs:
        dataset = dataset_input.dataset
        source_type = dataset.source_type

        # Each tag unpacks into two (attribute, value) pairs -- presumably
        # ("key", <key>) and ("value", <value>); verify against the MLflow
        # InputTag entity.
        custom_properties = {k[1]: v[1] for k, v in dataset_input.tags}

        # Without a schema string there is nothing to parse or to keep.
        formatted_schema = (
            self._get_dataset_schema(dataset)
            if dataset.schema is not None
            else None
        )
        if formatted_schema is None and dataset.schema is not None:
            # Keep the raw schema string when it could not be structured.
            custom_properties["schema"] = dataset.schema

        # Handle local/code datasets
        if source_type in ("local", "code"):
            local_dataset = Dataset(
                platform=self.platform,
                name=dataset.name,
                schema=formatted_schema,
                custom_properties=custom_properties,
            )
            yield from local_dataset.as_workunits()
            dataset_reference_urns.append(local_dataset.urn)
            continue

        # Handle hosted datasets
        formatted_platform = self._get_dataset_platform_from_source_type(
            source_type
        )

        if self.config.materialize_dataset_inputs:
            # Materialization needs a concrete platform to create the dataset on.
            if not formatted_platform:
                self.report.failure(
                    title="Unable to materialize dataset inputs",
                    message=f"No mapping dataPlatform found for dataset input source type '{source_type}'",
                    context=f"please add `source_mapping_to_platform` in config "
                    f"(e.g. '{source_type}': 'snowflake')",
                )
                continue
            hosted_dataset = Dataset(
                platform=formatted_platform,
                name=dataset.name,
                schema=formatted_schema,
                custom_properties=custom_properties,
            )
            yield from hosted_dataset.as_workunits()

        # Dataset reference on this platform, linked upstream when possible.
        upstream_lineage = None
        if formatted_platform:
            upstream_lineage = UpstreamLineageClass(
                upstreams=[
                    UpstreamClass(
                        self._get_external_dataset_urn(
                            formatted_platform, dataset.name
                        ),
                        type="COPY",
                    )
                ]
            )
        hosted_dataset_reference = Dataset(
            platform=self.platform,
            name=dataset.name,
            schema=formatted_schema,
            custom_properties=custom_properties,
            upstreams=upstream_lineage,
        )
        dataset_reference_urns.append(hosted_dataset_reference.urn)
        yield from hosted_dataset_reference.as_workunits()

    if not dataset_reference_urns:
        return

    # Add dataset references as upstreams for the run
    run_urn = DataProcessInstance(
        id=run.info.run_id,
        orchestrator=self.platform,
    ).urn
    input_edges = [
        EdgeClass(destinationUrn=str(dataset_ref_urn))
        for dataset_ref_urn in dataset_reference_urns
    ]
    yield MetadataChangeProposalWrapper(
        entityUrn=str(run_urn),
        aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
    ).as_workunit()
438
+
439
def _get_dataset_platform_from_source_type(self, source_type: str) -> Optional[str]:
    """
    Resolve the DataHub platform for an MLflow dataset source type.

    The source type is lower-cased, then resolved in this order:
    1. The user-provided mapping from the config
    2. The built-in mapping ("gs" -> "gcs")
    3. The source type itself, when it passes `_is_valid_platform`

    Returns None when none of the above yields a platform.
    """
    normalized = source_type.lower()

    # A user-provided mapping always wins.
    user_mapped = self._get_platform_from_user_mapping(normalized)
    if user_mapped:
        return user_mapped

    # Built-in mapping.
    if normalized == "gs":
        return "gcs"

    # Fall back to the source type itself when it is a known platform.
    return normalized if self._is_valid_platform(normalized) else None
464
+
465
def _get_platform_from_user_mapping(self, source_type: str) -> Optional[str]:
    """
    Look up a source type in the `source_mapping_to_platform` config.
    Returns None when no mapping is configured, or when the mapping has no
    (or an empty) entry for the source type.
    """
    user_mapping = self.config.source_mapping_to_platform or {}
    # An empty or missing entry counts as "no platform".
    return user_mapping.get(source_type) or None
479
+
265
480
  def _get_run_workunits(
266
481
  self, experiment: Experiment, run: Run
267
482
  ) -> Iterable[MetadataWorkUnit]:
268
- experiment_key = ContainerKeyWithId(
483
+ experiment_key = ExperimentKey(
269
484
  platform=str(DataPlatformUrn(self.platform)), id=experiment.name
270
485
  )
271
486
 
@@ -385,8 +600,8 @@ class MLflowSource(StatefulIngestionSourceBase):
385
600
  )
386
601
  return runs
387
602
 
388
- @staticmethod
389
603
  def _traverse_mlflow_search_func(
604
+ self,
390
605
  search_func: Callable[..., PagedList[T]],
391
606
  **kwargs: Any,
392
607
  ) -> Iterable[T]:
@@ -394,12 +609,24 @@ class MLflowSource(StatefulIngestionSourceBase):
394
609
  Utility to traverse an MLflow search_* functions which return PagedList.
395
610
  """
396
611
  next_page_token = None
397
- while True:
398
- paged_list = search_func(page_token=next_page_token, **kwargs)
399
- yield from paged_list.to_list()
400
- next_page_token = paged_list.token
401
- if not next_page_token:
612
+ try:
613
+ while True:
614
+ paged_list = search_func(page_token=next_page_token, **kwargs)
615
+ yield from paged_list.to_list()
616
+ next_page_token = paged_list.token
617
+ if not next_page_token:
618
+ return
619
+ except MlflowException as e:
620
+ if e.error_code == "ENDPOINT_NOT_FOUND":
621
+ self.report.warning(
622
+ title="MLflow API Endpoint Not Found for Experiments.",
623
+ message="Please upgrade to version 1.28.0 or higher to ensure compatibility. Skipping ingestion for experiments and runs.",
624
+ context=None,
625
+ exc=e,
626
+ )
402
627
  return
628
+ else:
629
+ raise # Only re-raise other exceptions
403
630
 
404
631
  def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
405
632
  return (
@@ -659,6 +886,10 @@ class MLflowSource(StatefulIngestionSourceBase):
659
886
  )
660
887
  return wu
661
888
 
889
def _is_valid_platform(self, platform: Optional[str]) -> bool:
    """Return whether platform is listed in KNOWN_VALID_PLATFORM_NAMES"""
    known_platforms = KNOWN_VALID_PLATFORM_NAMES
    return platform in known_platforms
892
+
662
893
  @classmethod
663
894
  def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
664
895
  config = MLflowConfig.parse_obj(config_dict)
File without changes