acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,20 @@
1
1
  import logging
2
2
  import re
3
- from concurrent.futures import ThreadPoolExecutor
3
+ import time
4
4
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
5
5
  from urllib.parse import urljoin
6
6
 
7
+ from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
7
8
  from datahub.emitter.mce_builder import (
9
+ UNKNOWN_USER,
8
10
  make_data_platform_urn,
9
11
  make_dataplatform_instance_urn,
10
12
  make_dataset_urn_with_platform_instance,
11
13
  make_domain_urn,
12
14
  make_group_urn,
15
+ make_ml_model_group_urn,
13
16
  make_schema_field_urn,
17
+ make_ts_millis,
14
18
  make_user_urn,
15
19
  )
16
20
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -23,9 +27,9 @@ from datahub.emitter.mcp_builder import (
23
27
  UnitySchemaKey,
24
28
  UnitySchemaKeyWithMetastore,
25
29
  add_dataset_to_container,
30
+ add_entity_to_container,
26
31
  gen_containers,
27
32
  )
28
- from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
29
33
  from datahub.ingestion.api.common import PipelineContext
30
34
  from datahub.ingestion.api.decorators import (
31
35
  SupportStatus,
@@ -53,6 +57,7 @@ from datahub.ingestion.source.aws.s3_util import (
53
57
  from datahub.ingestion.source.common.subtypes import (
54
58
  DatasetContainerSubTypes,
55
59
  DatasetSubTypes,
60
+ SourceCapabilityModifier,
56
61
  )
57
62
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
58
63
  StaleEntityRemovalHandler,
@@ -72,13 +77,19 @@ from datahub.ingestion.source.unity.hive_metastore_proxy import (
72
77
  HIVE_METASTORE,
73
78
  HiveMetastoreProxy,
74
79
  )
80
+ from datahub.ingestion.source.unity.platform_resource_repository import (
81
+ UnityCatalogPlatformResourceRepository,
82
+ )
75
83
  from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
76
84
  from datahub.ingestion.source.unity.proxy_types import (
77
85
  DATA_TYPE_REGISTRY,
78
86
  Catalog,
79
87
  Column,
80
88
  CustomCatalogType,
89
+ HiveTableType,
81
90
  Metastore,
91
+ Model,
92
+ ModelVersion,
82
93
  Notebook,
83
94
  NotebookId,
84
95
  Schema,
@@ -87,8 +98,17 @@ from datahub.ingestion.source.unity.proxy_types import (
87
98
  TableReference,
88
99
  )
89
100
  from datahub.ingestion.source.unity.report import UnityCatalogReport
101
+ from datahub.ingestion.source.unity.tag_entities import (
102
+ UnityCatalogTagPlatformResource,
103
+ UnityCatalogTagPlatformResourceId,
104
+ )
90
105
  from datahub.ingestion.source.unity.usage import UnityCatalogUsageExtractor
91
- from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
106
+ from datahub.metadata.com.linkedin.pegasus2avro.common import (
107
+ GlobalTags,
108
+ MetadataAttribution,
109
+ Siblings,
110
+ TagAssociation,
111
+ )
92
112
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
93
113
  DatasetLineageType,
94
114
  FineGrainedLineage,
@@ -98,11 +118,13 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
98
118
  ViewProperties,
99
119
  )
100
120
  from datahub.metadata.schema_classes import (
121
+ AuditStampClass,
101
122
  BrowsePathsClass,
102
123
  DataPlatformInstanceClass,
103
124
  DatasetLineageTypeClass,
104
125
  DatasetPropertiesClass,
105
126
  DomainsClass,
127
+ MLModelPropertiesClass,
106
128
  MySqlDDLClass,
107
129
  NullTypeClass,
108
130
  OwnerClass,
@@ -116,12 +138,10 @@ from datahub.metadata.schema_classes import (
116
138
  UpstreamClass,
117
139
  UpstreamLineageClass,
118
140
  )
141
+ from datahub.metadata.urns import MlModelGroupUrn, MlModelUrn, TagUrn
142
+ from datahub.sdk import MLModel, MLModelGroup
119
143
  from datahub.sql_parsing.schema_resolver import SchemaResolver
120
- from datahub.sql_parsing.sqlglot_lineage import (
121
- SqlParsingResult,
122
- sqlglot_lineage,
123
- view_definition_lineage_helper,
124
- )
144
+ from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
125
145
  from datahub.utilities.file_backed_collections import FileBackedDict
126
146
  from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
127
147
  from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -138,23 +158,32 @@ logger: logging.Logger = logging.getLogger(__name__)
138
158
  @capability(SourceCapability.USAGE_STATS, "Enabled by default")
139
159
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
140
160
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
141
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
161
+ @capability(
162
+ SourceCapability.CONTAINERS,
163
+ "Enabled by default",
164
+ subtype_modifier=[
165
+ SourceCapabilityModifier.CATALOG,
166
+ SourceCapabilityModifier.SCHEMA,
167
+ ],
168
+ )
142
169
  @capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
143
170
  @capability(
144
171
  SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"
145
172
  )
146
173
  @capability(
147
174
  SourceCapability.DELETION_DETECTION,
148
- "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
175
+ "Enabled by default via stateful ingestion",
149
176
  supported=True,
150
177
  )
151
- @support_status(SupportStatus.INCUBATING)
178
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
179
+ @support_status(SupportStatus.CERTIFIED)
152
180
  class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
153
181
  """
154
182
  This plugin extracts the following metadata from Databricks Unity Catalog:
155
183
  - metastores
156
184
  - schemas
157
185
  - tables and column lineage
186
+ - model and model versions
158
187
  """
159
188
 
160
189
  config: UnityCatalogSourceConfig
@@ -162,6 +191,10 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
162
191
  platform: str = "databricks"
163
192
  platform_instance_name: Optional[str]
164
193
  sql_parser_schema_resolver: Optional[SchemaResolver] = None
194
+ platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
195
+ None
196
+ )
197
+ sql_parsing_aggregator: Optional[SqlParsingAggregator] = None
165
198
 
166
199
  def get_report(self) -> UnityCatalogReport:
167
200
  return self.report
@@ -180,6 +213,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
180
213
  config.warehouse_id,
181
214
  report=self.report,
182
215
  hive_metastore_proxy=self.hive_metastore_proxy,
216
+ lineage_data_source=config.lineage_data_source,
217
+ usage_data_source=config.usage_data_source,
218
+ databricks_api_page_size=config.databricks_api_page_size,
183
219
  )
184
220
 
185
221
  self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
@@ -205,12 +241,31 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
205
241
  self.table_refs: Set[TableReference] = set()
206
242
  self.view_refs: Set[TableReference] = set()
207
243
  self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
208
- self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
209
- FileBackedDict()
210
- )
211
244
 
212
245
  # Global map of tables, for profiling
213
246
  self.tables: FileBackedDict[Table] = FileBackedDict()
247
+ if self.ctx.graph:
248
+ self.platform_resource_repository = UnityCatalogPlatformResourceRepository(
249
+ self.ctx.graph, platform_instance=self.platform_instance_name
250
+ )
251
+ else:
252
+ self.platform_resource_repository = None
253
+
254
+ if self.config._forced_disable_tag_extraction:
255
+ self.report.report_warning(
256
+ "Some features disabled because of configuration conflicts",
257
+ "Tag Extraction is disabled due to missing warehouse_id in config",
258
+ )
259
+
260
+ if self.config._forced_disable_hive_metastore_extraction:
261
+ self.report.report_warning(
262
+ "Some features disabled because of configuration conflicts",
263
+ "Hive Metastore Extraction is disabled due to missing warehouse_id in config",
264
+ )
265
+
266
+ # Include platform resource repository in report for automatic cache statistics
267
+ if self.config.include_tags and self.platform_resource_repository:
268
+ self.report.tag_urn_resolver_cache = self.platform_resource_repository
214
269
 
215
270
  def init_hive_metastore_proxy(self):
216
271
  self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
@@ -229,6 +284,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
229
284
  platform_instance=self.config.platform_instance,
230
285
  env=self.config.env,
231
286
  )
287
+ self.sql_parsing_aggregator = SqlParsingAggregator(
288
+ platform=self.platform,
289
+ platform_instance=self.config.platform_instance,
290
+ env=self.config.env,
291
+ schema_resolver=self.sql_parser_schema_resolver,
292
+ generate_lineage=True,
293
+ generate_queries=False,
294
+ generate_usage_statistics=False,
295
+ generate_operations=False,
296
+ )
297
+ self.report.sql_aggregator = self.sql_parsing_aggregator.report
232
298
  except Exception as e:
233
299
  logger.debug("Exception", exc_info=True)
234
300
  self.warn(
@@ -383,12 +449,12 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
383
449
  self.config.workspace_url, f"#notebook/{notebook.id}"
384
450
  ),
385
451
  created=(
386
- TimeStampClass(int(notebook.created_at.timestamp() * 1000))
452
+ TimeStampClass(make_ts_millis(notebook.created_at))
387
453
  if notebook.created_at
388
454
  else None
389
455
  ),
390
456
  lastModified=(
391
- TimeStampClass(int(notebook.modified_at.timestamp() * 1000))
457
+ TimeStampClass(make_ts_millis(notebook.modified_at))
392
458
  if notebook.modified_at
393
459
  else None
394
460
  ),
@@ -407,17 +473,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
407
473
  if not notebook.upstreams:
408
474
  return None
409
475
 
476
+ upstreams = []
477
+ for upstream_ref in notebook.upstreams:
478
+ timestamp = make_ts_millis(upstream_ref.last_updated)
479
+ upstreams.append(
480
+ self._create_upstream_class(
481
+ self.gen_dataset_urn(upstream_ref),
482
+ DatasetLineageTypeClass.COPY,
483
+ timestamp,
484
+ )
485
+ )
486
+
410
487
  return MetadataChangeProposalWrapper(
411
488
  entityUrn=self.gen_notebook_urn(notebook),
412
- aspect=UpstreamLineageClass(
413
- upstreams=[
414
- UpstreamClass(
415
- dataset=self.gen_dataset_urn(upstream_ref),
416
- type=DatasetLineageTypeClass.COPY,
417
- )
418
- for upstream_ref in notebook.upstreams
419
- ]
420
- ),
489
+ aspect=UpstreamLineageClass(upstreams=upstreams),
421
490
  ).as_workunit()
422
491
 
423
492
  def process_metastores(self) -> Iterable[MetadataWorkUnit]:
@@ -436,14 +505,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
436
505
  self, metastore: Optional[Metastore]
437
506
  ) -> Iterable[MetadataWorkUnit]:
438
507
  for catalog in self._get_catalogs(metastore):
439
- if not self.config.catalog_pattern.allowed(catalog.id):
440
- self.report.catalogs.dropped(catalog.id)
441
- continue
508
+ with self.report.new_stage(f"Ingest catalog {catalog.id}"):
509
+ if not self.config.catalog_pattern.allowed(catalog.id):
510
+ self.report.catalogs.dropped(catalog.id)
511
+ continue
442
512
 
443
- yield from self.gen_catalog_containers(catalog)
444
- yield from self.process_schemas(catalog)
513
+ yield from self.gen_catalog_containers(catalog)
514
+ yield from self.process_schemas(catalog)
445
515
 
446
- self.report.catalogs.processed(catalog.id)
516
+ self.report.catalogs.processed(catalog.id)
447
517
 
448
518
  def _get_catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]:
449
519
  if self.config.catalogs:
@@ -466,6 +536,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
466
536
  yield from self.gen_schema_containers(schema)
467
537
  try:
468
538
  yield from self.process_tables(schema)
539
+ yield from self.process_ml_models(schema)
469
540
  except Exception as e:
470
541
  logger.exception(f"Error parsing schema {schema}")
471
542
  self.report.report_warning(
@@ -506,13 +577,42 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
506
577
  yield from self.add_table_to_dataset_container(dataset_urn, schema)
507
578
 
508
579
  table_props = self._create_table_property_aspect(table)
580
+ tags = None
581
+ if not isinstance(table.table_type, HiveTableType) and self.config.include_tags:
582
+ try:
583
+ table_tags = self._get_table_tags(
584
+ table.ref.catalog, table.ref.schema, table.ref.table
585
+ )
586
+ if table_tags:
587
+ logger.debug(f"Table tags for {table.ref}: {table_tags}")
588
+ attribution = MetadataAttribution(
589
+ # source="unity-catalog",
590
+ actor="urn:li:corpuser:datahub",
591
+ time=int(time.time() * 1000),
592
+ )
593
+ tags = GlobalTags(
594
+ tags=[
595
+ TagAssociation(
596
+ tag=tag.to_datahub_tag_urn().urn(),
597
+ attribution=attribution,
598
+ )
599
+ for tag in table_tags
600
+ ]
601
+ )
602
+
603
+ yield from self.gen_platform_resources(table_tags)
604
+
605
+ except Exception as e:
606
+ logger.exception(f"Error fetching table {table.ref} tags", exc_info=e)
509
607
 
510
608
  view_props = None
511
609
  if table.view_definition:
512
610
  view_props = self._create_view_property_aspect(table)
513
611
 
514
612
  sub_type = self._create_table_sub_type_aspect(table)
515
- schema_metadata = self._create_schema_metadata_aspect(table)
613
+ schema_metadata, platform_resources = self._create_schema_metadata_aspect(table)
614
+ yield from platform_resources
615
+
516
616
  domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name)
517
617
  ownership = self._create_table_ownership_aspect(table)
518
618
  data_platform_instance = self._create_data_platform_instance_aspect()
@@ -534,8 +634,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
534
634
  self.sql_parser_schema_resolver.add_schema_metadata(
535
635
  dataset_urn, schema_metadata
536
636
  )
537
- if table.view_definition:
538
- self.view_definitions[dataset_urn] = (table.ref, table.view_definition)
637
+ if table.view_definition and self.sql_parsing_aggregator:
638
+ self.sql_parsing_aggregator.add_view_definition(
639
+ view_urn=dataset_urn,
640
+ view_definition=table.view_definition,
641
+ default_db=table.ref.catalog,
642
+ default_schema=table.ref.schema,
643
+ )
539
644
 
540
645
  if (
541
646
  table_props.customProperties.get("table_type")
@@ -585,29 +690,107 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
585
690
  domain,
586
691
  data_platform_instance,
587
692
  lineage,
693
+ tags,
588
694
  ],
589
695
  )
590
696
  ]
591
697
 
698
+ def process_ml_models(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
699
+ for ml_model in self.unity_catalog_api_proxy.ml_models(
700
+ schema=schema, max_results=self.config.ml_model_max_results
701
+ ):
702
+ yield from self.process_ml_model(ml_model, schema)
703
+ ml_model_urn = self.gen_ml_model_urn(ml_model.id)
704
+ for ml_model_version in self.unity_catalog_api_proxy.ml_model_versions(
705
+ ml_model, include_aliases=self.config.include_ml_model_aliases
706
+ ):
707
+ yield from self.process_ml_model_version(
708
+ ml_model_urn, ml_model_version, schema
709
+ )
710
+
711
+ def process_ml_model(
712
+ self, ml_model: Model, schema: Schema
713
+ ) -> Iterable[MetadataWorkUnit]:
714
+ ml_model_group = MLModelGroup(
715
+ id=ml_model.id,
716
+ name=ml_model.name,
717
+ platform=self.platform,
718
+ platform_instance=schema.name,
719
+ env=self.config.env,
720
+ description=ml_model.description,
721
+ created=ml_model.created_at,
722
+ last_modified=ml_model.updated_at,
723
+ )
724
+ yield from ml_model_group.as_workunits()
725
+ yield from self.add_model_to_schema_container(str(ml_model_group.urn), schema)
726
+ self.report.ml_models.processed(ml_model.id)
727
+
728
+ def process_ml_model_version(
729
+ self, ml_model_urn: str, ml_model_version: ModelVersion, schema: Schema
730
+ ) -> Iterable[MetadataWorkUnit]:
731
+ extra_aspects = []
732
+ if ml_model_version.created_at is not None:
733
+ created_time = int(ml_model_version.created_at.timestamp() * 1000)
734
+ created_actor = (
735
+ f"urn:li:platformResource:{ml_model_version.created_by}"
736
+ if ml_model_version.created_by
737
+ else None
738
+ )
739
+ extra_aspects.append(
740
+ MLModelPropertiesClass(
741
+ created=TimeStampClass(time=created_time, actor=created_actor),
742
+ )
743
+ )
744
+
745
+ ml_model = MLModel(
746
+ id=ml_model_version.id,
747
+ name=ml_model_version.name,
748
+ version=str(ml_model_version.version),
749
+ aliases=ml_model_version.aliases,
750
+ description=ml_model_version.description,
751
+ model_group=ml_model_urn,
752
+ platform=self.platform,
753
+ last_modified=ml_model_version.updated_at,
754
+ extra_aspects=extra_aspects,
755
+ )
756
+
757
+ yield from ml_model.as_workunits()
758
+ yield from self.add_model_version_to_schema_container(str(ml_model.urn), schema)
759
+ self.report.ml_model_versions.processed(ml_model_version.id)
760
+
592
761
  def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]:
762
+ # Calculate datetime filters for lineage
763
+ lineage_start_time = None
764
+ lineage_end_time = self.config.end_time
765
+
766
+ if self.config.ignore_start_time_lineage:
767
+ lineage_start_time = None # Ignore start time to get all lineage
768
+ else:
769
+ lineage_start_time = self.config.start_time
770
+
593
771
  if self.config.include_table_lineage:
594
772
  self.unity_catalog_api_proxy.table_lineage(
595
- table, include_entity_lineage=self.config.include_notebooks
773
+ table,
774
+ include_entity_lineage=self.config.include_notebooks,
775
+ start_time=lineage_start_time,
776
+ end_time=lineage_end_time,
596
777
  )
597
778
 
598
779
  if self.config.include_column_lineage and table.upstreams:
599
780
  if len(table.columns) > self.config.column_lineage_column_limit:
600
781
  self.report.num_column_lineage_skipped_column_count += 1
601
782
 
602
- with ThreadPoolExecutor(
603
- max_workers=self.config.lineage_max_workers
604
- ) as executor:
605
- for column in table.columns[: self.config.column_lineage_column_limit]:
606
- executor.submit(
607
- self.unity_catalog_api_proxy.get_column_lineage,
608
- table,
609
- column.name,
610
- )
783
+ column_names = [
784
+ column.name
785
+ for column in table.columns[: self.config.column_lineage_column_limit]
786
+ ]
787
+ self.unity_catalog_api_proxy.get_column_lineage(
788
+ table,
789
+ column_names,
790
+ max_workers=self.config.lineage_max_workers,
791
+ start_time=lineage_start_time,
792
+ end_time=lineage_end_time,
793
+ )
611
794
 
612
795
  return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table)
613
796
 
@@ -635,18 +818,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
635
818
  for d_col, u_cols in sorted(downstream_to_upstream_cols.items())
636
819
  )
637
820
 
821
+ timestamp = make_ts_millis(upstream_ref.last_updated)
638
822
  upstreams.append(
639
- UpstreamClass(
640
- dataset=upstream_urn,
641
- type=DatasetLineageTypeClass.TRANSFORMED,
823
+ self._create_upstream_class(
824
+ upstream_urn,
825
+ DatasetLineageTypeClass.TRANSFORMED,
826
+ timestamp,
642
827
  )
643
828
  )
644
829
 
645
- for notebook in table.upstream_notebooks:
830
+ for notebook in table.upstream_notebooks.values():
831
+ timestamp = make_ts_millis(notebook.last_updated)
646
832
  upstreams.append(
647
- UpstreamClass(
648
- dataset=self.gen_notebook_urn(notebook),
649
- type=DatasetLineageTypeClass.TRANSFORMED,
833
+ self._create_upstream_class(
834
+ self.gen_notebook_urn(notebook.id),
835
+ DatasetLineageTypeClass.TRANSFORMED,
836
+ timestamp,
650
837
  )
651
838
  )
652
839
 
@@ -708,6 +895,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
708
895
  env=self.config.env,
709
896
  )
710
897
 
898
+ def gen_ml_model_urn(self, name: str) -> str:
899
+ return make_ml_model_group_urn(
900
+ platform=self.platform,
901
+ group_name=name,
902
+ env=self.config.env,
903
+ )
904
+
711
905
  def gen_notebook_urn(self, notebook: Union[Notebook, NotebookId]) -> str:
712
906
  notebook_id = notebook.id if isinstance(notebook, Notebook) else notebook
713
907
  return NotebookKey(
@@ -716,8 +910,41 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
716
910
  instance=self.config.platform_instance,
717
911
  ).as_urn()
718
912
 
913
+ def _create_upstream_class(
914
+ self,
915
+ dataset_urn: str,
916
+ lineage_type: Union[str, DatasetLineageTypeClass],
917
+ timestamp: Optional[int],
918
+ ) -> UpstreamClass:
919
+ """
920
+ Helper method to create UpstreamClass with optional audit stamp.
921
+ If timestamp is None, audit stamp is omitted.
922
+ """
923
+ if timestamp is not None:
924
+ return UpstreamClass(
925
+ dataset=dataset_urn,
926
+ type=lineage_type,
927
+ auditStamp=AuditStampClass(
928
+ time=timestamp,
929
+ actor=UNKNOWN_USER,
930
+ ),
931
+ )
932
+ else:
933
+ return UpstreamClass(
934
+ dataset=dataset_urn,
935
+ type=lineage_type,
936
+ )
937
+
719
938
  def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
720
939
  domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
940
+ schema_tags = []
941
+ if self.config.include_tags:
942
+ schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
943
+ schema.catalog.name
944
+ ).get(f"{schema.catalog.name}.{schema.name}", [])
945
+ logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
946
+ # Generate platform resources for schema tags
947
+ yield from self.gen_platform_resources(schema_tags)
721
948
 
722
949
  schema_container_key = self.gen_schema_key(schema)
723
950
  yield from gen_containers(
@@ -729,6 +956,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
729
956
  description=schema.comment,
730
957
  owner_urn=self.get_owner_urn(schema.owner),
731
958
  external_url=f"{self.external_url_base}/{schema.catalog.name}/{schema.name}",
959
+ tags=[tag.to_datahub_tag_urn().name for tag in schema_tags]
960
+ if schema_tags
961
+ else None,
732
962
  )
733
963
 
734
964
  def gen_metastore_containers(
@@ -749,6 +979,14 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
749
979
 
750
980
  def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
751
981
  domain_urn = self._gen_domain_urn(catalog.name)
982
+ catalog_tags = []
983
+ if self.config.include_tags:
984
+ catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(
985
+ catalog.name
986
+ ).get(catalog.name, [])
987
+ logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
988
+ # Generate platform resources for schema tags
989
+ yield from self.gen_platform_resources(catalog_tags)
752
990
 
753
991
  catalog_container_key = self.gen_catalog_key(catalog)
754
992
  yield from gen_containers(
@@ -764,6 +1002,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
764
1002
  description=catalog.comment,
765
1003
  owner_urn=self.get_owner_urn(catalog.owner),
766
1004
  external_url=f"{self.external_url_base}/{catalog.name}",
1005
+ tags=[tag.to_datahub_tag_urn().name for tag in catalog_tags]
1006
+ if catalog_tags
1007
+ else None,
767
1008
  )
768
1009
 
769
1010
  def gen_schema_key(self, schema: Schema) -> ContainerKey:
@@ -832,6 +1073,50 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
832
1073
  dataset_urn=dataset_urn,
833
1074
  )
834
1075
 
1076
+ def add_model_to_schema_container(
1077
+ self, model_urn: str, schema: Schema
1078
+ ) -> Iterable[MetadataWorkUnit]:
1079
+ schema_container_key = self.gen_schema_key(schema)
1080
+ yield from add_entity_to_container(
1081
+ container_key=schema_container_key,
1082
+ entity_type=MlModelGroupUrn.ENTITY_TYPE,
1083
+ entity_urn=model_urn,
1084
+ )
1085
+
1086
+ def add_model_version_to_schema_container(
1087
+ self, model_version_urn: str, schema: Schema
1088
+ ) -> Iterable[MetadataWorkUnit]:
1089
+ schema_container_key = self.gen_schema_key(schema)
1090
+ yield from add_entity_to_container(
1091
+ container_key=schema_container_key,
1092
+ entity_type=MlModelUrn.ENTITY_TYPE,
1093
+ entity_urn=model_version_urn,
1094
+ )
1095
+
1096
+ def _get_catalog_tags(
1097
+ self, catalog: str, schema: str, table: str
1098
+ ) -> List[UnityCatalogTag]:
1099
+ all_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog)
1100
+ return all_tags.get(f"{catalog}", [])
1101
+
1102
+ def _get_schema_tags(
1103
+ self, catalog: str, schema: str, table: str
1104
+ ) -> List[UnityCatalogTag]:
1105
+ all_tags = self.unity_catalog_api_proxy.get_schema_tags(catalog)
1106
+ return all_tags.get(f"{catalog}.{schema}", [])
1107
+
1108
+ def _get_table_tags(
1109
+ self, catalog: str, schema: str, table: str
1110
+ ) -> List[UnityCatalogTag]:
1111
+ all_tags = self.unity_catalog_api_proxy.get_table_tags(catalog)
1112
+ return all_tags.get(f"{catalog}.{schema}.{table}", [])
1113
+
1114
+ def _get_column_tags(
1115
+ self, catalog: str, schema: str, table: str, column: str
1116
+ ) -> List[UnityCatalogTag]:
1117
+ all_tags = self.unity_catalog_api_proxy.get_column_tags(catalog)
1118
+ return all_tags.get(f"{catalog}.{schema}.{table}.{column}", [])
1119
+
835
1120
  def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass:
836
1121
  custom_properties: dict = {}
837
1122
  if table.storage_location is not None:
@@ -860,16 +1145,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
860
1145
  created: Optional[TimeStampClass] = None
861
1146
  if table.created_at:
862
1147
  custom_properties["created_at"] = str(table.created_at)
863
- created = TimeStampClass(
864
- int(table.created_at.timestamp() * 1000),
865
- make_user_urn(table.created_by) if table.created_by else None,
866
- )
1148
+ created_ts = make_ts_millis(table.created_at)
1149
+ if created_ts is not None:
1150
+ created = TimeStampClass(
1151
+ created_ts,
1152
+ make_user_urn(table.created_by) if table.created_by else None,
1153
+ )
867
1154
  last_modified = created
868
1155
  if table.updated_at:
869
- last_modified = TimeStampClass(
870
- int(table.updated_at.timestamp() * 1000),
871
- table.updated_by and make_user_urn(table.updated_by),
872
- )
1156
+ updated_ts = make_ts_millis(table.updated_at)
1157
+ if updated_ts is not None:
1158
+ last_modified = TimeStampClass(
1159
+ updated_ts,
1160
+ table.updated_by and make_user_urn(table.updated_by),
1161
+ )
873
1162
 
874
1163
  return DatasetPropertiesClass(
875
1164
  name=table.name,
@@ -921,30 +1210,127 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
921
1210
  materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
922
1211
  )
923
1212
 
924
- def _create_schema_metadata_aspect(self, table: Table) -> SchemaMetadataClass:
925
- schema_fields: List[SchemaFieldClass] = []
1213
+ def get_or_create_from_unity_tag(
1214
+ self,
1215
+ unity_tag: UnityCatalogTag,
1216
+ platform_instance: Optional[str],
1217
+ managed_by_datahub: bool = False,
1218
+ ) -> UnityCatalogTagPlatformResource:
1219
+ """
1220
+ Optimized helper to get or create a Unity Catalog tag platform resource.
1221
+ This eliminates the duplicate search by skipping the from_tag method which was
1222
+ doing a redundant search before get_from_datahub.
1223
+ """
1224
+ # Create the platform resource ID directly without the from_tag search
1225
+ platform_resource_id = UnityCatalogTagPlatformResourceId(
1226
+ tag_key=unity_tag.key.raw_text,
1227
+ tag_value=unity_tag.value.raw_text if unity_tag.value is not None else None,
1228
+ platform_instance=platform_instance,
1229
+ exists_in_unity_catalog=True, # We got it from Unity Catalog
1230
+ persisted=False,
1231
+ )
1232
+
1233
+ # Use the repository's get_entity_from_datahub method which handles
1234
+ # searching and caching internally - this is the ONLY search we need
1235
+ if self.platform_resource_repository is None:
1236
+ raise ValueError("Platform resource repository not initialized")
1237
+ return self.platform_resource_repository.get_entity_from_datahub(
1238
+ platform_resource_id,
1239
+ managed_by_datahub,
1240
+ )
1241
+
1242
+ def gen_platform_resources(
1243
+ self, tags: List[UnityCatalogTag]
1244
+ ) -> Iterable[MetadataWorkUnit]:
1245
+ if self.ctx.graph and self.platform_resource_repository:
1246
+ for tag in tags:
1247
+ try:
1248
+ # Use optimized helper method that combines ID creation and entity retrieval
1249
+ unity_catalog_tag = self.get_or_create_from_unity_tag(
1250
+ tag,
1251
+ self.platform_instance_name,
1252
+ managed_by_datahub=False,
1253
+ )
1254
+ logger.debug(
1255
+ f"Retrieved/created platform resource for tag {tag.key.raw_text}"
1256
+ )
1257
+ if (
1258
+ tag.to_datahub_tag_urn().urn()
1259
+ not in unity_catalog_tag.datahub_linked_resources().urns
1260
+ ):
1261
+ unity_catalog_tag.datahub_linked_resources().add(
1262
+ tag.to_datahub_tag_urn().urn()
1263
+ )
1264
+ platform_resource = unity_catalog_tag.as_platform_resource()
1265
+ for mcp in platform_resource.to_mcps():
1266
+ yield MetadataWorkUnit(
1267
+ id=f"platform_resource-{platform_resource.id}",
1268
+ mcp=mcp,
1269
+ )
1270
+ except Exception as e:
1271
+ logger.exception(
1272
+ f"Error processing platform resource for tag {tag}"
1273
+ )
1274
+ self.report.report_warning(
1275
+ message="Error processing platform resource for tag",
1276
+ context=str(tag),
1277
+ title="Error processing platform resource for tag",
1278
+ exc=e,
1279
+ )
1280
+ continue
926
1281
 
1282
+ def _create_schema_metadata_aspect(
1283
+ self, table: Table
1284
+ ) -> Tuple[SchemaMetadataClass, Iterable[MetadataWorkUnit]]:
1285
+ schema_fields: List[SchemaFieldClass] = []
1286
+ unique_tags: Set[UnityCatalogTag] = set()
927
1287
  for column in table.columns:
928
- schema_fields.extend(self._create_schema_field(column))
929
-
930
- return SchemaMetadataClass(
931
- schemaName=table.id,
932
- platform=make_data_platform_urn(self.platform),
933
- fields=schema_fields,
934
- hash="",
935
- version=0,
936
- platformSchema=MySqlDDLClass(tableSchema=""),
1288
+ tag_urns: Optional[List[TagUrn]] = None
1289
+ if self.config.include_tags:
1290
+ column_tags = self._get_column_tags(
1291
+ table.ref.catalog, table.ref.schema, table.ref.table, column.name
1292
+ )
1293
+ unique_tags.update(column_tags)
1294
+ tag_urns = [tag.to_datahub_tag_urn() for tag in column_tags]
1295
+ schema_fields.extend(self._create_schema_field(column, tag_urns))
1296
+
1297
+ platform_resources = self.gen_platform_resources(list(unique_tags))
1298
+ return (
1299
+ SchemaMetadataClass(
1300
+ schemaName=table.id,
1301
+ platform=make_data_platform_urn(self.platform),
1302
+ fields=schema_fields,
1303
+ hash="",
1304
+ version=0,
1305
+ platformSchema=MySqlDDLClass(tableSchema=""),
1306
+ ),
1307
+ platform_resources,
937
1308
  )
938
1309
 
939
1310
  @staticmethod
940
- def _create_schema_field(column: Column) -> List[SchemaFieldClass]:
1311
+ def _create_schema_field(
1312
+ column: Column, tags: Optional[List[TagUrn]]
1313
+ ) -> List[SchemaFieldClass]:
941
1314
  _COMPLEX_TYPE = re.compile("^(struct|array)")
942
-
1315
+ global_tags: Optional[GlobalTags] = None
943
1316
  if _COMPLEX_TYPE.match(column.type_text.lower()):
944
1317
  return get_schema_fields_for_hive_column(
945
1318
  column.name, column.type_text.lower(), description=column.comment
946
1319
  )
947
1320
  else:
1321
+ if tags is not None:
1322
+ attribution = MetadataAttribution(
1323
+ source="urn:li:dataPlatform:unity-catalog",
1324
+ actor="urn:li:corpuser:datahub",
1325
+ time=int(time.time() * 1000),
1326
+ )
1327
+ global_tags = GlobalTags(
1328
+ tags=[
1329
+ TagAssociation(tag=tag.urn(), attribution=attribution)
1330
+ for tag in tags
1331
+ ]
1332
+ )
1333
+
948
1334
  return [
949
1335
  SchemaFieldClass(
950
1336
  fieldPath=column.name,
@@ -954,78 +1340,26 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
954
1340
  nativeDataType=column.type_text,
955
1341
  nullable=column.nullable,
956
1342
  description=column.comment,
1343
+ globalTags=global_tags if tags else None,
957
1344
  )
958
1345
  ]
959
1346
 
960
- def _run_sql_parser(
961
- self, view_ref: TableReference, query: str, schema_resolver: SchemaResolver
962
- ) -> Optional[SqlParsingResult]:
963
- raw_lineage = sqlglot_lineage(
964
- query,
965
- schema_resolver=schema_resolver,
966
- default_db=view_ref.catalog,
967
- default_schema=view_ref.schema,
968
- )
969
- view_urn = self.gen_dataset_urn(view_ref)
970
-
971
- if raw_lineage.debug_info.table_error:
972
- logger.debug(
973
- f"Failed to parse lineage for view {view_ref}: "
974
- f"{raw_lineage.debug_info.table_error}"
975
- )
976
- self.report.num_view_definitions_failed_parsing += 1
977
- self.report.view_definitions_parsing_failures.append(
978
- f"Table-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.table_error}"
979
- )
980
- return None
981
-
982
- elif raw_lineage.debug_info.column_error:
983
- self.report.num_view_definitions_failed_column_parsing += 1
984
- self.report.view_definitions_parsing_failures.append(
985
- f"Column-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.column_error}"
986
- )
987
- else:
988
- self.report.num_view_definitions_parsed += 1
989
- if raw_lineage.out_tables != [view_urn]:
990
- self.report.num_view_definitions_view_urn_mismatch += 1
991
- return view_definition_lineage_helper(raw_lineage, view_urn)
992
-
993
1347
  def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
994
1348
  if not (
995
1349
  self.config.include_hive_metastore
996
1350
  and self.config.include_table_lineage
997
- and self.sql_parser_schema_resolver
1351
+ and self.sql_parsing_aggregator
998
1352
  ):
999
1353
  return
1000
- # This is only used for parsing view lineage. Usage, Operations are emitted elsewhere
1001
- builder = SqlParsingBuilder(
1002
- generate_lineage=True,
1003
- generate_usage_statistics=False,
1004
- generate_operations=False,
1005
- )
1006
- for dataset_name in self.view_definitions.keys():
1007
- view_ref, view_definition = self.view_definitions[dataset_name]
1008
- result = self._run_sql_parser(
1009
- view_ref,
1010
- view_definition,
1011
- self.sql_parser_schema_resolver,
1012
- )
1013
- if result and result.out_tables:
1014
- # This does not yield any workunits but we use
1015
- # yield here to execute this method
1016
- yield from builder.process_sql_parsing_result(
1017
- result=result,
1018
- query=view_definition,
1019
- is_view_ddl=True,
1020
- include_column_lineage=self.config.include_view_column_lineage,
1021
- )
1022
- yield from builder.gen_workunits()
1354
+
1355
+ for mcp in self.sql_parsing_aggregator.gen_metadata():
1356
+ yield mcp.as_workunit()
1023
1357
 
1024
1358
  def close(self):
1025
1359
  if self.hive_metastore_proxy:
1026
1360
  self.hive_metastore_proxy.close()
1027
- if self.view_definitions:
1028
- self.view_definitions.close()
1361
+ if self.sql_parsing_aggregator:
1362
+ self.sql_parsing_aggregator.close()
1029
1363
  if self.sql_parser_schema_resolver:
1030
1364
  self.sql_parser_schema_resolver.close()
1031
1365