acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/proxy.py
@@ -4,16 +4,22 @@ Manage the communication with DataBricks Server and provide equivalent dataclass
 
 import dataclasses
 import logging
+import os
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
-from typing import Any, Dict, Iterable, List, Optional, Union, cast
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast
 from unittest.mock import patch
 
+import cachetools
+from cachetools import cached
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.catalog import (
     CatalogInfo,
     ColumnInfo,
     GetMetastoreSummaryResponse,
     MetastoreInfo,
+    ModelVersionInfo,
+    RegisteredModelInfo,
     SchemaInfo,
     TableInfo,
 )
@@ -25,9 +31,17 @@ from databricks.sdk.service.sql import (
     QueryStatus,
 )
 from databricks.sdk.service.workspace import ObjectType
+from databricks.sql import connect
+from databricks.sql.types import Row
+from typing_extensions import assert_never
 
 from datahub._version import nice_version_name
+from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
+from datahub.ingestion.source.unity.config import (
+    LineageDataSource,
+    UsageDataSource,
+)
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,
@@ -39,7 +53,10 @@ from datahub.ingestion.source.unity.proxy_types import (
     CustomCatalogType,
     ExternalTableReference,
     Metastore,
+    Model,
+    ModelVersion,
     Notebook,
+    NotebookReference,
     Query,
     Schema,
     ServicePrincipal,
@@ -47,9 +64,31 @@ from datahub.ingestion.source.unity.proxy_types import (
     TableReference,
 )
 from datahub.ingestion.source.unity.report import UnityCatalogReport
+from datahub.utilities.file_backed_collections import FileBackedDict
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+# It is enough to keep the cache size to 1, since we only process one catalog at a time
+# We need to change this if we want to support parallel processing of multiple catalogs
+_MAX_CONCURRENT_CATALOGS = 1
+
+
+# Import and apply the proxy patch from separate module
+try:
+    from datahub.ingestion.source.unity.proxy_patch import (
+        apply_databricks_proxy_fix,
+        mask_proxy_credentials,
+    )
+
+    # Apply the fix when the module is imported
+    apply_databricks_proxy_fix()
+except ImportError as e:
+    logger.debug(f"Could not import proxy patch module: {e}")
+
+    # Fallback function for masking credentials
+    def mask_proxy_credentials(url: Optional[str]) -> str:
+        return "***MASKED***" if url else "None"
+
 
 @dataclasses.dataclass
 class TableInfoWithGeneration(TableInfo):
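
The _MAX_CONCURRENT_CATALOGS = 1 constant above backs the @cached(cachetools.FIFOCache(maxsize=1)) decorators used later in this file: since only one catalog is processed at a time, a single-entry FIFO cache is enough to avoid re-running the per-catalog system-table queries, and a new catalog simply evicts the previous entry. A minimal standalone sketch of that caching behaviour (load_lineage is a hypothetical stand-in, not part of this module):

    import cachetools
    from cachetools import cached

    calls = 0

    @cached(cachetools.FIFOCache(maxsize=1))
    def load_lineage(catalog: str) -> dict:
        # Hypothetical stand-in for a per-catalog system-table query.
        global calls
        calls += 1
        return {"catalog": catalog}

    load_lineage("sales")    # computed
    load_lineage("sales")    # served from the single cached entry
    load_lineage("finance")  # evicts "sales" because maxsize=1
    assert calls == 2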
@@ -85,6 +124,32 @@ class QueryFilterWithStatementTypes(QueryFilter):
         return v
 
 
+@dataclasses.dataclass
+class TableUpstream:
+    table_name: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class ExternalUpstream:
+    path: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class TableLineageInfo:
+    upstreams: List[TableUpstream] = dataclasses.field(default_factory=list)
+    external_upstreams: List[ExternalUpstream] = dataclasses.field(default_factory=list)
+    upstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+    downstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+
+
 class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     _workspace_client: WorkspaceClient
     _workspace_url: str
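
The dataclasses added above are the values stored in the per-catalog lineage mapping, keyed by fully qualified table name. A short sketch of how one entry is shaped, assuming this release of acryl-datahub (with the Databricks extra) is installed so the module-level classes are importable; the table names and timestamps are placeholders:

    from datetime import datetime

    from datahub.ingestion.source.unity.proxy import (
        ExternalUpstream,
        TableLineageInfo,
        TableUpstream,
    )

    # One value of the FileBackedDict built per catalog, keyed by "catalog.schema.table".
    info = TableLineageInfo(
        upstreams=[
            TableUpstream(
                table_name="main.raw.orders",
                source_type="TABLE",
                last_updated=datetime(2024, 1, 1),
            )
        ],
        external_upstreams=[
            ExternalUpstream(path="s3://bucket/landing/orders", source_type="PATH")
        ],
    )
    print(len(info.upstreams), len(info.external_upstreams))  # 1 1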
@@ -98,6 +163,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         warehouse_id: Optional[str],
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
+        lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
+        usage_data_source: UsageDataSource = UsageDataSource.AUTO,
+        databricks_api_page_size: int = 0,
     ):
         self._workspace_client = WorkspaceClient(
             host=workspace_url,
@@ -108,9 +176,24 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
+        self.lineage_data_source = lineage_data_source
+        self.usage_data_source = usage_data_source
+        self.databricks_api_page_size = databricks_api_page_size
+        self._sql_connection_params = {
+            "server_hostname": self._workspace_client.config.host.replace(
+                "https://", ""
+            ),
+            "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
+            "access_token": self._workspace_client.config.token,
+            "user_agent_entry": "datahub",
+        }
 
     def check_basic_connectivity(self) -> bool:
-        return bool(self._workspace_client.catalogs.list(include_browse=True))
+        return bool(
+            self._workspace_client.catalogs.list(
+                include_browse=True, max_results=self.databricks_api_page_size
+            )
+        )
 
     def assigned_metastore(self) -> Optional[Metastore]:
         response = self._workspace_client.metastores.summary()
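
The new _sql_connection_params dict above is what the databricks-sql connector uses to reach the configured SQL warehouse: the workspace hostname is the SDK host with its scheme stripped, and the HTTP path points at the warehouse id. A small sketch mirroring that derivation with placeholder values (none of these are real credentials):

    workspace_host = "https://example.cloud.databricks.com"  # placeholder
    warehouse_id = "1234567890abcdef"                        # placeholder
    access_token = "dapi-..."                                # placeholder

    # Mirrors _sql_connection_params above.
    sql_connection_params = {
        "server_hostname": workspace_host.replace("https://", ""),
        "http_path": f"/sql/1.0/warehouses/{warehouse_id}",
        "access_token": access_token,
        "user_agent_entry": "datahub",
    }
    print(sql_connection_params["server_hostname"])  # example.cloud.databricks.com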
@@ -120,7 +203,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         if self.hive_metastore_proxy:
             yield self.hive_metastore_proxy.hive_metastore_catalog(metastore)
 
-        response = self._workspace_client.catalogs.list(include_browse=True)
+        response = self._workspace_client.catalogs.list(
+            include_browse=True, max_results=self.databricks_api_page_size
+        )
         if not response:
             logger.info("Catalogs not found")
             return
@@ -152,7 +237,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog)
             return
         response = self._workspace_client.schemas.list(
-            catalog_name=catalog.name, include_browse=True
+            catalog_name=catalog.name,
+            include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Schemas not found for catalog {catalog.id}")
@@ -174,6 +261,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             catalog_name=schema.catalog.name,
             schema_name=schema.name,
             include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Tables not found for schema {schema.id}")
@@ -189,6 +277,40 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 logger.warning(f"Error parsing table: {e}")
                 self.report.report_warning("table-parse", str(e))
 
+    def ml_models(
+        self, schema: Schema, max_results: Optional[int] = None
+    ) -> Iterable[Model]:
+        response = self._workspace_client.registered_models.list(
+            catalog_name=schema.catalog.name,
+            schema_name=schema.name,
+            max_results=max_results,
+        )
+        for ml_model in response:
+            optional_ml_model = self._create_ml_model(schema, ml_model)
+            if optional_ml_model:
+                yield optional_ml_model
+
+    def ml_model_versions(
+        self, ml_model: Model, include_aliases: bool = False
+    ) -> Iterable[ModelVersion]:
+        response = self._workspace_client.model_versions.list(
+            full_name=ml_model.id,
+            include_browse=True,
+            max_results=self.databricks_api_page_size,
+        )
+        for version in response:
+            if version.version is not None:
+                if include_aliases:
+                    # to get aliases info, use GET
+                    version = self._workspace_client.model_versions.get(
+                        ml_model.id, version.version, include_aliases=True
+                    )
+                optional_ml_model_version = self._create_ml_model_version(
+                    ml_model, version
+                )
+                if optional_ml_model_version:
+                    yield optional_ml_model_version
+
     def service_principals(self) -> Iterable[ServicePrincipal]:
         for principal in self._workspace_client.service_principals.list():
             optional_sp = self._create_service_principal(principal)
@@ -206,7 +328,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         return group_list
 
     def workspace_notebooks(self) -> Iterable[Notebook]:
-        for obj in self._workspace_client.workspace.list("/", recursive=True):
+        workspace_objects_iter = self._workspace_client.workspace.list(
+            "/", recursive=True, max_results=self.databricks_api_page_size
+        )
+        for obj in workspace_objects_iter:
             if obj.object_type == ObjectType.NOTEBOOK and obj.object_id and obj.path:
                 yield Notebook(
                     id=obj.object_id,
@@ -248,7 +373,6 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     def _query_history(
         self,
         filter_by: QueryFilterWithStatementTypes,
-        max_results: int = 1000,
         include_metrics: bool = False,
     ) -> Iterable[QueryInfo]:
         """Manual implementation of the query_history.list() endpoint.
@@ -260,9 +384,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         """
         method = "GET"
         path = "/api/2.0/sql/history/queries"
+
         body: Dict[str, Any] = {
             "include_metrics": include_metrics,
-            "max_results": max_results, # Max batch size
+            "max_results": self.databricks_api_page_size, # Max batch size
         }
 
         response: dict = self._workspace_client.api_client.do( # type: ignore
@@ -280,10 +405,257 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 method, path, body={**body, "page_token": response["next_page_token"]}
             )
 
-    def list_lineages_by_table(
+    def get_query_history_via_system_tables(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+    ) -> Iterable[Query]:
+        """Get query history using system.query.history table.
+
+        This method provides an alternative to the REST API for fetching query history,
+        offering better performance and richer data for large query volumes.
+        """
+        logger.info(
+            f"Fetching query history from system.query.history for period: {start_time} to {end_time}"
+        )
+
+        allowed_types = [typ.value for typ in ALLOWED_STATEMENT_TYPES]
+        statement_type_filter = ", ".join(f"'{typ}'" for typ in allowed_types)
+
+        query = f"""
+            SELECT
+                statement_id,
+                statement_text,
+                statement_type,
+                start_time,
+                end_time,
+                executed_by,
+                executed_as,
+                executed_by_user_id,
+                executed_as_user_id
+            FROM system.query.history
+            WHERE
+                start_time >= %s
+                AND end_time <= %s
+                AND execution_status = 'FINISHED'
+                AND statement_type IN ({statement_type_filter})
+            ORDER BY start_time
+        """
+
+        try:
+            rows = self._execute_sql_query(query, (start_time, end_time))
+            for row in rows:
+                try:
+                    yield Query(
+                        query_id=row.statement_id,
+                        query_text=row.statement_text,
+                        statement_type=(
+                            QueryStatementType(row.statement_type)
+                            if row.statement_type
+                            else None
+                        ),
+                        start_time=row.start_time,
+                        end_time=row.end_time,
+                        user_id=row.executed_by_user_id,
+                        user_name=row.executed_by,
+                        executed_as_user_id=row.executed_as_user_id,
+                        executed_as_user_name=row.executed_as,
+                    )
+                except Exception as e:
+                    logger.warning(f"Error parsing query from system table: {e}")
+                    self.report.report_warning("query-parse-system-table", str(e))
+        except Exception as e:
+            logger.error(
+                f"Error fetching query history from system tables: {e}", exc_info=True
+            )
+            self.report.report_failure(
+                title="Failed to fetch query history from system tables",
+                message="Error querying system.query.history table",
+                context=f"Query period: {start_time} to {end_time}",
+            )
+
+    def _build_datetime_where_conditions(
+        self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
+    ) -> str:
+        """Build datetime filtering conditions for lineage queries."""
+        conditions = []
+        if start_time:
+            conditions.append(f"event_time >= '{start_time.isoformat()}'")
+        if end_time:
+            conditions.append(f"event_time <= '{end_time.isoformat()}'")
+        return " AND " + " AND ".join(conditions) if conditions else ""
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_table_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[TableLineageInfo]:
+        """Get table lineage for all tables in a catalog using system tables."""
+        logger.info(f"Fetching table lineage for catalog: {catalog}")
+        try:
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
+                SELECT
+                    entity_type, entity_id,
+                    source_table_full_name, source_type, source_path,
+                    target_table_full_name, target_type,
+                    max(event_time) as last_updated
+                FROM system.access.table_lineage
+                WHERE
+                    (target_table_catalog = %s or source_table_catalog = %s)
+                    {additional_where}
+                GROUP BY
+                    entity_type, entity_id,
+                    source_table_full_name, source_type, source_path,
+                    target_table_full_name, target_type
+            """
+            rows = self._execute_sql_query(query, [catalog, catalog])
+
+            result_dict: FileBackedDict[TableLineageInfo] = FileBackedDict()
+            for row in rows:
+                entity_type = row["entity_type"]
+                entity_id = row["entity_id"]
+                source_full_name = row["source_table_full_name"]
+                target_full_name = row["target_table_full_name"]
+                source_type = row["source_type"]
+                source_path = row["source_path"]
+                last_updated = row["last_updated"]
+
+                # Initialize TableLineageInfo for both source and target tables if they're in our catalog
+                for table_name in [source_full_name, target_full_name]:
+                    if (
+                        table_name
+                        and table_name.startswith(f"{catalog}.")
+                        and table_name not in result_dict
+                    ):
+                        result_dict[table_name] = TableLineageInfo()
+
+                # Process upstream relationships (target table gets upstreams)
+                if target_full_name and target_full_name.startswith(f"{catalog}."):
+                    # Handle table upstreams
+                    if (
+                        source_type in ["TABLE", "VIEW"]
+                        and source_full_name != target_full_name
+                    ):
+                        upstream = TableUpstream(
+                            table_name=source_full_name,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstreams.append(upstream)
+
+                    # Handle external upstreams (PATH type)
+                    elif source_type == "PATH":
+                        external_upstream = ExternalUpstream(
+                            path=source_path,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].external_upstreams.append(
+                            external_upstream
+                        )
+
+                    # Handle upstream notebooks (notebook -> table)
+                    elif entity_type == "NOTEBOOK":
+                        notebook_ref = NotebookReference(
+                            id=entity_id,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstream_notebooks.append(
+                            notebook_ref
+                        )
+
+                # Process downstream relationships (source table gets downstream notebooks)
+                if (
+                    entity_type == "NOTEBOOK"
+                    and source_full_name
+                    and source_full_name.startswith(f"{catalog}.")
+                ):
+                    notebook_ref = NotebookReference(
+                        id=entity_id,
+                        last_updated=last_updated,
+                    )
+                    result_dict[source_full_name].downstream_notebooks.append(
+                        notebook_ref
+                    )
+
+            return result_dict
+        except Exception as e:
+            logger.warning(
+                f"Error getting table lineage for catalog {catalog}: {e}",
+                exc_info=True,
+            )
+            return FileBackedDict()
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_column_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[Dict[str, dict]]:
+        """Get column lineage for all tables in a catalog using system tables."""
+        logger.info(f"Fetching column lineage for catalog: {catalog}")
+        try:
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
+                SELECT
+                    source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
+                    target_table_schema, target_table_name, target_column_name,
+                    max(event_time) as last_updated
+                FROM system.access.column_lineage
+                WHERE
+                    target_table_catalog = %s
+                    AND target_table_schema IS NOT NULL
+                    AND target_table_name IS NOT NULL
+                    AND target_column_name IS NOT NULL
+                    AND source_table_catalog IS NOT NULL
+                    AND source_table_schema IS NOT NULL
+                    AND source_table_name IS NOT NULL
+                    AND source_column_name IS NOT NULL
+                    {additional_where}
+                GROUP BY
+                    source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
+                    target_table_schema, target_table_name, target_column_name
+            """
+            rows = self._execute_sql_query(query, [catalog])
+
+            result_dict: FileBackedDict[Dict[str, dict]] = FileBackedDict()
+            for row in rows:
+                result_dict.setdefault(row["target_table_schema"], {}).setdefault(
+                    row["target_table_name"], {}
+                ).setdefault(row["target_column_name"], []).append(
+                    # make fields look like the response from the older HTTP API
+                    {
+                        "catalog_name": row["source_table_catalog"],
+                        "schema_name": row["source_table_schema"],
+                        "table_name": row["source_table_name"],
+                        "name": row["source_column_name"],
+                        "last_updated": row["last_updated"],
+                    }
+                )
+
+            return result_dict
+        except Exception as e:
+            logger.warning(
+                f"Error getting column lineage for catalog {catalog}: {e}",
+                exc_info=True,
+            )
+            return FileBackedDict()
+
+    def list_lineages_by_table_via_http_api(
         self, table_name: str, include_entity_lineage: bool
     ) -> dict:
         """List table lineage by table name."""
+        logger.debug(f"Getting table lineage for {table_name}")
         return self._workspace_client.api_client.do( # type: ignore
             method="GET",
             path="/api/2.0/lineage-tracking/table-lineage",
@@ -293,67 +665,226 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             },
         )
 
-    def list_lineages_by_column(self, table_name: str, column_name: str) -> dict:
+    def list_lineages_by_column_via_http_api(
+        self, table_name: str, column_name: str
+    ) -> list:
         """List column lineage by table name and column name."""
-        return self._workspace_client.api_client.do( # type: ignore
-            "GET",
-            "/api/2.0/lineage-tracking/column-lineage",
-            body={"table_name": table_name, "column_name": column_name},
-        )
+        logger.debug(f"Getting column lineage for {table_name}.{column_name}")
+        try:
+            return (
+                self._workspace_client.api_client.do( # type: ignore
+                    "GET",
+                    "/api/2.0/lineage-tracking/column-lineage",
+                    body={"table_name": table_name, "column_name": column_name},
+                ).get("upstream_cols")
+                or []
+            )
+        except Exception as e:
+            logger.warning(
+                f"Error getting column lineage on table {table_name}, column {column_name}: {e}",
+                exc_info=True,
+            )
+            return []
 
-    def table_lineage(self, table: Table, include_entity_lineage: bool) -> None:
+    def table_lineage(
+        self,
+        table: Table,
+        include_entity_lineage: bool,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
         if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG:
             # Lineage is not available for Hive Metastore Tables.
             return None
-        # Lineage endpoint doesn't exists on 2.1 version
-        try:
-            response: dict = self.list_lineages_by_table(
-                table_name=table.ref.qualified_table_name,
-                include_entity_lineage=include_entity_lineage,
-            )
 
-            for item in response.get("upstreams") or []:
-                if "tableInfo" in item:
-                    table_ref = TableReference.create_from_lineage(
-                        item["tableInfo"], table.schema.catalog.metastore
-                    )
-                    if table_ref:
-                        table.upstreams[table_ref] = {}
-                elif "fileInfo" in item:
-                    external_ref = ExternalTableReference.create_from_lineage(
-                        item["fileInfo"]
-                    )
-                    if external_ref:
-                        table.external_upstreams.add(external_ref)
-
-                for notebook in item.get("notebookInfos") or []:
-                    table.upstream_notebooks.add(notebook["notebook_id"])
+        try:
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)
 
-            for item in response.get("downstreams") or []:
-                for notebook in item.get("notebookInfos") or []:
-                    table.downstream_notebooks.add(notebook["notebook_id"])
+            if use_system_tables:
+                self._process_system_table_lineage(table, start_time, end_time)
+            else:
+                self._process_table_lineage_via_http_api(table, include_entity_lineage)
         except Exception as e:
            logger.warning(
                f"Error getting lineage on table {table.ref}: {e}", exc_info=True
            )
 
-    def get_column_lineage(self, table: Table, column_name: str) -> None:
-        try:
-            response: dict = self.list_lineages_by_column(
-                table_name=table.ref.qualified_table_name,
-                column_name=column_name,
+    def _process_system_table_lineage(
+        self,
+        table: Table,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
+        """Process table lineage using system.access.table_lineage table."""
+        catalog_lineage = self.get_catalog_table_lineage_via_system_tables(
+            table.ref.catalog, start_time, end_time
+        )
+        table_full_name = table.ref.qualified_table_name
+
+        lineage_info = catalog_lineage.get(table_full_name, TableLineageInfo())
+
+        # Process table upstreams
+        for upstream in lineage_info.upstreams:
+            upstream_table_name = upstream.table_name
+            # Parse catalog.schema.table format
+            parts = upstream_table_name.split(".")
+            if len(parts) == 3:
+                catalog_name, schema_name, table_name = parts[0], parts[1], parts[2]
+                table_ref = TableReference(
+                    metastore=table.schema.catalog.metastore.id
+                    if table.schema.catalog.metastore
+                    else None,
+                    catalog=catalog_name,
+                    schema=schema_name,
+                    table=table_name,
+                    last_updated=upstream.last_updated,
+                )
+                table.upstreams[table_ref] = {}
+            else:
+                logger.warning(
+                    f"Unexpected upstream table format: {upstream_table_name} for table {table_full_name}"
+                )
+                continue
+
+        # Process external upstreams
+        for external_upstream in lineage_info.external_upstreams:
+            external_ref = ExternalTableReference(
+                path=external_upstream.path,
+                has_permission=True,
+                name=None,
+                type=None,
+                storage_location=external_upstream.path,
+                last_updated=external_upstream.last_updated,
             )
-            for item in response.get("upstream_cols") or []:
+            table.external_upstreams.add(external_ref)
+
+        # Process upstream notebook lineage
+        for notebook_ref in lineage_info.upstream_notebooks:
+            existing_ref = table.upstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        # Process downstream notebook lineage
+        for notebook_ref in lineage_info.downstream_notebooks:
+            existing_ref = table.downstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
+    def _process_table_lineage_via_http_api(
+        self, table: Table, include_entity_lineage: bool
+    ) -> None:
+        """Process table lineage using the HTTP API (legacy fallback)."""
+        response: dict = self.list_lineages_by_table_via_http_api(
+            table_name=table.ref.qualified_table_name,
+            include_entity_lineage=include_entity_lineage,
+        )
+
+        for item in response.get("upstreams") or []:
+            if "tableInfo" in item:
                 table_ref = TableReference.create_from_lineage(
-                    item, table.schema.catalog.metastore
+                    item["tableInfo"], table.schema.catalog.metastore
                 )
                 if table_ref:
-                    table.upstreams.setdefault(table_ref, {}).setdefault(
-                        column_name, []
-                    ).append(item["name"])
+                    table.upstreams[table_ref] = {}
+            elif "fileInfo" in item:
+                external_ref = ExternalTableReference.create_from_lineage(
+                    item["fileInfo"]
+                )
+                if external_ref:
+                    table.external_upstreams.add(external_ref)
+
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        for item in response.get("downstreams") or []:
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
+    def get_column_lineage(
+        self,
+        table: Table,
+        column_names: List[str],
+        *,
+        max_workers: Optional[int] = None,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
+        try:
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)
+
+            if use_system_tables:
+                lineage = (
+                    self.get_catalog_column_lineage_via_system_tables(
+                        table.ref.catalog, start_time, end_time
+                    )
+                    .get(table.ref.schema, {})
+                    .get(table.ref.table, {})
+                )
+            else:
+                with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    futures = [
+                        executor.submit(
+                            self.list_lineages_by_column_via_http_api,
+                            table.ref.qualified_table_name,
+                            column_name,
+                        )
+                        for column_name in column_names
+                    ]
+                    lineage = {
+                        column_name: future.result()
+                        for column_name, future in zip(column_names, futures)
+                    }
+
+            for column_name in column_names:
+                for item in lineage.get(column_name) or []:
+                    table_ref = TableReference.create_from_lineage(
+                        item,
+                        table.schema.catalog.metastore,
+                    )
+                    if table_ref:
+                        table.upstreams.setdefault(table_ref, {}).setdefault(
+                            column_name, []
+                        ).append(item["name"])
+
         except Exception as e:
             logger.warning(
-                f"Error getting column lineage on table {table.ref}, column {column_name}: {e}",
+                f"Error getting column lineage on table {table.ref}: {e}",
                 exc_info=True,
             )
 
@@ -461,6 +992,45 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             if optional_column:
                 yield optional_column
 
+    def _create_ml_model(
+        self, schema: Schema, obj: RegisteredModelInfo
+    ) -> Optional[Model]:
+        if not obj.name or not obj.full_name:
+            self.report.num_ml_models_missing_name += 1
+            return None
+        return Model(
+            id=obj.full_name,
+            name=obj.name,
+            description=obj.comment,
+            schema_name=schema.name,
+            catalog_name=schema.catalog.name,
+            created_at=parse_ts_millis(obj.created_at),
+            updated_at=parse_ts_millis(obj.updated_at),
+        )
+
+    def _create_ml_model_version(
+        self, model: Model, obj: ModelVersionInfo
+    ) -> Optional[ModelVersion]:
+        if obj.version is None:
+            return None
+
+        aliases = []
+        if obj.aliases:
+            for alias in obj.aliases:
+                if alias.alias_name:
+                    aliases.append(alias.alias_name)
+        return ModelVersion(
+            id=f"{model.id}_{obj.version}",
+            name=f"{model.name}_{obj.version}",
+            model=model,
+            version=str(obj.version),
+            aliases=aliases,
+            description=obj.comment,
+            created_at=parse_ts_millis(obj.created_at),
+            updated_at=parse_ts_millis(obj.updated_at),
+            created_by=obj.created_by,
+        )
+
     def _create_service_principal(
         self, obj: DatabricksServicePrincipal
     ) -> Optional[ServicePrincipal]:
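
When the HTTP API path is taken, the reworked get_column_lineage (two hunks above) fans out one request per column with a ThreadPoolExecutor and zips the resulting futures back to the column names. A generic, self-contained sketch of that fan-out pattern; fetch_upstream_cols is a placeholder, not the real lineage call:

    from concurrent.futures import ThreadPoolExecutor

    def fetch_upstream_cols(table_name: str, column_name: str) -> list:
        # Placeholder standing in for list_lineages_by_column_via_http_api.
        return []

    column_names = ["id", "amount", "created_at"]
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Submit one task per column, then map results back by position.
        futures = [
            executor.submit(fetch_upstream_cols, "main.raw.orders", col)
            for col in column_names
        ]
        lineage = {col: future.result() for col, future in zip(column_names, futures)}
    print(lineage)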
@@ -492,3 +1062,176 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             executed_as_user_id=info.executed_as_user_id,
             executed_as_user_name=info.executed_as_user_name,
         )
+
+    def _execute_sql_query(self, query: str, params: Sequence[Any] = ()) -> List[Row]:
+        """Execute SQL query using databricks-sql connector for better performance"""
+        logger.debug(f"Executing SQL query with {len(params)} parameters")
+        if logger.isEnabledFor(logging.DEBUG):
+            # Only log full query in debug mode to avoid performance overhead
+            logger.debug(f"Full SQL query: {query}")
+            if params:
+                logger.debug(f"Query parameters: {params}")
+
+        # Check if warehouse_id is available for SQL operations
+        if not self.warehouse_id:
+            self.report.report_warning(
+                "Cannot execute SQL query",
+                "warehouse_id is not configured. SQL operations require a valid warehouse_id to be set in the Unity Catalog configuration",
+            )
+            logger.warning(
+                "Cannot execute SQL query: warehouse_id is not configured. "
+                "SQL operations require a valid warehouse_id to be set in the Unity Catalog configuration."
+            )
+            return []
+
+        # Log connection parameters (with masked token)
+        masked_params = {**self._sql_connection_params}
+        if "access_token" in masked_params:
+            masked_params["access_token"] = "***MASKED***"
+        logger.debug(f"Using connection parameters: {masked_params}")
+
+        # Log proxy environment variables that affect SQL connections
+        proxy_env_debug = {}
+        for var in ["HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy"]:
+            value = os.environ.get(var)
+            if value:
+                proxy_env_debug[var] = mask_proxy_credentials(value)
+
+        if proxy_env_debug:
+            logger.debug(
+                f"SQL connection will use proxy environment variables: {proxy_env_debug}"
+            )
+        else:
+            logger.debug("No proxy environment variables detected for SQL connection")
+
+        try:
+            with (
+                connect(**self._sql_connection_params) as connection,
+                connection.cursor() as cursor,
+            ):
+                cursor.execute(query, list(params))
+                rows = cursor.fetchall()
+                logger.debug(
+                    f"SQL query executed successfully, returned {len(rows)} rows"
+                )
+                return rows
+
+        except Exception as e:
+            logger.warning(f"Failed to execute SQL query: {e}", exc_info=True)
+            if logger.isEnabledFor(logging.DEBUG):
+                # Only log failed query details in debug mode for security
+                logger.debug(f"SQL query that failed: {query}")
+                logger.debug(f"SQL query parameters: {params}")
+
+            # Check if this might be a proxy-related error
+            error_str = str(e).lower()
+            if any(
+                proxy_keyword in error_str
+                for proxy_keyword in [
+                    "proxy",
+                    "407",
+                    "authentication required",
+                    "tunnel",
+                    "connect",
+                ]
+            ):
+                logger.error(
+                    "SQL query failure appears to be proxy-related. "
+                    "Please check proxy configuration and authentication. "
+                    f"Proxy environment variables detected: {list(proxy_env_debug.keys())}"
+                )
+
+            return []
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching schema tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.schema_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, tag_name, tag_value = row
+            schema_key = f"{catalog_name}.{schema_name}"
+
+            if schema_key not in result_dict:
+                result_dict[schema_key] = []
+
+            result_dict[schema_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.catalog_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, tag_name, tag_value = row
+
+            if catalog_name not in result_dict:
+                result_dict[catalog_name] = []
+
+            result_dict[catalog_name].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.table_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, tag_name, tag_value = row
+            table_key = f"{catalog_name}.{schema_name}.{table_name}"
+
+            if table_key not in result_dict:
+                result_dict[table_key] = []
+
+            result_dict[table_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching column tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.column_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, column_name, tag_name, tag_value = (
+                row
+            )
+            column_key = f"{catalog_name}.{schema_name}.{table_name}.{column_name}"
+
+            if column_key not in result_dict:
+                result_dict[column_key] = []
+
+            result_dict[column_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
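
The tag helpers added above key their results by fully qualified names: the catalog name, "catalog.schema", "catalog.schema.table", or "catalog.schema.table.column". A small sketch of a lookup against the table-level mapping, with plain dicts standing in for the UnityCatalogTag values:

    # Illustrative stand-in for the Dict[str, List[UnityCatalogTag]] returned by get_table_tags.
    table_tags = {
        "main.raw.orders": [
            {"key": "pii", "value": "false"},       # UnityCatalogTag(key="pii", value="false")
            {"key": "owner", "value": "data-eng"},  # UnityCatalogTag(key="owner", value="data-eng")
        ]
    }

    for tag in table_tags.get("main.raw.orders", []):
        print(f"{tag['key']} = {tag['value']}")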