acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -70,3 +70,30 @@ class HasStructuredPropertiesPatch(MetadataPatchProposal):
70
70
  ),
71
71
  )
72
72
  return self
73
+
74
+ def set_structured_property_manual(
75
+ self, property: StructuredPropertyValueAssignmentClass
76
+ ) -> Self:
77
+ """Add or update a structured property, using a StructuredPropertyValueAssignmentClass object."""
78
+
79
+ self.remove_structured_property(property.propertyUrn)
80
+ self._add_patch(
81
+ StructuredPropertiesClass.ASPECT_NAME,
82
+ "add",
83
+ path=("properties", property.propertyUrn),
84
+ value=property,
85
+ )
86
+ return self
87
+
88
+ def add_structured_property_manual(
89
+ self, property: StructuredPropertyValueAssignmentClass
90
+ ) -> Self:
91
+ """Add a structured property, using a StructuredPropertyValueAssignmentClass object."""
92
+
93
+ self._add_patch(
94
+ StructuredPropertiesClass.ASPECT_NAME,
95
+ "add",
96
+ path=("properties", property.propertyUrn),
97
+ value=property,
98
+ )
99
+ return self
datahub/specific/chart.py CHANGED
@@ -77,7 +77,7 @@ class ChartPatchBuilder(
77
77
  ChartInfo.ASPECT_NAME,
78
78
  "add",
79
79
  path=("inputEdges", input_urn),
80
- value=input_urn,
80
+ value=input_edge,
81
81
  )
82
82
  return self
83
83
 
@@ -1,15 +1,19 @@
1
- from typing import List, Optional, Tuple, Union
1
+ from typing import List, Optional, Set, Tuple, Union
2
2
 
3
3
  from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
4
4
  from datahub.metadata.schema_classes import (
5
5
  DataJobInfoClass as DataJobInfo,
6
6
  DataJobInputOutputClass as DataJobInputOutput,
7
7
  EdgeClass as Edge,
8
+ FineGrainedLineageClass as FineGrainedLineage,
8
9
  KafkaAuditHeaderClass,
9
10
  SystemMetadataClass,
10
11
  )
11
12
  from datahub.metadata.urns import SchemaFieldUrn, Urn
12
13
  from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
14
+ from datahub.specific.aspect_helpers.fine_grained_lineage import (
15
+ HasFineGrainedLineagePatch,
16
+ )
13
17
  from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
14
18
  from datahub.specific.aspect_helpers.tags import HasTagsPatch
15
19
  from datahub.specific.aspect_helpers.terms import HasTermsPatch
@@ -20,6 +24,7 @@ class DataJobPatchBuilder(
20
24
  HasCustomPropertiesPatch,
21
25
  HasTagsPatch,
22
26
  HasTermsPatch,
27
+ HasFineGrainedLineagePatch,
23
28
  MetadataPatchProposal,
24
29
  ):
25
30
  def __init__(
@@ -40,10 +45,19 @@ class DataJobPatchBuilder(
40
45
  urn, system_metadata=system_metadata, audit_header=audit_header
41
46
  )
42
47
 
48
+ # Track fine-grained lineages for DataJob-specific handling
49
+ self._fine_grained_lineages_to_add: List[FineGrainedLineage] = []
50
+ self._fine_grained_lineage_keys_to_remove: Set[Tuple[str, str, str]] = set()
51
+ self._fine_grained_lineages_set: Optional[List[FineGrainedLineage]] = None
52
+
43
53
  @classmethod
44
54
  def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
45
55
  return DataJobInfo.ASPECT_NAME, ("customProperties",)
46
56
 
57
+ @classmethod
58
+ def _fine_grained_lineage_location(cls) -> Tuple[str, PatchPath]:
59
+ return DataJobInputOutput.ASPECT_NAME, ("fineGrainedLineages",)
60
+
47
61
  def add_input_datajob(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilder":
48
62
  """
49
63
  Adds an input data job to the DataJobPatchBuilder.
@@ -9,6 +9,9 @@ from datahub.metadata.schema_classes import (
9
9
  )
10
10
  from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
11
11
  from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
12
+ from datahub.specific.aspect_helpers.structured_properties import (
13
+ HasStructuredPropertiesPatch,
14
+ )
12
15
  from datahub.specific.aspect_helpers.tags import HasTagsPatch
13
16
  from datahub.specific.aspect_helpers.terms import HasTermsPatch
14
17
 
@@ -16,6 +19,7 @@ from datahub.specific.aspect_helpers.terms import HasTermsPatch
16
19
  class DataProductPatchBuilder(
17
20
  HasOwnershipPatch,
18
21
  HasCustomPropertiesPatch,
22
+ HasStructuredPropertiesPatch,
19
23
  HasTagsPatch,
20
24
  HasTermsPatch,
21
25
  MetadataPatchProposal,
@@ -1,3 +1,4 @@
1
+ import warnings
1
2
  from typing import Generic, List, Optional, Tuple, TypeVar, Union
2
3
 
3
4
  from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
@@ -17,7 +18,11 @@ from datahub.metadata.schema_classes import (
17
18
  )
18
19
  from datahub.metadata.urns import DatasetUrn, TagUrn, Urn
19
20
  from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
21
+ from datahub.specific.aspect_helpers.fine_grained_lineage import (
22
+ HasFineGrainedLineagePatch,
23
+ )
20
24
  from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
25
+ from datahub.specific.aspect_helpers.siblings import HasSiblingsPatch
21
26
  from datahub.specific.aspect_helpers.structured_properties import (
22
27
  HasStructuredPropertiesPatch,
23
28
  )
@@ -99,6 +104,8 @@ class DatasetPatchBuilder(
99
104
  HasStructuredPropertiesPatch,
100
105
  HasTagsPatch,
101
106
  HasTermsPatch,
107
+ HasFineGrainedLineagePatch,
108
+ HasSiblingsPatch,
102
109
  MetadataPatchProposal,
103
110
  ):
104
111
  def __init__(
@@ -115,6 +122,10 @@ class DatasetPatchBuilder(
115
122
  def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
116
123
  return DatasetProperties.ASPECT_NAME, ("customProperties",)
117
124
 
125
+ @classmethod
126
+ def _fine_grained_lineage_location(cls) -> Tuple[str, PatchPath]:
127
+ return UpstreamLineage.ASPECT_NAME, ("fineGrainedLineages",)
128
+
118
129
  def add_upstream_lineage(self, upstream: Upstream) -> "DatasetPatchBuilder":
119
130
  self._add_patch(
120
131
  UpstreamLineage.ASPECT_NAME,
@@ -144,75 +155,44 @@ class DatasetPatchBuilder(
144
155
  def add_fine_grained_upstream_lineage(
145
156
  self, fine_grained_lineage: FineGrainedLineage
146
157
  ) -> "DatasetPatchBuilder":
147
- (
148
- transform_op,
149
- downstream_urn,
150
- query_id,
151
- ) = DatasetPatchBuilder.get_fine_grained_key(fine_grained_lineage)
152
- for upstream_urn in fine_grained_lineage.upstreams or []:
153
- self._add_patch(
154
- UpstreamLineage.ASPECT_NAME,
155
- "add",
156
- path=self._build_fine_grained_path(
157
- transform_op, downstream_urn, query_id, upstream_urn
158
- ),
159
- value={"confidenceScore": fine_grained_lineage.confidenceScore},
160
- )
161
- return self
162
-
163
- @staticmethod
164
- def get_fine_grained_key(
165
- fine_grained_lineage: FineGrainedLineage,
166
- ) -> Tuple[str, str, str]:
167
- downstreams = fine_grained_lineage.downstreams or []
168
- if len(downstreams) != 1:
169
- raise TypeError("Cannot patch with more or less than one downstream.")
170
- transform_op = fine_grained_lineage.transformOperation or "NONE"
171
- downstream_urn = downstreams[0]
172
- query_id = fine_grained_lineage.query or "NONE"
173
- return transform_op, downstream_urn, query_id
174
-
175
- @classmethod
176
- def _build_fine_grained_path(
177
- cls, transform_op: str, downstream_urn: str, query_id: str, upstream_urn: str
178
- ) -> PatchPath:
179
- return (
180
- "fineGrainedLineages",
181
- transform_op,
182
- downstream_urn,
183
- query_id,
184
- upstream_urn,
158
+ """
159
+ Deprecated: Use `add_fine_grained_lineage` instead.
160
+ """
161
+ warnings.warn(
162
+ "add_fine_grained_upstream_lineage() is deprecated."
163
+ " Use add_fine_grained_lineage() instead.",
164
+ DeprecationWarning,
165
+ stacklevel=2,
185
166
  )
167
+ return self.add_fine_grained_lineage(fine_grained_lineage)
186
168
 
187
169
  def remove_fine_grained_upstream_lineage(
188
170
  self, fine_grained_lineage: FineGrainedLineage
189
171
  ) -> "DatasetPatchBuilder":
190
- (
191
- transform_op,
192
- downstream_urn,
193
- query_id,
194
- ) = DatasetPatchBuilder.get_fine_grained_key(fine_grained_lineage)
195
- for upstream_urn in fine_grained_lineage.upstreams or []:
196
- self._add_patch(
197
- UpstreamLineage.ASPECT_NAME,
198
- "remove",
199
- path=self._build_fine_grained_path(
200
- transform_op, downstream_urn, query_id, upstream_urn
201
- ),
202
- value={},
203
- )
204
- return self
172
+ """
173
+ Deprecated: Use `remove_fine_grained_lineage` instead.
174
+ """
175
+ warnings.warn(
176
+ "remove_fine_grained_upstream_lineage() is deprecated."
177
+ " Use remove_fine_grained_lineage() instead.",
178
+ DeprecationWarning,
179
+ stacklevel=2,
180
+ )
181
+ return self.remove_fine_grained_lineage(fine_grained_lineage)
205
182
 
206
183
  def set_fine_grained_upstream_lineages(
207
184
  self, fine_grained_lineages: List[FineGrainedLineage]
208
185
  ) -> "DatasetPatchBuilder":
209
- self._add_patch(
210
- UpstreamLineage.ASPECT_NAME,
211
- "add",
212
- path=("fineGrainedLineages",),
213
- value=fine_grained_lineages,
186
+ """
187
+ Deprecated: Use `set_fine_grained_lineages` instead.
188
+ """
189
+ warnings.warn(
190
+ "set_fine_grained_upstream_lineages() is deprecated."
191
+ " Use set_fine_grained_lineages() instead.",
192
+ DeprecationWarning,
193
+ stacklevel=2,
214
194
  )
215
- return self
195
+ return self.set_fine_grained_lineages(fine_grained_lineages)
216
196
 
217
197
  def for_field(
218
198
  self, field_path: str, editable: bool = True
@@ -52,6 +52,7 @@ class ParserState(Enum):
52
52
  STRING = 2
53
53
  COMMENT = 3
54
54
  MULTILINE_COMMENT = 4
55
+ BRACKETED_IDENTIFIER = 5
55
56
 
56
57
 
57
58
  class _StatementSplitter:
@@ -141,6 +142,10 @@ class _StatementSplitter:
141
142
  self.state = ParserState.STRING
142
143
  self.current_statement.append(c)
143
144
  prev_real_char = c
145
+ elif c == "[":
146
+ self.state = ParserState.BRACKETED_IDENTIFIER
147
+ self.current_statement.append(c)
148
+ prev_real_char = c
144
149
  elif c == "-" and next_char == "-":
145
150
  self.state = ParserState.COMMENT
146
151
  self.current_statement.append(c)
@@ -172,6 +177,14 @@ class _StatementSplitter:
172
177
  elif c == "'":
173
178
  self.state = ParserState.NORMAL
174
179
 
180
+ elif self.state == ParserState.BRACKETED_IDENTIFIER:
181
+ self.current_statement.append(c)
182
+ if c == "]" and next_char == "]":
183
+ self.current_statement.append(next_char)
184
+ self.i += 1
185
+ elif c == "]":
186
+ self.state = ParserState.NORMAL
187
+
175
188
  elif self.state == ParserState.COMMENT:
176
189
  self.current_statement.append(c)
177
190
  if c == "\n":
@@ -4,7 +4,6 @@ import enum
4
4
  import functools
5
5
  import json
6
6
  import logging
7
- import os
8
7
  import pathlib
9
8
  import tempfile
10
9
  import uuid
@@ -14,10 +13,10 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Union, cast
14
13
 
15
14
  import datahub.emitter.mce_builder as builder
16
15
  import datahub.metadata.schema_classes as models
16
+ from datahub.configuration.env_vars import get_sql_agg_query_log
17
17
  from datahub.configuration.time_window_config import get_time_bucket
18
18
  from datahub.emitter.mce_builder import get_sys_time, make_ts_millis
19
19
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
20
- from datahub.emitter.sql_parsing_builder import compute_upstream_fields
21
20
  from datahub.ingestion.api.closeable import Closeable
22
21
  from datahub.ingestion.api.report import Report
23
22
  from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -49,6 +48,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
49
48
  sqlglot_lineage,
50
49
  )
51
50
  from datahub.sql_parsing.sqlglot_utils import (
51
+ DialectOrStr,
52
52
  _parse_statement,
53
53
  get_query_fingerprint,
54
54
  try_format_query,
@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
58
58
  ToolMetaExtractorReport,
59
59
  )
60
60
  from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
61
+ from datahub.utilities.dedup_list import deduplicate_list
61
62
  from datahub.utilities.file_backed_collections import (
62
63
  ConnectionWrapper,
63
64
  FileBackedDict,
@@ -82,7 +83,7 @@ class QueryLogSetting(enum.Enum):
82
83
  _DEFAULT_USER_URN = CorpUserUrn("_ingestion")
83
84
  _MISSING_SESSION_ID = "__MISSING_SESSION_ID"
84
85
  _DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[
85
- os.getenv("DATAHUB_SQL_AGG_QUERY_LOG") or QueryLogSetting.DISABLED.name
86
+ get_sql_agg_query_log() or QueryLogSetting.DISABLED.name
86
87
  ]
87
88
  MAX_UPSTREAM_TABLES_COUNT = 300
88
89
  MAX_FINEGRAINEDLINEAGE_COUNT = 2000
@@ -108,6 +109,7 @@ class ObservedQuery:
108
109
  default_schema: Optional[str] = None
109
110
  query_hash: Optional[str] = None
110
111
  usage_multiplier: int = 1
112
+ override_dialect: Optional[DialectOrStr] = None
111
113
 
112
114
  # Use this to store additional key-value information about the query for debugging.
113
115
  extra_info: Optional[dict] = None
@@ -140,6 +142,7 @@ class QueryMetadata:
140
142
 
141
143
  used_temp_tables: bool = True
142
144
 
145
+ extra_info: Optional[dict] = None
143
146
  origin: Optional[Urn] = None
144
147
 
145
148
  def make_created_audit_stamp(self) -> models.AuditStampClass:
@@ -188,6 +191,7 @@ class QueryMetadata:
188
191
  source=models.QuerySourceClass.SYSTEM,
189
192
  created=self.make_created_audit_stamp(),
190
193
  lastModified=self.make_last_modified_audit_stamp(),
194
+ origin=self.origin.urn() if self.origin else None,
191
195
  )
192
196
 
193
197
 
@@ -263,7 +267,7 @@ class PreparsedQuery:
263
267
  query_type_props: QueryTypeProps = dataclasses.field(
264
268
  default_factory=lambda: QueryTypeProps()
265
269
  )
266
- # Use this to store addtitional key-value information about query for debugging
270
+ # Use this to store additional key-value information about the query for debugging.
267
271
  extra_info: Optional[dict] = None
268
272
  origin: Optional[Urn] = None
269
273
 
@@ -629,6 +633,9 @@ class SqlParsingAggregator(Closeable):
629
633
  TableSwap,
630
634
  ],
631
635
  ) -> None:
636
+ """
637
+ This assumes that queries come in order of increasing timestamps.
638
+ """
632
639
  if isinstance(item, KnownQueryLineageInfo):
633
640
  self.add_known_query_lineage(item)
634
641
  elif isinstance(item, KnownLineageMapping):
@@ -831,6 +838,7 @@ class SqlParsingAggregator(Closeable):
831
838
  session_id=session_id,
832
839
  timestamp=observed.timestamp,
833
840
  user=observed.user,
841
+ override_dialect=observed.override_dialect,
834
842
  )
835
843
  if parsed.debug_info.error:
836
844
  self.report.observed_query_parse_failures.append(
@@ -859,7 +867,7 @@ class SqlParsingAggregator(Closeable):
859
867
  downstream=parsed.out_tables[0] if parsed.out_tables else None,
860
868
  column_lineage=parsed.column_lineage,
861
869
  # TODO: We need a full list of columns referenced, not just the out tables.
862
- column_usage=compute_upstream_fields(parsed),
870
+ column_usage=self._compute_upstream_fields(parsed),
863
871
  inferred_schema=infer_output_schema(parsed),
864
872
  confidence_score=parsed.debug_info.confidence,
865
873
  extra_info=observed.extra_info,
@@ -948,6 +956,7 @@ class SqlParsingAggregator(Closeable):
948
956
  column_usage=parsed.column_usage or {},
949
957
  confidence_score=parsed.confidence_score,
950
958
  used_temp_tables=session_has_temp_tables,
959
+ extra_info=parsed.extra_info,
951
960
  origin=parsed.origin,
952
961
  )
953
962
  )
@@ -1147,7 +1156,7 @@ class SqlParsingAggregator(Closeable):
1147
1156
  actor=None,
1148
1157
  upstreams=parsed.in_tables,
1149
1158
  column_lineage=parsed.column_lineage or [],
1150
- column_usage=compute_upstream_fields(parsed),
1159
+ column_usage=self._compute_upstream_fields(parsed),
1151
1160
  confidence_score=parsed.debug_info.confidence,
1152
1161
  )
1153
1162
  )
@@ -1164,6 +1173,7 @@ class SqlParsingAggregator(Closeable):
1164
1173
  session_id: str = _MISSING_SESSION_ID,
1165
1174
  timestamp: Optional[datetime] = None,
1166
1175
  user: Optional[Union[CorpUserUrn, CorpGroupUrn]] = None,
1176
+ override_dialect: Optional[DialectOrStr] = None,
1167
1177
  ) -> SqlParsingResult:
1168
1178
  with self.report.sql_parsing_timer:
1169
1179
  parsed = sqlglot_lineage(
@@ -1171,6 +1181,7 @@ class SqlParsingAggregator(Closeable):
1171
1181
  schema_resolver=schema_resolver,
1172
1182
  default_db=default_db,
1173
1183
  default_schema=default_schema,
1184
+ override_dialect=override_dialect,
1174
1185
  )
1175
1186
  self.report.num_sql_parsed += 1
1176
1187
 
@@ -1329,11 +1340,25 @@ class SqlParsingAggregator(Closeable):
1329
1340
  upstreams.setdefault(upstream, query.query_id)
1330
1341
 
1331
1342
  for lineage_info in query.column_lineage:
1332
- for upstream_ref in lineage_info.upstreams:
1333
- cll[lineage_info.downstream.column].setdefault(
1334
- SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
1335
- query.query_id,
1343
+ if (
1344
+ not lineage_info.downstream.column
1345
+ or not lineage_info.downstream.column.strip()
1346
+ ):
1347
+ logger.debug(
1348
+ f"Skipping lineage entry with empty downstream column in query {query.query_id}"
1336
1349
  )
1350
+ continue
1351
+
1352
+ for upstream_ref in lineage_info.upstreams:
1353
+ if upstream_ref.column and upstream_ref.column.strip():
1354
+ cll[lineage_info.downstream.column].setdefault(
1355
+ SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
1356
+ query.query_id,
1357
+ )
1358
+ else:
1359
+ logger.debug(
1360
+ f"Skipping empty column reference in lineage for query {query.query_id}"
1361
+ )
1337
1362
 
1338
1363
  # Finally, we can build our lineage edge.
1339
1364
  required_queries = OrderedSet[QueryId]()
@@ -1491,9 +1516,9 @@ class SqlParsingAggregator(Closeable):
1491
1516
  return
1492
1517
 
1493
1518
  # If a query doesn't involve any allowed tables, skip it.
1494
- if downstream_urn is None and not any(
1495
- self.is_allowed_table(urn) for urn in query.upstreams
1496
- ):
1519
+ if (
1520
+ downstream_urn is None or not self.is_allowed_table(downstream_urn)
1521
+ ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
1497
1522
  self.report.num_queries_skipped_due_to_filters += 1
1498
1523
  return
1499
1524
 
@@ -1574,27 +1599,33 @@ class SqlParsingAggregator(Closeable):
1574
1599
 
1575
1600
  @dataclasses.dataclass
1576
1601
  class QueryLineageInfo:
1577
- upstreams: List[UrnStr] # this is direct upstreams, with *no temp tables*
1578
- column_lineage: List[ColumnLineageInfo]
1602
+ upstreams: OrderedSet[
1603
+ UrnStr
1604
+ ] # this is direct upstreams, with *no temp tables*
1605
+ column_lineage: OrderedSet[ColumnLineageInfo]
1579
1606
  confidence_score: float
1580
1607
 
1581
1608
  def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
1582
- self.upstreams += other_query.upstreams
1583
- self.column_lineage += other_query.column_lineage
1609
+ self.upstreams.update(other_query.upstreams)
1610
+ self.column_lineage.update(other_query.column_lineage)
1584
1611
  self.confidence_score = min(
1585
1612
  self.confidence_score, other_query.confidence_score
1586
1613
  )
1587
1614
 
1615
+ cache: Dict[str, QueryLineageInfo] = {}
1616
+
1588
1617
  def _recurse_into_query(
1589
1618
  query: QueryMetadata, recursion_path: List[QueryId]
1590
1619
  ) -> QueryLineageInfo:
1591
1620
  if query.query_id in recursion_path:
1592
1621
  # This is a cycle, so we just return the query as-is.
1593
1622
  return QueryLineageInfo(
1594
- upstreams=query.upstreams,
1595
- column_lineage=query.column_lineage,
1623
+ upstreams=OrderedSet(query.upstreams),
1624
+ column_lineage=OrderedSet(query.column_lineage),
1596
1625
  confidence_score=query.confidence_score,
1597
1626
  )
1627
+ if query.query_id in cache:
1628
+ return cache[query.query_id]
1598
1629
  recursion_path = [*recursion_path, query.query_id]
1599
1630
  composed_of_queries.add(query.query_id)
1600
1631
 
@@ -1609,7 +1640,7 @@ class SqlParsingAggregator(Closeable):
1609
1640
  upstream_query = self._query_map.get(upstream_query_id)
1610
1641
  if (
1611
1642
  upstream_query
1612
- and upstream_query.query_id not in composed_of_queries
1643
+ and upstream_query.query_id not in recursion_path
1613
1644
  ):
1614
1645
  temp_query_lineage_info = _recurse_into_query(
1615
1646
  upstream_query, recursion_path
@@ -1669,11 +1700,14 @@ class SqlParsingAggregator(Closeable):
1669
1700
  ]
1670
1701
  )
1671
1702
 
1672
- return QueryLineageInfo(
1673
- upstreams=list(new_upstreams),
1674
- column_lineage=new_cll,
1703
+ ret = QueryLineageInfo(
1704
+ upstreams=new_upstreams,
1705
+ column_lineage=OrderedSet(new_cll),
1675
1706
  confidence_score=new_confidence_score,
1676
1707
  )
1708
+ cache[query.query_id] = ret
1709
+
1710
+ return ret
1677
1711
 
1678
1712
  resolved_lineage_info = _recurse_into_query(base_query, [])
1679
1713
 
@@ -1706,20 +1740,30 @@ class SqlParsingAggregator(Closeable):
1706
1740
  )
1707
1741
 
1708
1742
  merged_query_text = ";\n\n".join(
1709
- [q.formatted_query_string for q in ordered_queries]
1743
+ deduplicate_list([q.formatted_query_string for q in ordered_queries])
1710
1744
  )
1711
1745
 
1712
1746
  resolved_query = dataclasses.replace(
1713
1747
  base_query,
1714
1748
  query_id=composite_query_id,
1715
1749
  formatted_query_string=merged_query_text,
1716
- upstreams=resolved_lineage_info.upstreams,
1717
- column_lineage=resolved_lineage_info.column_lineage,
1750
+ upstreams=list(resolved_lineage_info.upstreams),
1751
+ column_lineage=list(resolved_lineage_info.column_lineage),
1718
1752
  confidence_score=resolved_lineage_info.confidence_score,
1719
1753
  )
1720
1754
 
1721
1755
  return resolved_query
1722
1756
 
1757
+ @staticmethod
1758
+ def _compute_upstream_fields(
1759
+ result: SqlParsingResult,
1760
+ ) -> Dict[UrnStr, Set[UrnStr]]:
1761
+ upstream_fields: Dict[UrnStr, Set[UrnStr]] = defaultdict(set)
1762
+ for cl in result.column_lineage or []:
1763
+ for upstream in cl.upstreams:
1764
+ upstream_fields[upstream.table].add(upstream.column)
1765
+ return upstream_fields
1766
+
1723
1767
  def _gen_usage_statistics_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
1724
1768
  if not self._usage_aggregator:
1725
1769
  return