acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,18 @@ import functools
  import logging
  import traceback
  from collections import defaultdict
- from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
+ from typing import (
+     AbstractSet,
+     Any,
+     Dict,
+     Iterable,
+     List,
+     Optional,
+     Set,
+     Tuple,
+     TypeVar,
+     Union,
+ )

  import pydantic.dataclasses
  import sqlglot
@@ -45,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
      QueryTypeProps,
  )
  from datahub.sql_parsing.sqlglot_utils import (
+     DialectOrStr,
      get_dialect,
      get_query_fingerprint_debug,
      is_dialect_instance,
@@ -54,6 +66,7 @@ from datahub.utilities.cooperative_timeout import (
      CooperativeTimeoutError,
      cooperative_timeout,
  )
+ from datahub.utilities.ordered_set import OrderedSet

  assert SQLGLOT_PATCHED

@@ -112,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):


  class DownstreamColumnRef(_ParserBaseModel):
+     """
+     TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+     What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+     auto-generated class from .pdl model files. We need generic solution allowing us to either:
+     1. Implement hashing for .pdl model objects
+     2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+        hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+        to understand that instruction as well.
+     Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+     """
+
      table: Optional[Urn] = None
      column: str
      column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -127,20 +151,53 @@ class DownstreamColumnRef(_ParserBaseModel):
              return v
          return SchemaFieldDataTypeClass.from_obj(v)

+     def __hash__(self) -> int:
+         return hash((self.table, self.column, self.native_column_type))
+
+
+ class ColumnTransformation(_FrozenModel):
+     is_direct_copy: bool
+     column_logic: str
+

  class _ColumnLineageInfo(_ParserBaseModel):
      downstream: _DownstreamColumnRef
      upstreams: List[_ColumnRef]

-     logic: Optional[str] = None
+     logic: Optional[ColumnTransformation] = None


  class ColumnLineageInfo(_ParserBaseModel):
+     """
+     TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+     To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+     depending on it.
+     Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+     """
+
      downstream: DownstreamColumnRef
      upstreams: List[ColumnRef]

-     # Logic for this column, as a SQL expression.
-     logic: Optional[str] = pydantic.Field(default=None, exclude=True)
+     logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
+
+     def __hash__(self) -> int:
+         return hash((self.downstream, tuple(self.upstreams), self.logic))
+
+
+ class _JoinInfo(_ParserBaseModel):
+     join_type: str
+     left_tables: List[_TableName]
+     right_tables: List[_TableName]
+     on_clause: Optional[str]
+     columns_involved: List[_ColumnRef]
+
+
+ class JoinInfo(_ParserBaseModel):
+     join_type: str
+     left_tables: List[Urn]
+     right_tables: List[Urn]
+     on_clause: Optional[str]
+     columns_involved: List[ColumnRef]


  class SqlParsingDebugInfo(_ParserBaseModel):
@@ -178,6 +235,7 @@ class SqlParsingResult(_ParserBaseModel):
      out_tables: List[Urn]

      column_lineage: Optional[List[ColumnLineageInfo]] = None
+     joins: Optional[List[JoinInfo]] = None

      # TODO include formatted original sql logic
      # TODO include list of referenced columns
@@ -197,13 +255,19 @@ class SqlParsingResult(_ParserBaseModel):
          )


+ def _extract_table_names(
+     iterable: Iterable[sqlglot.exp.Table],
+ ) -> OrderedSet[_TableName]:
+     return OrderedSet(_TableName.from_sqlglot_table(table) for table in iterable)
+
+
  def _table_level_lineage(
      statement: sqlglot.Expression, dialect: sqlglot.Dialect
- ) -> Tuple[Set[_TableName], Set[_TableName]]:
+ ) -> Tuple[AbstractSet[_TableName], AbstractSet[_TableName]]:
      # Generate table-level lineage.
      modified = (
-         {
-             _TableName.from_sqlglot_table(expr.this)
+         _extract_table_names(
+             expr.this
              for expr in statement.find_all(
                  sqlglot.exp.Create,
                  sqlglot.exp.Insert,
@@ -215,36 +279,36 @@ def _table_level_lineage(
              # In some cases like "MERGE ... then INSERT (col1, col2) VALUES (col1, col2)",
              # the `this` on the INSERT part isn't a table.
              if isinstance(expr.this, sqlglot.exp.Table)
-         }
-         | {
+         )
+         | _extract_table_names(
              # For statements that include a column list, like
              # CREATE DDL statements and `INSERT INTO table (col1, col2) SELECT ...`
              # the table name is nested inside a Schema object.
-             _TableName.from_sqlglot_table(expr.this.this)
+             expr.this.this
              for expr in statement.find_all(
                  sqlglot.exp.Create,
                  sqlglot.exp.Insert,
              )
              if isinstance(expr.this, sqlglot.exp.Schema)
              and isinstance(expr.this.this, sqlglot.exp.Table)
-         }
-         | {
+         )
+         | _extract_table_names(
              # For drop statements, we only want it if a table/view is being dropped.
              # Other "kinds" will not have table.name populated.
-             _TableName.from_sqlglot_table(expr.this)
+             expr.this
              for expr in ([statement] if isinstance(statement, sqlglot.exp.Drop) else [])
              if isinstance(expr.this, sqlglot.exp.Table)
              and expr.this.this
              and expr.this.name
-         }
+         )
      )

      tables = (
-         {
-             _TableName.from_sqlglot_table(table)
+         _extract_table_names(
+             table
              for table in statement.find_all(sqlglot.exp.Table)
              if not isinstance(table.parent, sqlglot.exp.Drop)
-         }
+         )
          # ignore references created in this query
          - modified
          # ignore CTEs created in this statement
@@ -479,16 +543,19 @@ def _select_statement_cll(
      root_scope: sqlglot.optimizer.Scope,
      column_resolver: _ColumnResolver,
      output_table: Optional[_TableName],
+     table_name_schema_mapping: Dict[_TableName, SchemaInfo],
+     default_db: Optional[str] = None,
+     default_schema: Optional[str] = None,
  ) -> List[_ColumnLineageInfo]:
      column_lineage: List[_ColumnLineageInfo] = []

      try:
-         # List output columns.
          output_columns = [
              (select_col.alias_or_name, select_col) for select_col in statement.selects
          ]
          logger.debug("output columns: %s", [col[0] for col in output_columns])
-         for output_col, original_col_expression in output_columns:
+
+         for output_col, _original_col_expression in output_columns:
              if not output_col or output_col == "*":
                  # If schema information is available, the * will be expanded to the actual columns.
                  # Otherwise, we can't process it.
@@ -512,15 +579,14 @@ def _select_statement_cll(
                  trim_selects=False,
                  # We don't need to pass the schema in here, since we've already qualified the columns.
              )
-             # import pathlib
-             # pathlib.Path("sqlglot.html").write_text(
-             #     str(lineage_node.to_html(dialect=dialect))
-             # )

              # Generate SELECT lineage.
-             direct_raw_col_upstreams = _get_direct_raw_col_upstreams(lineage_node)
-
-             # column_logic = lineage_node.source
+             direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
+                 lineage_node,
+                 dialect,
+                 default_db,
+                 default_schema,
+             )

              # Fuzzy resolve the output column.
              original_col_expression = lineage_node.expression
@@ -539,7 +605,7 @@ def _select_statement_cll(
              if original_col_expression.type:
                  output_col_type = original_col_expression.type

-             # Fuzzy resolve upstream columns.
+             # Resolve upstream columns - table names should already be qualified from placeholder processing
              direct_resolved_col_upstreams = {
                  _ColumnRef(
                      table=edge.table,
@@ -560,7 +626,7 @@ def _select_statement_cll(
                          column_type=output_col_type,
                      ),
                      upstreams=sorted(direct_resolved_col_upstreams),
-                     # logic=column_logic.sql(pretty=True, dialect=dialect),
+                     logic=_get_column_transformation(lineage_node, dialect),
                  )
              )

@@ -575,6 +641,7 @@ def _select_statement_cll(

  class _ColumnLineageWithDebugInfo(_ParserBaseModel):
      column_lineage: List[_ColumnLineageInfo]
+     joins: Optional[List[_JoinInfo]] = None

      select_statement: Optional[sqlglot.exp.Expression] = None
      # TODO: Add column exceptions here.
@@ -624,6 +691,13 @@ def _column_level_lineage(
              select_statement=select_statement,
          )

+     # Handle VALUES expressions separately - they have no upstream tables and no column lineage
+     if isinstance(select_statement, sqlglot.exp.Values):
+         return _ColumnLineageWithDebugInfo(
+             column_lineage=[],
+             select_statement=select_statement,
+         )
+
      assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
      try:
          root_scope = sqlglot.optimizer.build_scope(select_statement)
@@ -643,19 +717,35 @@ def _column_level_lineage(
          root_scope=root_scope,
          column_resolver=column_resolver,
          output_table=downstream_table,
+         table_name_schema_mapping=table_name_schema_mapping,
+         default_db=default_db,
+         default_schema=default_schema,
      )

+     joins: Optional[List[_JoinInfo]] = None
+     try:
+         # List join clauses.
+         joins = _list_joins(dialect=dialect, root_scope=root_scope)
+         logger.debug("Joins: %s", joins)
+     except Exception as e:
+         # This is a non-fatal error, so we can continue.
+         logger.debug("Failed to list joins: %s", e)
+
      return _ColumnLineageWithDebugInfo(
          column_lineage=column_lineage,
+         joins=joins,
          select_statement=select_statement,
      )


  def _get_direct_raw_col_upstreams(
      lineage_node: sqlglot.lineage.Node,
- ) -> Set[_ColumnRef]:
-     # Using a set here to deduplicate upstreams.
-     direct_raw_col_upstreams: Set[_ColumnRef] = set()
+     dialect: Optional[sqlglot.Dialect] = None,
+     default_db: Optional[str] = None,
+     default_schema: Optional[str] = None,
+ ) -> OrderedSet[_ColumnRef]:
+     # Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
+     direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()

      for node in lineage_node.walk():
          if node.downstream:
@@ -682,6 +772,53 @@ def _get_direct_raw_col_upstreams(
              direct_raw_col_upstreams.add(
                  _ColumnRef(table=table_ref, column=normalized_col)
              )
+         elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
+             # Handle placeholder expressions from lateral joins.
+             #
+             # In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
+             # expressions instead of regular table references. This is critical for lateral join column lineage.
+             #
+             # Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
+             # The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
+             # which we need to parse to establish the lineage: output.value <- my_table2.value
+             #
+             # Without this handling, lateral join column lineage would be incomplete/missing.
+             try:
+                 parsed = sqlglot.parse_one(node.name, dialect=dialect)
+                 if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
+                     table_ref = _TableName.from_sqlglot_table(
+                         sqlglot.parse_one(
+                             parsed.table, into=sqlglot.exp.Table, dialect=dialect
+                         )
+                     )
+
+                     # SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
+                     # Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
+                     # We need this runtime qualification to ensure proper lineage resolution.
+                     # Only qualify if this appears to be a real table reference (not a temporary construct)
+                     if (
+                         not (table_ref.database or table_ref.db_schema)
+                         and dialect is not None
+                     ):
+                         table_ref = table_ref.qualified(
+                             dialect=dialect,
+                             default_db=default_db,
+                             default_schema=default_schema,
+                         )
+
+                     # Extract column name using proper isinstance check
+                     if isinstance(parsed.this, sqlglot.exp.Identifier):
+                         column_name = parsed.this.name
+                     else:
+                         column_name = str(parsed.this)
+                     direct_raw_col_upstreams.add(
+                         _ColumnRef(table=table_ref, column=column_name)
+                     )
+             except Exception as e:
+                 logger.debug(
+                     f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
+                     exc_info=True,
+                 )
          else:
              # This branch doesn't matter. For example, a count(*) column would go here, and
              # we don't get any column-level lineage for that.
@@ -690,6 +827,268 @@ def _get_direct_raw_col_upstreams(
      return direct_raw_col_upstreams


+ def _is_single_column_expression(
+     expression: sqlglot.exp.Expression,
+ ) -> bool:
+     # Check if the expression is trivial, i.e. it's just a single column.
+     # Things like count(*) or coalesce(col, 0) are not single columns.
+     if isinstance(expression, sqlglot.exp.Alias):
+         expression = expression.this
+
+     return isinstance(expression, sqlglot.exp.Column)
+
+
+ def _get_column_transformation(
+     lineage_node: sqlglot.lineage.Node,
+     dialect: sqlglot.Dialect,
+     parent: Optional[sqlglot.lineage.Node] = None,
+ ) -> ColumnTransformation:
+     # expression = lineage_node.expression
+     # is_single_column_expression = _is_single_column_expression(lineage_node.expression)
+     if not lineage_node.downstream:
+         # parent_expression = parent.expression if parent else expression
+         if parent:
+             expression = parent.expression
+             is_copy = _is_single_column_expression(expression)
+         else:
+             # This case should rarely happen.
+             is_copy = True
+             expression = lineage_node.expression
+         return ColumnTransformation(
+             is_direct_copy=is_copy,
+             column_logic=expression.sql(dialect=dialect),
+         )
+
+     elif len(lineage_node.downstream) > 1 or not _is_single_column_expression(
+         lineage_node.expression
+     ):
+         return ColumnTransformation(
+             is_direct_copy=False,
+             column_logic=lineage_node.expression.sql(dialect=dialect),
+         )
+
+     else:
+         return _get_column_transformation(
+             lineage_node=lineage_node.downstream[0],
+             dialect=dialect,
+             parent=lineage_node,
+         )
+
+
+ def _get_join_side_tables(
+     target: sqlglot.exp.Expression,
+     dialect: sqlglot.Dialect,
+     scope: sqlglot.optimizer.Scope,
+ ) -> OrderedSet[_TableName]:
+     target_alias_or_name = target.alias_or_name
+     if (source := scope.sources.get(target_alias_or_name)) and isinstance(
+         source, sqlglot.exp.Table
+     ):
+         # If the source is a Scope, we need to do some resolution work.
+         return OrderedSet([_TableName.from_sqlglot_table(source)])
+
+     column = sqlglot.exp.Column(
+         this=sqlglot.exp.Star(),
+         table=sqlglot.exp.Identifier(this=target.alias_or_name),
+     )
+     columns_used = _get_raw_col_upstreams_for_expression(
+         select=column,
+         dialect=dialect,
+         scope=scope,
+     )
+     return OrderedSet(col.table for col in columns_used)
+
+
+ def _get_raw_col_upstreams_for_expression(
+     select: sqlglot.exp.Expression,
+     dialect: sqlglot.Dialect,
+     scope: sqlglot.optimizer.Scope,
+ ) -> OrderedSet[_ColumnRef]:
+     if not isinstance(scope.expression, sqlglot.exp.Query):
+         # Note that Select, Subquery, SetOperation, etc. are all subclasses of Query.
+         # So this line should basically never happen.
+         return OrderedSet()
+
+     original_expression = scope.expression
+     updated_expression = scope.expression.select(select, append=False, copy=True)
+
+     try:
+         scope.expression = updated_expression
+         node = sqlglot.lineage.to_node(
+             column=0,
+             scope=scope,
+             dialect=dialect,
+             trim_selects=False,
+         )
+
+         return _get_direct_raw_col_upstreams(node, dialect, None, None)
+     finally:
+         scope.expression = original_expression
+
+
+ def _list_joins(
+     dialect: sqlglot.Dialect,
+     root_scope: sqlglot.optimizer.Scope,
+ ) -> List[_JoinInfo]:
+     # TODO: Add a confidence tracker here.
+
+     joins: List[_JoinInfo] = []
+
+     scope: sqlglot.optimizer.Scope
+     for scope in root_scope.traverse():
+         # PART 1: Handle regular explicit JOINs (updated API)
+         join: sqlglot.exp.Join
+         for join in scope.expression.find_all(sqlglot.exp.Join):
+             left_side_tables: OrderedSet[_TableName] = OrderedSet()
+             from_clause: sqlglot.exp.From
+             for from_clause in scope.find_all(sqlglot.exp.From):
+                 left_side_tables.update(
+                     _get_join_side_tables(
+                         target=from_clause.this,
+                         dialect=dialect,
+                         scope=scope,
+                     )
+                 )
+
+             right_side_tables: OrderedSet[_TableName] = OrderedSet()
+             if join_target := join.this:
+                 right_side_tables = _get_join_side_tables(
+                     target=join_target,
+                     dialect=dialect,
+                     scope=scope,
+                 )
+
+             # We don't need to check for `using` here because it's normalized to `on`
+             # by the sqlglot optimizer.
+             on_clause: Optional[sqlglot.exp.Expression] = join.args.get("on")
+             if on_clause:
+                 joined_columns = _get_raw_col_upstreams_for_expression(
+                     select=on_clause, dialect=dialect, scope=scope
+                 )
+
+                 unique_tables = OrderedSet(col.table for col in joined_columns)
+                 if not unique_tables:
+                     logger.debug(
+                         "Skipping join because we couldn't resolve the tables from the join condition: %s",
+                         join.sql(dialect=dialect),
+                     )
+                     continue
+
+                 # When we have an `on` clause, we only want to include tables whose columns are
+                 # involved in the join condition. Without this, a statement like this:
+                 # WITH cte_alias AS (select t1.id, t1.user_id, t2.other_col from t1 join t2 on t1.id = t2.id)
+                 # SELECT * FROM users
+                 # JOIN cte_alias ON users.id = cte_alias.user_id
+                 # would incorrectly include t2 as part of the left side tables.
+                 left_side_tables = OrderedSet(left_side_tables & unique_tables)
+                 right_side_tables = OrderedSet(right_side_tables & unique_tables)
+             else:
+                 # Some joins (cross join, lateral join, etc.) don't have an ON clause.
+                 # In those cases, we have some best-effort logic at least extract the
+                 # tables involved.
+                 joined_columns = OrderedSet()
+
+             if not left_side_tables and not right_side_tables:
+                 logger.debug(
+                     "Skipping join because we couldn't resolve any tables from the join operands: %s",
+                     join.sql(dialect=dialect),
+                 )
+                 continue
+             elif len(left_side_tables | right_side_tables) == 1:
+                 # When we don't have an ON clause, we're more strict about the
+                 # minimum number of tables we need to resolve to avoid false positives.
+                 # On the off chance someone is doing a self-cross-join, we'll miss it.
+                 logger.debug(
+                     "Skipping join because we couldn't resolve enough tables from the join operands: %s",
+                     join.sql(dialect=dialect),
+                 )
+                 continue
+
+             joins.append(
+                 _JoinInfo(
+                     join_type=_get_join_type(join),
+                     left_tables=list(left_side_tables),
+                     right_tables=list(right_side_tables),
+                     on_clause=on_clause.sql(dialect=dialect) if on_clause else None,
+                     columns_involved=list(sorted(joined_columns)),
+                 )
+             )
+
+         # Handle LATERAL constructs
+         for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
+             # Get tables from non-lateral FROM clauses
+             qualified_left: OrderedSet[_TableName] = OrderedSet()
+             for from_clause in scope.find_all(sqlglot.exp.From):
+                 if not isinstance(from_clause.this, sqlglot.exp.Lateral):
+                     qualified_left.update(
+                         _get_join_side_tables(from_clause.this, dialect, scope)
+                     )
+
+             # Get tables from lateral subquery
+             qualified_right: OrderedSet[_TableName] = OrderedSet()
+             if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
+                 qualified_right.update(
+                     _TableName.from_sqlglot_table(t)
+                     for t in lateral.this.find_all(sqlglot.exp.Table)
+                 )
+             qualified_right.update(qualified_left)
+
+             if qualified_left and qualified_right:
+                 joins.append(
+                     _JoinInfo(
+                         join_type="LATERAL JOIN",
+                         left_tables=list(qualified_left),
+                         right_tables=list(qualified_right),
+                         on_clause=None,
+                         columns_involved=[],
+                     )
+                 )
+
+     return joins
+
+
+ def _get_join_type(join: sqlglot.exp.Join) -> str:
+     """Returns the type of join as a string.
+
+     Args:
+         join: A sqlglot Join expression.
+
+     Returns:
+         Stringified join type e.g. "LEFT JOIN", "RIGHT OUTER JOIN", "LATERAL JOIN", etc.
+     """
+     # This logic was derived from the sqlglot join_sql method.
+     # https://github.com/tobymao/sqlglot/blob/07bf71bae5d2a5c381104a86bb52c06809c21174/sqlglot/generator.py#L2248
+
+     # Special case for lateral joins
+     if isinstance(join.this, sqlglot.exp.Lateral):
+         if join.this.args.get("cross_apply") is not None:
+             return "CROSS APPLY"
+         return "LATERAL JOIN"
+
+     # Special case for STRAIGHT_JOIN (MySQL)
+     if join.args.get("kind") == "STRAIGHT":
+         return "STRAIGHT_JOIN"
+
+     # <method> <global> <side> <kind> JOIN
+     # - method = "HASH", "MERGE"
+     # - global = "GLOBAL"
+     # - side = "LEFT", "RIGHT"
+     # - kind = "INNER", "OUTER", "SEMI", "ANTI"
+     components = []
+     if method := join.args.get("method"):
+         components.append(method)
+     if join.args.get("global"):
+         components.append("GLOBAL")
+     if side := join.args.get("side"):
+         # For SEMI/ANTI joins, side is optional
+         components.append(side)
+     if kind := join.args.get("kind"):
+         components.append(kind)
+
+     components.append("JOIN")
+     return " ".join(components)
+
+
  def _extract_select_from_create(
      statement: sqlglot.exp.Create,
  ) -> sqlglot.exp.Expression:
@@ -784,7 +1183,12 @@ def _try_extract_select(
          statement = sqlglot.exp.Select().select("*").from_(statement)
      elif isinstance(statement, sqlglot.exp.Insert):
          # TODO Need to map column renames in the expressions part of the statement.
-         statement = statement.expression
+         # Preserve CTEs when extracting the SELECT expression from INSERT
+         original_ctes = statement.ctes
+         statement = statement.expression  # Get the SELECT expression from the INSERT
+         if isinstance(statement, sqlglot.exp.Query) and original_ctes:
+             for cte in original_ctes:
+                 statement = statement.with_(alias=cte.alias, as_=cte.this)
      elif isinstance(statement, sqlglot.exp.Update):
          # Assumption: the output table is already captured in the modified tables list.
          statement = _extract_select_from_update(statement)
@@ -875,6 +1279,40 @@ def _translate_internal_column_lineage(
      )


+ def _translate_internal_joins(
+     table_name_urn_mapping: Dict[_TableName, str],
+     raw_joins: List[_JoinInfo],
+     dialect: sqlglot.Dialect,
+ ) -> List[JoinInfo]:
+     joins = []
+     for raw_join in raw_joins:
+         try:
+             joins.append(
+                 JoinInfo(
+                     join_type=raw_join.join_type,
+                     left_tables=[
+                         table_name_urn_mapping[table] for table in raw_join.left_tables
+                     ],
+                     right_tables=[
+                         table_name_urn_mapping[table] for table in raw_join.right_tables
+                     ],
+                     on_clause=raw_join.on_clause,
+                     columns_involved=[
+                         ColumnRef(
+                             table=table_name_urn_mapping[col.table],
+                             column=col.column,
+                         )
+                         for col in raw_join.columns_involved
+                     ],
+                 )
+             )
+         except KeyError as e:
+             # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
+             logger.debug(f"Skipping join with unresolvable table: {e}")
+             continue
+     return joins
+
+
  _StrOrNone = TypeVar("_StrOrNone", str, Optional[str])


@@ -923,12 +1361,12 @@ def _sqlglot_lineage_inner(
      schema_resolver: SchemaResolverInterface,
      default_db: Optional[str] = None,
      default_schema: Optional[str] = None,
-     default_dialect: Optional[str] = None,
+     override_dialect: Optional[DialectOrStr] = None,
  ) -> SqlParsingResult:
-     if not default_dialect:
-         dialect = get_dialect(schema_resolver.platform)
+     if override_dialect:
+         dialect = get_dialect(override_dialect)
      else:
-         dialect = get_dialect(default_dialect)
+         dialect = get_dialect(schema_resolver.platform)

      default_db = _normalize_db_or_schema(default_db, dialect)
      default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1034,6 +1472,7 @@ def _sqlglot_lineage_inner(
      )

      column_lineage: Optional[List[_ColumnLineageInfo]] = None
+     joins = None
      try:
          with cooperative_timeout(
              timeout=(
@@ -1049,6 +1488,7 @@ def _sqlglot_lineage_inner(
                  default_schema=default_schema,
              )
              column_lineage = column_lineage_debug_info.column_lineage
+             joins = column_lineage_debug_info.joins
      except CooperativeTimeoutError as e:
          logger.debug(f"Timed out while generating column-level lineage: {e}")
          debug_info.column_error = e
@@ -1081,6 +1521,14 @@ def _sqlglot_lineage_inner(
                  f"Failed to translate column lineage to urns: {e}", exc_info=True
              )
              debug_info.column_error = e
+     joins_urns = None
+     if joins is not None:
+         try:
+             joins_urns = _translate_internal_joins(
+                 table_name_urn_mapping, raw_joins=joins, dialect=dialect
+             )
+         except KeyError as e:
+             logger.debug(f"Failed to translate joins to urns: {e}", exc_info=True)

@@ -1095,6 +1543,7 @@ def _sqlglot_lineage_inner(
          in_tables=in_urns,
          out_tables=out_urns,
          column_lineage=column_lineage_urns,
+         joins=joins_urns,
          debug_info=debug_info,
      )

@@ -1104,7 +1553,7 @@ def _sqlglot_lineage_nocache(
      schema_resolver: SchemaResolverInterface,
      default_db: Optional[str] = None,
      default_schema: Optional[str] = None,
-     default_dialect: Optional[str] = None,
+     override_dialect: Optional[DialectOrStr] = None,
  ) -> SqlParsingResult:
      """Parse a SQL statement and generate lineage information.

@@ -1122,8 +1571,8 @@ def _sqlglot_lineage_nocache(
      can be brittle with respect to missing schema information and complex
      SQL logic like UNNESTs.

-     The SQL dialect can be given as an argument called default_dialect or it can
-     be inferred from the schema_resolver's platform.
+     The SQL dialect will be inferred from the schema_resolver's platform.
+     That inference can be overridden by passing an override_dialect argument.
      The set of supported dialects is the same as sqlglot's. See their
      `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
      for the full list.
@@ -1138,7 +1587,7 @@ def _sqlglot_lineage_nocache(
          schema_resolver: The schema resolver to use for resolving table schemas.
          default_db: The default database to use for unqualified table names.
          default_schema: The default schema to use for unqualified table names.
-         default_dialect: A default dialect to override the dialect provided by 'schema_resolver'.
+         override_dialect: Override the dialect provided by 'schema_resolver'.

      Returns:
          A SqlParsingResult object containing the parsed lineage information.
@@ -1163,10 +1612,32 @@ def _sqlglot_lineage_nocache(
              schema_resolver=schema_resolver,
              default_db=default_db,
              default_schema=default_schema,
-             default_dialect=default_dialect,
+             override_dialect=override_dialect,
          )
      except Exception as e:
          return SqlParsingResult.make_from_error(e)
+     except BaseException as e:
+         # Check if this is a PanicException from SQLGlot's Rust tokenizer
+         # We use runtime type checking instead of isinstance() because pyo3_runtime
+         # is only available when sqlglot[rs] is installed and may not be importable
+         # at module load time, but the exception can still be raised at runtime
+         if (
+             e.__class__.__name__ == "PanicException"
+             and e.__class__.__module__ == "pyo3_runtime"
+         ):
+             # Handle pyo3_runtime.PanicException from SQLGlot's Rust tokenizer.
+             # pyo3_runtime.PanicException inherits from BaseException (like SystemExit or
+             # KeyboardInterrupt) rather than Exception, so it bypasses normal exception handling.
+             # Avoid catching BaseException, as it includes KeyboardInterrupt
+             # and would prevent Ctrl+C from working.
+             wrapped_exception = Exception(
+                 f"pyo3_runtime.PanicException during SQL parsing: {e}"
+             )
+             wrapped_exception.__cause__ = e
+             return SqlParsingResult.make_from_error(wrapped_exception)
+         else:
+             # Re-raise other BaseException types (SystemExit, KeyboardInterrupt, etc.)
+             raise


  _sqlglot_lineage_cached = functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)(
@@ -1179,15 +1650,15 @@ def sqlglot_lineage(
      schema_resolver: SchemaResolverInterface,
      default_db: Optional[str] = None,
      default_schema: Optional[str] = None,
-     default_dialect: Optional[str] = None,
+     override_dialect: Optional[DialectOrStr] = None,
  ) -> SqlParsingResult:
      if schema_resolver.includes_temp_tables():
          return _sqlglot_lineage_nocache(
-             sql, schema_resolver, default_db, default_schema, default_dialect
+             sql, schema_resolver, default_db, default_schema, override_dialect
          )
      else:
          return _sqlglot_lineage_cached(
-             sql, schema_resolver, default_db, default_schema, default_dialect
+             sql, schema_resolver, default_db, default_schema, override_dialect
          )


@@ -1239,6 +1710,7 @@ def create_lineage_sql_parsed_result(
      default_schema: Optional[str] = None,
      graph: Optional[DataHubGraph] = None,
      schema_aware: bool = True,
+     override_dialect: Optional[DialectOrStr] = None,
  ) -> SqlParsingResult:
      schema_resolver = create_schema_resolver(
          platform=platform,
@@ -1258,6 +1730,7 @@ def create_lineage_sql_parsed_result(
              schema_resolver=schema_resolver,
              default_db=default_db,
              default_schema=default_schema,
+             override_dialect=override_dialect,
          )
      except Exception as e:
          return SqlParsingResult.make_from_error(e)
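
Usage note: the hunks above make two caller-visible changes to the public helpers in datahub/sql_parsing/sqlglot_lineage.py. The default_dialect argument of sqlglot_lineage and create_lineage_sql_parsed_result is replaced by override_dialect (typed as DialectOrStr), and SqlParsingResult gains an optional joins field of JoinInfo objects, while ColumnLineageInfo.logic now carries a ColumnTransformation instead of a raw SQL string. A minimal sketch of calling the changed API follows; only override_dialect, default_schema, graph, schema_aware, and the result fields are confirmed by this diff, so the remaining keyword names (query, default_db, platform, platform_instance, env) are assumptions about the unchanged part of the signature.

# Sketch only: keyword names not shown in the hunks above are assumptions.
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

result = create_lineage_sql_parsed_result(
    query="""
        INSERT INTO analytics.orders_enriched
        SELECT o.id, c.name
        FROM orders o JOIN customers c ON o.customer_id = c.id
    """,                           # assumed parameter name
    default_db="analytics",        # assumed parameter name
    default_schema="public",
    platform="snowflake",          # assumed parameter name
    platform_instance=None,        # assumed parameter name
    env="PROD",                    # assumed parameter name
    graph=None,
    schema_aware=False,
    override_dialect="snowflake",  # new in this release; replaces default_dialect
)

print(result.in_tables, result.out_tables)

# joins is new in this release; it is populated only when column-level lineage ran.
for join in result.joins or []:
    print(join.join_type, join.left_tables, join.right_tables, join.on_clause)

# ColumnLineageInfo.logic is now a ColumnTransformation rather than a SQL string.
for cl in result.column_lineage or []:
    if cl.logic is not None:
        print(cl.downstream.column, cl.logic.is_direct_copy, cl.logic.column_logic)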