acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
56
56
  QueryTypeProps,
57
57
  )
58
58
  from datahub.sql_parsing.sqlglot_utils import (
59
+ DialectOrStr,
59
60
  get_dialect,
60
61
  get_query_fingerprint_debug,
61
62
  is_dialect_instance,
@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
124
125
 
125
126
 
126
127
  class DownstreamColumnRef(_ParserBaseModel):
128
+ """
129
+ TODO: Instead of implementing a custom __hash__ function this class should simply inherit from _FrozenModel.
130
+ What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
131
+ auto-generated class from .pdl model files. We need a generic solution allowing us to either:
132
+ 1. Implement hashing for .pdl model objects
133
+ 2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
134
+ hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
135
+ to understand that instruction as well.
136
+ Instances of this class need to be hashable as we store them in a set when processing lineage from queries.
137
+ """
138
+
127
139
  table: Optional[Urn] = None
128
140
  column: str
129
141
  column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
139
151
  return v
140
152
  return SchemaFieldDataTypeClass.from_obj(v)
141
153
 
154
+ def __hash__(self) -> int:
155
+ return hash((self.table, self.column, self.native_column_type))
142
156
 
143
- class ColumnTransformation(_ParserBaseModel):
157
+
158
+ class ColumnTransformation(_FrozenModel):
144
159
  is_direct_copy: bool
145
160
  column_logic: str
146
161
 
@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):
153
168
 
154
169
 
155
170
  class ColumnLineageInfo(_ParserBaseModel):
171
+ """
172
+ TODO: Instead of implementing a custom __hash__ function this class should simply inherit from _FrozenModel.
173
+ To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
174
+ depending on it.
175
+ Instances of this class need to be hashable as we store them in a set when processing lineage from queries.
176
+ """
177
+
156
178
  downstream: DownstreamColumnRef
157
179
  upstreams: List[ColumnRef]
158
180
 
159
181
  logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
160
182
 
183
+ def __hash__(self) -> int:
184
+ return hash((self.downstream, tuple(self.upstreams), self.logic))
185
+
161
186
 
162
187
  class _JoinInfo(_ParserBaseModel):
163
188
  join_type: str
@@ -518,16 +543,19 @@ def _select_statement_cll(
518
543
  root_scope: sqlglot.optimizer.Scope,
519
544
  column_resolver: _ColumnResolver,
520
545
  output_table: Optional[_TableName],
546
+ table_name_schema_mapping: Dict[_TableName, SchemaInfo],
547
+ default_db: Optional[str] = None,
548
+ default_schema: Optional[str] = None,
521
549
  ) -> List[_ColumnLineageInfo]:
522
550
  column_lineage: List[_ColumnLineageInfo] = []
523
551
 
524
552
  try:
525
- # List output columns.
526
553
  output_columns = [
527
554
  (select_col.alias_or_name, select_col) for select_col in statement.selects
528
555
  ]
529
556
  logger.debug("output columns: %s", [col[0] for col in output_columns])
530
- for output_col, original_col_expression in output_columns:
557
+
558
+ for output_col, _original_col_expression in output_columns:
531
559
  if not output_col or output_col == "*":
532
560
  # If schema information is available, the * will be expanded to the actual columns.
533
561
  # Otherwise, we can't process it.
@@ -551,13 +579,14 @@ def _select_statement_cll(
551
579
  trim_selects=False,
552
580
  # We don't need to pass the schema in here, since we've already qualified the columns.
553
581
  )
554
- # import pathlib
555
- # pathlib.Path("sqlglot.html").write_text(
556
- # str(lineage_node.to_html(dialect=dialect))
557
- # )
558
582
 
559
583
  # Generate SELECT lineage.
560
- direct_raw_col_upstreams = _get_direct_raw_col_upstreams(lineage_node)
584
+ direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
585
+ lineage_node,
586
+ dialect,
587
+ default_db,
588
+ default_schema,
589
+ )
561
590
 
562
591
  # Fuzzy resolve the output column.
563
592
  original_col_expression = lineage_node.expression
@@ -576,7 +605,7 @@ def _select_statement_cll(
576
605
  if original_col_expression.type:
577
606
  output_col_type = original_col_expression.type
578
607
 
579
- # Fuzzy resolve upstream columns.
608
+ # Resolve upstream columns - table names should already be qualified from placeholder processing
580
609
  direct_resolved_col_upstreams = {
581
610
  _ColumnRef(
582
611
  table=edge.table,
@@ -662,6 +691,13 @@ def _column_level_lineage(
662
691
  select_statement=select_statement,
663
692
  )
664
693
 
694
+ # Handle VALUES expressions separately - they have no upstream tables and no column lineage
695
+ if isinstance(select_statement, sqlglot.exp.Values):
696
+ return _ColumnLineageWithDebugInfo(
697
+ column_lineage=[],
698
+ select_statement=select_statement,
699
+ )
700
+
665
701
  assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
666
702
  try:
667
703
  root_scope = sqlglot.optimizer.build_scope(select_statement)
@@ -681,6 +717,9 @@ def _column_level_lineage(
681
717
  root_scope=root_scope,
682
718
  column_resolver=column_resolver,
683
719
  output_table=downstream_table,
720
+ table_name_schema_mapping=table_name_schema_mapping,
721
+ default_db=default_db,
722
+ default_schema=default_schema,
684
723
  )
685
724
 
686
725
  joins: Optional[List[_JoinInfo]] = None
@@ -701,6 +740,9 @@ def _column_level_lineage(
701
740
 
702
741
  def _get_direct_raw_col_upstreams(
703
742
  lineage_node: sqlglot.lineage.Node,
743
+ dialect: Optional[sqlglot.Dialect] = None,
744
+ default_db: Optional[str] = None,
745
+ default_schema: Optional[str] = None,
704
746
  ) -> OrderedSet[_ColumnRef]:
705
747
  # Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
706
748
  direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
@@ -730,6 +772,53 @@ def _get_direct_raw_col_upstreams(
730
772
  direct_raw_col_upstreams.add(
731
773
  _ColumnRef(table=table_ref, column=normalized_col)
732
774
  )
775
+ elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
776
+ # Handle placeholder expressions from lateral joins.
777
+ #
778
+ # In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
779
+ # expressions instead of regular table references. This is critical for lateral join column lineage.
780
+ #
781
+ # Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
782
+ # The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
783
+ # which we need to parse to establish the lineage: output.value <- my_table2.value
784
+ #
785
+ # Without this handling, lateral join column lineage would be incomplete/missing.
786
+ try:
787
+ parsed = sqlglot.parse_one(node.name, dialect=dialect)
788
+ if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
789
+ table_ref = _TableName.from_sqlglot_table(
790
+ sqlglot.parse_one(
791
+ parsed.table, into=sqlglot.exp.Table, dialect=dialect
792
+ )
793
+ )
794
+
795
+ # SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
796
+ # Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
797
+ # We need this runtime qualification to ensure proper lineage resolution.
798
+ # Only qualify if this appears to be a real table reference (not a temporary construct)
799
+ if (
800
+ not (table_ref.database or table_ref.db_schema)
801
+ and dialect is not None
802
+ ):
803
+ table_ref = table_ref.qualified(
804
+ dialect=dialect,
805
+ default_db=default_db,
806
+ default_schema=default_schema,
807
+ )
808
+
809
+ # Extract column name using proper isinstance check
810
+ if isinstance(parsed.this, sqlglot.exp.Identifier):
811
+ column_name = parsed.this.name
812
+ else:
813
+ column_name = str(parsed.this)
814
+ direct_raw_col_upstreams.add(
815
+ _ColumnRef(table=table_ref, column=column_name)
816
+ )
817
+ except Exception as e:
818
+ logger.debug(
819
+ f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
820
+ exc_info=True,
821
+ )
733
822
  else:
734
823
  # This branch doesn't matter. For example, a count(*) column would go here, and
735
824
  # we don't get any column-level lineage for that.
@@ -832,7 +921,7 @@ def _get_raw_col_upstreams_for_expression(
832
921
  trim_selects=False,
833
922
  )
834
923
 
835
- return _get_direct_raw_col_upstreams(node)
924
+ return _get_direct_raw_col_upstreams(node, dialect, None, None)
836
925
  finally:
837
926
  scope.expression = original_expression
838
927
 
@@ -847,8 +936,9 @@ def _list_joins(
847
936
 
848
937
  scope: sqlglot.optimizer.Scope
849
938
  for scope in root_scope.traverse():
939
+ # PART 1: Handle regular explicit JOINs (updated API)
850
940
  join: sqlglot.exp.Join
851
- for join in scope.find_all(sqlglot.exp.Join):
941
+ for join in scope.expression.find_all(sqlglot.exp.Join):
852
942
  left_side_tables: OrderedSet[_TableName] = OrderedSet()
853
943
  from_clause: sqlglot.exp.From
854
944
  for from_clause in scope.find_all(sqlglot.exp.From):
@@ -924,6 +1014,36 @@ def _list_joins(
924
1014
  )
925
1015
  )
926
1016
 
1017
+ # Handle LATERAL constructs
1018
+ for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
1019
+ # Get tables from non-lateral FROM clauses
1020
+ qualified_left: OrderedSet[_TableName] = OrderedSet()
1021
+ for from_clause in scope.find_all(sqlglot.exp.From):
1022
+ if not isinstance(from_clause.this, sqlglot.exp.Lateral):
1023
+ qualified_left.update(
1024
+ _get_join_side_tables(from_clause.this, dialect, scope)
1025
+ )
1026
+
1027
+ # Get tables from lateral subquery
1028
+ qualified_right: OrderedSet[_TableName] = OrderedSet()
1029
+ if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
1030
+ qualified_right.update(
1031
+ _TableName.from_sqlglot_table(t)
1032
+ for t in lateral.this.find_all(sqlglot.exp.Table)
1033
+ )
1034
+ qualified_right.update(qualified_left)
1035
+
1036
+ if qualified_left and qualified_right:
1037
+ joins.append(
1038
+ _JoinInfo(
1039
+ join_type="LATERAL JOIN",
1040
+ left_tables=list(qualified_left),
1041
+ right_tables=list(qualified_right),
1042
+ on_clause=None,
1043
+ columns_involved=[],
1044
+ )
1045
+ )
1046
+
927
1047
  return joins
928
1048
 
929
1049
 
@@ -1063,7 +1183,12 @@ def _try_extract_select(
1063
1183
  statement = sqlglot.exp.Select().select("*").from_(statement)
1064
1184
  elif isinstance(statement, sqlglot.exp.Insert):
1065
1185
  # TODO Need to map column renames in the expressions part of the statement.
1066
- statement = statement.expression
1186
+ # Preserve CTEs when extracting the SELECT expression from INSERT
1187
+ original_ctes = statement.ctes
1188
+ statement = statement.expression # Get the SELECT expression from the INSERT
1189
+ if isinstance(statement, sqlglot.exp.Query) and original_ctes:
1190
+ for cte in original_ctes:
1191
+ statement = statement.with_(alias=cte.alias, as_=cte.this)
1067
1192
  elif isinstance(statement, sqlglot.exp.Update):
1068
1193
  # Assumption: the output table is already captured in the modified tables list.
1069
1194
  statement = _extract_select_from_update(statement)
@@ -1161,25 +1286,30 @@ def _translate_internal_joins(
1161
1286
  ) -> List[JoinInfo]:
1162
1287
  joins = []
1163
1288
  for raw_join in raw_joins:
1164
- joins.append(
1165
- JoinInfo(
1166
- join_type=raw_join.join_type,
1167
- left_tables=[
1168
- table_name_urn_mapping[table] for table in raw_join.left_tables
1169
- ],
1170
- right_tables=[
1171
- table_name_urn_mapping[table] for table in raw_join.right_tables
1172
- ],
1173
- on_clause=raw_join.on_clause,
1174
- columns_involved=[
1175
- ColumnRef(
1176
- table=table_name_urn_mapping[col.table],
1177
- column=col.column,
1178
- )
1179
- for col in raw_join.columns_involved
1180
- ],
1289
+ try:
1290
+ joins.append(
1291
+ JoinInfo(
1292
+ join_type=raw_join.join_type,
1293
+ left_tables=[
1294
+ table_name_urn_mapping[table] for table in raw_join.left_tables
1295
+ ],
1296
+ right_tables=[
1297
+ table_name_urn_mapping[table] for table in raw_join.right_tables
1298
+ ],
1299
+ on_clause=raw_join.on_clause,
1300
+ columns_involved=[
1301
+ ColumnRef(
1302
+ table=table_name_urn_mapping[col.table],
1303
+ column=col.column,
1304
+ )
1305
+ for col in raw_join.columns_involved
1306
+ ],
1307
+ )
1181
1308
  )
1182
- )
1309
+ except KeyError as e:
1310
+ # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
1311
+ logger.debug(f"Skipping join with unresolvable table: {e}")
1312
+ continue
1183
1313
  return joins
1184
1314
 
1185
1315
 
@@ -1231,12 +1361,12 @@ def _sqlglot_lineage_inner(
1231
1361
  schema_resolver: SchemaResolverInterface,
1232
1362
  default_db: Optional[str] = None,
1233
1363
  default_schema: Optional[str] = None,
1234
- default_dialect: Optional[str] = None,
1364
+ override_dialect: Optional[DialectOrStr] = None,
1235
1365
  ) -> SqlParsingResult:
1236
- if not default_dialect:
1237
- dialect = get_dialect(schema_resolver.platform)
1366
+ if override_dialect:
1367
+ dialect = get_dialect(override_dialect)
1238
1368
  else:
1239
- dialect = get_dialect(default_dialect)
1369
+ dialect = get_dialect(schema_resolver.platform)
1240
1370
 
1241
1371
  default_db = _normalize_db_or_schema(default_db, dialect)
1242
1372
  default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1423,7 +1553,7 @@ def _sqlglot_lineage_nocache(
1423
1553
  schema_resolver: SchemaResolverInterface,
1424
1554
  default_db: Optional[str] = None,
1425
1555
  default_schema: Optional[str] = None,
1426
- default_dialect: Optional[str] = None,
1556
+ override_dialect: Optional[DialectOrStr] = None,
1427
1557
  ) -> SqlParsingResult:
1428
1558
  """Parse a SQL statement and generate lineage information.
1429
1559
 
@@ -1441,8 +1571,8 @@ def _sqlglot_lineage_nocache(
1441
1571
  can be brittle with respect to missing schema information and complex
1442
1572
  SQL logic like UNNESTs.
1443
1573
 
1444
- The SQL dialect can be given as an argument called default_dialect or it can
1445
- be inferred from the schema_resolver's platform.
1574
+ The SQL dialect will be inferred from the schema_resolver's platform.
1575
+ That inference can be overridden by passing an override_dialect argument.
1446
1576
  The set of supported dialects is the same as sqlglot's. See their
1447
1577
  `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
1448
1578
  for the full list.
@@ -1457,7 +1587,7 @@ def _sqlglot_lineage_nocache(
1457
1587
  schema_resolver: The schema resolver to use for resolving table schemas.
1458
1588
  default_db: The default database to use for unqualified table names.
1459
1589
  default_schema: The default schema to use for unqualified table names.
1460
- default_dialect: A default dialect to override the dialect provided by 'schema_resolver'.
1590
+ override_dialect: Override the dialect provided by 'schema_resolver'.
1461
1591
 
1462
1592
  Returns:
1463
1593
  A SqlParsingResult object containing the parsed lineage information.
@@ -1482,10 +1612,32 @@ def _sqlglot_lineage_nocache(
1482
1612
  schema_resolver=schema_resolver,
1483
1613
  default_db=default_db,
1484
1614
  default_schema=default_schema,
1485
- default_dialect=default_dialect,
1615
+ override_dialect=override_dialect,
1486
1616
  )
1487
1617
  except Exception as e:
1488
1618
  return SqlParsingResult.make_from_error(e)
1619
+ except BaseException as e:
1620
+ # Check if this is a PanicException from SQLGlot's Rust tokenizer
1621
+ # We use runtime type checking instead of isinstance() because pyo3_runtime
1622
+ # is only available when sqlglot[rs] is installed and may not be importable
1623
+ # at module load time, but the exception can still be raised at runtime
1624
+ if (
1625
+ e.__class__.__name__ == "PanicException"
1626
+ and e.__class__.__module__ == "pyo3_runtime"
1627
+ ):
1628
+ # Handle pyo3_runtime.PanicException from SQLGlot's Rust tokenizer.
1629
+ # pyo3_runtime.PanicException inherits from BaseException (like SystemExit or
1630
+ # KeyboardInterrupt) rather than Exception, so it bypasses normal exception handling.
1631
+ # Avoid catching BaseException, as it includes KeyboardInterrupt
1632
+ # and would prevent Ctrl+C from working.
1633
+ wrapped_exception = Exception(
1634
+ f"pyo3_runtime.PanicException during SQL parsing: {e}"
1635
+ )
1636
+ wrapped_exception.__cause__ = e
1637
+ return SqlParsingResult.make_from_error(wrapped_exception)
1638
+ else:
1639
+ # Re-raise other BaseException types (SystemExit, KeyboardInterrupt, etc.)
1640
+ raise
1489
1641
 
1490
1642
 
1491
1643
  _sqlglot_lineage_cached = functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)(
@@ -1498,15 +1650,15 @@ def sqlglot_lineage(
1498
1650
  schema_resolver: SchemaResolverInterface,
1499
1651
  default_db: Optional[str] = None,
1500
1652
  default_schema: Optional[str] = None,
1501
- default_dialect: Optional[str] = None,
1653
+ override_dialect: Optional[DialectOrStr] = None,
1502
1654
  ) -> SqlParsingResult:
1503
1655
  if schema_resolver.includes_temp_tables():
1504
1656
  return _sqlglot_lineage_nocache(
1505
- sql, schema_resolver, default_db, default_schema, default_dialect
1657
+ sql, schema_resolver, default_db, default_schema, override_dialect
1506
1658
  )
1507
1659
  else:
1508
1660
  return _sqlglot_lineage_cached(
1509
- sql, schema_resolver, default_db, default_schema, default_dialect
1661
+ sql, schema_resolver, default_db, default_schema, override_dialect
1510
1662
  )
1511
1663
 
1512
1664
 
@@ -1558,6 +1710,7 @@ def create_lineage_sql_parsed_result(
1558
1710
  default_schema: Optional[str] = None,
1559
1711
  graph: Optional[DataHubGraph] = None,
1560
1712
  schema_aware: bool = True,
1713
+ override_dialect: Optional[DialectOrStr] = None,
1561
1714
  ) -> SqlParsingResult:
1562
1715
  schema_resolver = create_schema_resolver(
1563
1716
  platform=platform,
@@ -1577,6 +1730,7 @@ def create_lineage_sql_parsed_result(
1577
1730
  schema_resolver=schema_resolver,
1578
1731
  default_db=default_db,
1579
1732
  default_schema=default_schema,
1733
+ override_dialect=override_dialect,
1580
1734
  )
1581
1735
  except Exception as e:
1582
1736
  return SqlParsingResult.make_from_error(e)
@@ -40,9 +40,6 @@ def _get_dialect_str(platform: str) -> str:
40
40
  # let the fuzzy resolution logic handle it.
41
41
  # MariaDB is a fork of MySQL, so we reuse the same dialect.
42
42
  return "mysql, normalization_strategy = lowercase"
43
- # Dremio is based upon drill. Not 100% compatibility
44
- elif platform == "dremio":
45
- return "drill"
46
43
  else:
47
44
  return platform
48
45
 
@@ -115,6 +112,8 @@ def _expression_to_string(
115
112
  return expression.sql(dialect=get_dialect(platform))
116
113
 
117
114
 
115
+ PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")
116
+
118
117
  _BASIC_NORMALIZATION_RULES = {
119
118
  # Remove /* */ comments.
120
119
  re.compile(r"/\*.*?\*/", re.DOTALL): "",
@@ -130,7 +129,9 @@ _BASIC_NORMALIZATION_RULES = {
130
129
  re.compile(r"'[^']*'"): "?",
131
130
  # Replace sequences of IN/VALUES with a single placeholder.
132
131
  # The r" ?" makes it more robust to uneven spacing.
133
- re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
132
+ re.compile(
133
+ r"\b(IN|VALUES)\s*\( ?(?:%s|\$\d|\?)(?:, ?(?:%s|\$\d|\?))* ?\)", re.IGNORECASE
134
+ ): r"\1 (?)",
134
135
  # Normalize parenthesis spacing.
135
136
  re.compile(r"\( "): "(",
136
137
  re.compile(r" \)"): ")",
@@ -139,6 +140,9 @@ _BASIC_NORMALIZATION_RULES = {
139
140
  # e.g. "col1,col2" -> "col1, col2"
140
141
  re.compile(r"\b ,"): ",",
141
142
  re.compile(r"\b,\b"): ", ",
143
+ # MAKE SURE THAT THIS IS AFTER THE ABOVE REPLACEMENT
144
+ # Replace all versions of placeholders with generic ? placeholder.
145
+ PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION: "?",
142
146
  }
143
147
  _TABLE_NAME_NORMALIZATION_RULES = {
144
148
  # Replace UUID-like strings with a placeholder (both - and _ variants).
@@ -262,6 +266,10 @@ def get_query_fingerprint_debug(
262
266
  if not fast:
263
267
  dialect = get_dialect(platform)
264
268
  expression_sql = generalize_query(expression, dialect=dialect)
269
+ # Normalize placeholders for consistent fingerprinting -> this only needs to be backward compatible with earlier sqglot generated generalized queries where the placeholders were always ?
270
+ expression_sql = PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub(
271
+ "?", expression_sql
272
+ )
265
273
  else:
266
274
  expression_sql = generalize_query_fast(expression, dialect=platform)
267
275
  except (ValueError, sqlglot.errors.SqlglotError) as e:
@@ -208,9 +208,7 @@ class ToolMetaExtractor:
208
208
  Returns:
209
209
  bool: whether QueryLog entry is that of hex.
210
210
  """
211
- last_line = _get_last_line(entry.query_text)
212
-
213
- if not last_line.startswith("-- Hex query metadata:"):
211
+ if "-- Hex query metadata:" not in entry.query_text:
214
212
  return False
215
213
 
216
214
  entry.origin = HEX_PLATFORM_URN
@@ -16,6 +16,11 @@ from datahub._version import __version__, nice_version_name
16
16
  from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
17
17
  from datahub.cli.env_utils import get_boolean_env_variable
18
18
  from datahub.configuration.common import ExceptionWithProps
19
+ from datahub.configuration.env_vars import (
20
+ get_sentry_dsn,
21
+ get_sentry_environment,
22
+ get_telemetry_timeout,
23
+ )
19
24
  from datahub.metadata.schema_classes import _custom_package_path
20
25
  from datahub.utilities.perf_timer import PerfTimer
21
26
 
@@ -97,14 +102,14 @@ if any(var in os.environ for var in CI_ENV_VARS):
97
102
  if _custom_package_path:
98
103
  ENV_ENABLED = False
99
104
 
100
- TIMEOUT = int(os.environ.get("DATAHUB_TELEMETRY_TIMEOUT", "10"))
105
+ TIMEOUT = int(get_telemetry_timeout())
101
106
  MIXPANEL_ENDPOINT = "track.datahubproject.io/mp"
102
107
  MIXPANEL_TOKEN = "5ee83d940754d63cacbf7d34daa6f44a"
103
- SENTRY_DSN: Optional[str] = os.environ.get("SENTRY_DSN", None)
104
- SENTRY_ENVIRONMENT: str = os.environ.get("SENTRY_ENVIRONMENT", "dev")
108
+ SENTRY_DSN: Optional[str] = get_sentry_dsn()
109
+ SENTRY_ENVIRONMENT: str = get_sentry_environment()
105
110
 
106
111
 
107
- def _default_telemetry_properties() -> Dict[str, Any]:
112
+ def _default_global_properties() -> Dict[str, Any]:
108
113
  return {
109
114
  "datahub_version": nice_version_name(),
110
115
  "python_version": platform.python_version(),
@@ -122,6 +127,7 @@ class Telemetry:
122
127
  context_properties: Dict[str, Any] = {}
123
128
 
124
129
  def __init__(self):
130
+ self.global_properties = _default_global_properties()
125
131
  self.context_properties = {}
126
132
 
127
133
  if SENTRY_DSN:
@@ -247,6 +253,10 @@ class Telemetry:
247
253
 
248
254
  return False
249
255
 
256
+ def add_global_property(self, key: str, value: Any) -> None:
257
+ self.global_properties[key] = value
258
+ self._update_sentry_properties()
259
+
250
260
  def set_context(
251
261
  self,
252
262
  server: Optional["DataHubGraph"] = None,
@@ -257,16 +267,20 @@ class Telemetry:
257
267
  **(properties or {}),
258
268
  }
259
269
 
260
- if self.sentry_enabled:
261
- from sentry_sdk import set_tag
270
+ self._update_sentry_properties()
262
271
 
263
- properties = {
264
- **_default_telemetry_properties(),
265
- **self.context_properties,
266
- }
272
+ def _update_sentry_properties(self) -> None:
273
+ properties = {
274
+ **self.global_properties,
275
+ **self.context_properties,
276
+ }
277
+ if self.sentry_enabled:
278
+ import sentry_sdk
267
279
 
268
- for key in properties:
269
- set_tag(key, properties[key])
280
+ # Note: once we're on sentry-sdk 2.1.0+, we can use sentry_sdk.set_tags(properties)
281
+ # See https://github.com/getsentry/sentry-python/commit/6c960d752c7c7aff3fd7469d2e9ad98f19663aa8
282
+ for key, value in properties.items():
283
+ sentry_sdk.set_tag(key, value)
270
284
 
271
285
  def init_capture_exception(self) -> None:
272
286
  if self.sentry_enabled:
@@ -300,7 +314,7 @@ class Telemetry:
300
314
  try:
301
315
  self.mp.people_set(
302
316
  self.client_id,
303
- _default_telemetry_properties(),
317
+ self.global_properties,
304
318
  )
305
319
  except Exception as e:
306
320
  logger.debug(f"Error initializing telemetry: {e}")
@@ -334,7 +348,7 @@ class Telemetry:
334
348
  logger.debug(f"Sending telemetry for {event_name}")
335
349
 
336
350
  properties = {
337
- **_default_telemetry_properties(),
351
+ **self.global_properties,
338
352
  **self.context_properties,
339
353
  **properties,
340
354
  }
@@ -1,12 +1,18 @@
1
1
  import pathlib
2
+ from typing import Sequence
2
3
 
3
4
  from datahub.sdk.entity import Entity
4
5
  from datahub.testing import mce_helpers
5
6
 
6
7
 
7
- def assert_entity_golden(entity: Entity, golden_path: pathlib.Path) -> None:
8
+ def assert_entity_golden(
9
+ entity: Entity,
10
+ golden_path: pathlib.Path,
11
+ ignore_paths: Sequence[str] = (),
12
+ ) -> None:
8
13
  mce_helpers.check_goldens_stream(
9
14
  outputs=entity.as_mcps(),
10
15
  golden_path=golden_path,
11
16
  ignore_order=False,
17
+ ignore_paths=ignore_paths,
12
18
  )