acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0

datahub/sql_parsing/sql_parsing_aggregator.py

@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
     ToolMetaExtractorReport,
 )
 from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
+from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,
@@ -140,6 +141,7 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
     def make_created_audit_stamp(self) -> models.AuditStampClass:
@@ -263,7 +265,7 @@ class PreparsedQuery:
     query_type_props: QueryTypeProps = dataclasses.field(
         default_factory=lambda: QueryTypeProps()
     )
-    # Use this to store addtitional key-value information about query for debugging
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
@@ -948,6 +950,7 @@ class SqlParsingAggregator(Closeable):
                 column_usage=parsed.column_usage or {},
                 confidence_score=parsed.confidence_score,
                 used_temp_tables=session_has_temp_tables,
+                extra_info=parsed.extra_info,
                 origin=parsed.origin,
             )
         )
@@ -1491,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
            return
 
        # If a query doesn't involve any allowed tables, skip it.
-        if downstream_urn is None and not any(
-            self.is_allowed_table(urn) for urn in query.upstreams
-        ):
+        if (
+            downstream_urn is None or not self.is_allowed_table(downstream_urn)
+        ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
            self.report.num_queries_skipped_due_to_filters += 1
            return
 
@@ -1574,27 +1577,33 @@
 
        @dataclasses.dataclass
        class QueryLineageInfo:
-            upstreams: List[UrnStr]  # this is direct upstreams, with *no temp tables*
-            column_lineage: List[ColumnLineageInfo]
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
            confidence_score: float
 
            def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams += other_query.upstreams
-                self.column_lineage += other_query.column_lineage
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                self.confidence_score = min(
                    self.confidence_score, other_query.confidence_score
                )
 
+        cache: Dict[str, QueryLineageInfo] = {}
+
        def _recurse_into_query(
            query: QueryMetadata, recursion_path: List[QueryId]
        ) -> QueryLineageInfo:
            if query.query_id in recursion_path:
                # This is a cycle, so we just return the query as-is.
                return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                    confidence_score=query.confidence_score,
                )
+            if query.query_id in cache:
+                return cache[query.query_id]
            recursion_path = [*recursion_path, query.query_id]
            composed_of_queries.add(query.query_id)
 
@@ -1609,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
                upstream_query = self._query_map.get(upstream_query_id)
                if (
                    upstream_query
-                    and upstream_query.query_id not in composed_of_queries
+                    and upstream_query.query_id not in recursion_path
                ):
                    temp_query_lineage_info = _recurse_into_query(
                        upstream_query, recursion_path
@@ -1669,11 +1678,14 @@
                ]
            )
 
-            return QueryLineageInfo(
-                upstreams=list(new_upstreams),
-                column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                confidence_score=new_confidence_score,
            )
+            cache[query.query_id] = ret
+
+            return ret
 
        resolved_lineage_info = _recurse_into_query(base_query, [])
 
@@ -1706,15 +1718,15 @@
        )
 
        merged_query_text = ";\n\n".join(
-            [q.formatted_query_string for q in ordered_queries]
+            deduplicate_list([q.formatted_query_string for q in ordered_queries])
        )
 
        resolved_query = dataclasses.replace(
            base_query,
            query_id=composite_query_id,
            formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
            confidence_score=resolved_lineage_info.confidence_score,
        )
 
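
The hunks above swap plain lists for ordered, deduplicated collections and add a per-call `cache` so that `_recurse_into_query` resolves each temp-table query only once. Below is a minimal, self-contained sketch of that memoization pattern, not the aggregator itself: `Query` and `resolve_upstreams` are hypothetical stand-ins, and a plain `set` is used where the real code keeps an ordered collection.

    from dataclasses import dataclass
    from typing import Dict, List, Set


    @dataclass
    class Query:
        query_id: str
        upstreams: List[str]              # upstream table urns
        temp_upstream_queries: List[str]  # ids of queries that produced temp tables


    def resolve_upstreams(queries: Dict[str, Query], start_id: str) -> Set[str]:
        cache: Dict[str, Set[str]] = {}

        def _recurse(query_id: str, path: List[str]) -> Set[str]:
            if query_id in path:
                # Cycle: return the query's own upstreams, mirroring the recursion_path guard.
                return set(queries[query_id].upstreams)
            if query_id in cache:
                # Memoized result, mirroring the new cache dict above.
                return cache[query_id]
            query = queries[query_id]
            result = set(query.upstreams)
            for upstream_id in query.temp_upstream_queries:
                result |= _recurse(upstream_id, [*path, query_id])
            cache[query_id] = result
            return result

        return _recurse(start_id, [])

Without the cache, queries that share the same temp-table upstreams are re-walked once per reference; with it, each query id is resolved at most once per top-level call.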

datahub/sql_parsing/sqlglot_lineage.py

@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
     QueryTypeProps,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     get_dialect,
     get_query_fingerprint_debug,
     is_dialect_instance,
@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
 
 
 class DownstreamColumnRef(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+    auto-generated class from .pdl model files. We need generic solution allowing us to either:
+    1. Implement hashing for .pdl model objects
+    2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+       hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+       to understand that instruction as well.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     table: Optional[Urn] = None
     column: str
     column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
            return v
        return SchemaFieldDataTypeClass.from_obj(v)
 
+    def __hash__(self) -> int:
+        return hash((self.table, self.column, self.native_column_type))
+
 
-class ColumnTransformation(_ParserBaseModel):
+class ColumnTransformation(_FrozenModel):
     is_direct_copy: bool
     column_logic: str
 
@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):
 
 
 class ColumnLineageInfo(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+    depending on it.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     downstream: DownstreamColumnRef
     upstreams: List[ColumnRef]
 
     logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
 
+    def __hash__(self) -> int:
+        return hash((self.downstream, tuple(self.upstreams), self.logic))
+
 
 class _JoinInfo(_ParserBaseModel):
     join_type: str
@@ -1231,12 +1256,12 @@ def _sqlglot_lineage_inner(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
-    if not default_dialect:
-        dialect = get_dialect(schema_resolver.platform)
+    if override_dialect:
+        dialect = get_dialect(override_dialect)
     else:
-        dialect = get_dialect(default_dialect)
+        dialect = get_dialect(schema_resolver.platform)
 
     default_db = _normalize_db_or_schema(default_db, dialect)
     default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1423,7 +1448,7 @@ def _sqlglot_lineage_nocache(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     """Parse a SQL statement and generate lineage information.
 
@@ -1441,8 +1466,8 @@
     can be brittle with respect to missing schema information and complex
     SQL logic like UNNESTs.
 
-    The SQL dialect can be given as an argument called default_dialect or it can
-    be inferred from the schema_resolver's platform.
+    The SQL dialect will be inferred from the schema_resolver's platform.
+    That inference can be overridden by passing an override_dialect argument.
     The set of supported dialects is the same as sqlglot's. See their
     `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
     for the full list.
@@ -1457,7 +1482,7 @@
        schema_resolver: The schema resolver to use for resolving table schemas.
        default_db: The default database to use for unqualified table names.
        default_schema: The default schema to use for unqualified table names.
-        default_dialect: A default dialect to override the dialect provided by 'schema_resolver'.
+        override_dialect: Override the dialect provided by 'schema_resolver'.
 
     Returns:
        A SqlParsingResult object containing the parsed lineage information.
@@ -1482,10 +1507,32 @@
            schema_resolver=schema_resolver,
            default_db=default_db,
            default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
        )
    except Exception as e:
        return SqlParsingResult.make_from_error(e)
+    except BaseException as e:
+        # Check if this is a PanicException from SQLGlot's Rust tokenizer
+        # We use runtime type checking instead of isinstance() because pyo3_runtime
+        # is only available when sqlglot[rs] is installed and may not be importable
+        # at module load time, but the exception can still be raised at runtime
+        if (
+            e.__class__.__name__ == "PanicException"
+            and e.__class__.__module__ == "pyo3_runtime"
+        ):
+            # Handle pyo3_runtime.PanicException from SQLGlot's Rust tokenizer.
+            # pyo3_runtime.PanicException inherits from BaseException (like SystemExit or
+            # KeyboardInterrupt) rather than Exception, so it bypasses normal exception handling.
+            # Avoid catching BaseException, as it includes KeyboardInterrupt
+            # and would prevent Ctrl+C from working.
+            wrapped_exception = Exception(
+                f"pyo3_runtime.PanicException during SQL parsing: {e}"
+            )
+            wrapped_exception.__cause__ = e
+            return SqlParsingResult.make_from_error(wrapped_exception)
+        else:
+            # Re-raise other BaseException types (SystemExit, KeyboardInterrupt, etc.)
+            raise
 
 
 _sqlglot_lineage_cached = functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)(
@@ -1498,15 +1545,15 @@ def sqlglot_lineage(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     if schema_resolver.includes_temp_tables():
        return _sqlglot_lineage_nocache(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
        )
     else:
        return _sqlglot_lineage_cached(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
        )
 
 
@@ -1558,6 +1605,7 @@ def create_lineage_sql_parsed_result(
     default_schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     schema_resolver = create_schema_resolver(
        platform=platform,
@@ -1577,6 +1625,7 @@
            schema_resolver=schema_resolver,
            default_db=default_db,
            default_schema=default_schema,
+            override_dialect=override_dialect,
        )
    except Exception as e:
        return SqlParsingResult.make_from_error(e)
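
The new `except BaseException` branch above identifies SQLGlot's Rust tokenizer panic by class name and module rather than `isinstance()`, because `pyo3_runtime` ships only with `sqlglot[rs]` and may not be importable. A small standalone sketch of that classification logic, with illustrative return strings only:

    def classify(e: BaseException) -> str:
        # Ordinary exceptions are handled by the earlier `except Exception` clause.
        if isinstance(e, Exception):
            return "ordinary error: wrap into SqlParsingResult"
        # Identify the Rust panic without importing pyo3_runtime.
        if (
            e.__class__.__name__ == "PanicException"
            and e.__class__.__module__ == "pyo3_runtime"
        ):
            return "rust panic: wrap, keep the original as __cause__"
        # Everything else (SystemExit, KeyboardInterrupt, ...) must propagate.
        return "re-raise"


    print(classify(ValueError("bad sql")))  # ordinary error: wrap into SqlParsingResult
    print(classify(KeyboardInterrupt()))    # re-raise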

datahub/telemetry/telemetry.py

@@ -104,7 +104,7 @@ SENTRY_DSN: Optional[str] = os.environ.get("SENTRY_DSN", None)
 SENTRY_ENVIRONMENT: str = os.environ.get("SENTRY_ENVIRONMENT", "dev")
 
 
-def _default_telemetry_properties() -> Dict[str, Any]:
+def _default_global_properties() -> Dict[str, Any]:
     return {
        "datahub_version": nice_version_name(),
        "python_version": platform.python_version(),
@@ -122,6 +122,7 @@ class Telemetry:
     context_properties: Dict[str, Any] = {}
 
     def __init__(self):
+        self.global_properties = _default_global_properties()
        self.context_properties = {}
 
        if SENTRY_DSN:
@@ -247,6 +248,10 @@ class Telemetry:
 
        return False
 
+    def add_global_property(self, key: str, value: Any) -> None:
+        self.global_properties[key] = value
+        self._update_sentry_properties()
+
     def set_context(
        self,
        server: Optional["DataHubGraph"] = None,
@@ -257,16 +262,17 @@ class Telemetry:
            **(properties or {}),
        }
 
-        if self.sentry_enabled:
-            from sentry_sdk import set_tag
+        self._update_sentry_properties()
 
-            properties = {
-                **_default_telemetry_properties(),
-                **self.context_properties,
-            }
+    def _update_sentry_properties(self) -> None:
+        properties = {
+            **self.global_properties,
+            **self.context_properties,
+        }
+        if self.sentry_enabled:
+            import sentry_sdk
 
-            for key in properties:
-                set_tag(key, properties[key])
+            sentry_sdk.set_tags(properties)
 
     def init_capture_exception(self) -> None:
        if self.sentry_enabled:
@@ -300,7 +306,7 @@ class Telemetry:
        try:
            self.mp.people_set(
                self.client_id,
-                _default_telemetry_properties(),
+                self.global_properties,
            )
        except Exception as e:
            logger.debug(f"Error initializing telemetry: {e}")
@@ -334,7 +340,7 @@ class Telemetry:
        logger.debug(f"Sending telemetry for {event_name}")
 
        properties = {
-            **_default_telemetry_properties(),
+            **self.global_properties,
            **self.context_properties,
            **properties,
        }
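
In the rewritten payload construction above, properties are merged as global, then context, then event-specific, so later expansions take precedence. A toy illustration of that precedence (all values below are made up):

    global_properties = {"datahub_version": "1.2.0.1rc1", "os": "linux"}
    context_properties = {"server_type": "quickstart", "os": "macos"}
    event_properties = {"status": "completed", "duration": 1.5}

    merged = {**global_properties, **context_properties, **event_properties}
    print(merged["os"])      # "macos": context properties override global defaults
    print(merged["status"])  # "completed": event properties override both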

datahub/testing/sdk_v2_helpers.py

@@ -1,12 +1,18 @@
 import pathlib
+from typing import Sequence
 
 from datahub.sdk.entity import Entity
 from datahub.testing import mce_helpers
 
 
-def assert_entity_golden(entity: Entity, golden_path: pathlib.Path) -> None:
+def assert_entity_golden(
+    entity: Entity,
+    golden_path: pathlib.Path,
+    ignore_paths: Sequence[str] = (),
+) -> None:
     mce_helpers.check_goldens_stream(
        outputs=entity.as_mcps(),
        golden_path=golden_path,
        ignore_order=False,
+        ignore_paths=ignore_paths,
     )

datahub/upgrade/upgrade.py

@@ -7,7 +7,7 @@ from typing import Any, Callable, Optional, Tuple, TypeVar
 
 import click
 import humanfriendly
-from packaging.version import Version
+from packaging.version import InvalidVersion, Version
 from pydantic import BaseModel
 
 from datahub._version import __version__
@@ -28,10 +28,23 @@ class VersionStats(BaseModel, arbitrary_types_allowed=True):
     release_date: Optional[datetime] = None
 
 
+def _safe_version_stats(version_string: str) -> Optional[VersionStats]:
+    """
+    Safely create a VersionStats object from a version string.
+    Returns None if the version string is invalid.
+    """
+    try:
+        return VersionStats(version=Version(version_string), release_date=None)
+    except InvalidVersion:
+        log.warning(f"Invalid version format received: {version_string!r}")
+        return None
+
+
 class ServerVersionStats(BaseModel):
     current: VersionStats
     latest: Optional[VersionStats] = None
     current_server_type: Optional[str] = None
+    current_server_default_cli_version: Optional[VersionStats] = None
 
 
 class ClientVersionStats(BaseModel):
@@ -44,7 +57,7 @@ class DataHubVersionStats(BaseModel):
     client: ClientVersionStats
 
 
-async def get_client_version_stats():
+async def get_client_version_stats() -> ClientVersionStats:
     import aiohttp
 
     current_version_string = __version__
@@ -52,6 +65,7 @@ async def get_client_version_stats():
     client_version_stats: ClientVersionStats = ClientVersionStats(
        current=VersionStats(version=current_version, release_date=None), latest=None
     )
+
     async with aiohttp.ClientSession() as session:
        pypi_url = "https://pypi.org/pypi/acryl_datahub/json"
        async with session.get(pypi_url) as resp:
@@ -131,7 +145,7 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceConfig:
 
 async def get_server_version_stats(
     server: Optional[DataHubGraph] = None,
-) -> Tuple[Optional[str], Optional[Version], Optional[datetime]]:
+) -> Tuple[Optional[str], Optional[Version], Optional[str], Optional[datetime]]:
     import aiohttp
 
     server_config: Optional[RestServiceConfig] = None
@@ -151,12 +165,13 @@
 
     server_type = None
     server_version: Optional[Version] = None
+    current_server_default_cli_version = None
     current_server_release_date = None
     if server_config:
        server_version_string = server_config.service_version
        commit_hash = server_config.commit_hash
        server_type = server_config.server_type
-
+        current_server_default_cli_version = server_config.default_cli_version
        if server_type == "quickstart" and commit_hash:
            async with aiohttp.ClientSession(
                headers={"Accept": "application/vnd.github.v3+json"}
@@ -171,7 +186,12 @@
     if server_version_string and server_version_string.startswith("v"):
        server_version = Version(server_version_string[1:])
 
-    return (server_type, server_version, current_server_release_date)
+    return (
+        server_type,
+        server_version,
+        current_server_default_cli_version,
+        current_server_release_date,
+    )
 
 
 def retrieve_version_stats(
@@ -214,6 +234,7 @@ async def _retrieve_version_stats(
     (
        current_server_type,
        current_server_version,
+        current_server_default_cli_version,
        current_server_release_date,
     ) = results[2]
 
@@ -223,6 +244,11 @@
        current=VersionStats(
            version=current_server_version, release_date=current_server_release_date
        ),
+        current_server_default_cli_version=(
+            _safe_version_stats(current_server_default_cli_version)
+            if current_server_default_cli_version
+            else None
+        ),
        latest=(
            VersionStats(version=last_server_version, release_date=last_server_date)
            if last_server_version
@@ -255,21 +281,14 @@ def valid_client_version(version: Version) -> bool:
     """Only version strings like 0.4.5 and 0.6.7.8 are valid. 0.8.6.7rc1 is not"""
     if version.is_prerelease or version.is_postrelease or version.is_devrelease:
        return False
-    if version.major == 0 and version.minor in [8, 9, 10, 11]:
-        return True
-
-    return False
+    return True
 
 
 def valid_server_version(version: Version) -> bool:
     """Only version strings like 0.8.x, 0.9.x or 0.10.x are valid. 0.1.x is not"""
     if version.is_prerelease or version.is_postrelease or version.is_devrelease:
        return False
-
-    if version.major == 0 and version.minor in [8, 9, 10]:
-        return True
-
-    return False
+    return True
 
 
 def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
@@ -291,6 +310,27 @@ def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
     return server.version.micro - client.version.micro
 
 
+def is_server_default_cli_ahead(version_stats: DataHubVersionStats) -> bool:
+    """
+    Check if the server default CLI version is ahead of the current CLI version.
+    Returns True if server default CLI is newer and both versions are valid.
+    """
+    if not version_stats.server.current_server_default_cli_version:
+        return False
+
+    current_cli = version_stats.client.current
+    server_default_cli = version_stats.server.current_server_default_cli_version
+
+    is_valid_client_version = valid_client_version(current_cli.version)
+    is_valid_server_version = valid_client_version(server_default_cli.version)
+
+    if not (is_valid_client_version and is_valid_server_version):
+        return False
+
+    compatibility_result = is_client_server_compatible(current_cli, server_default_cli)
+    return compatibility_result > 0
+
+
 def _maybe_print_upgrade_message(
     version_stats: Optional[DataHubVersionStats],
 ) -> None:
@@ -429,6 +469,8 @@ def check_upgrade_post(
 
 
 def check_upgrade(func: Callable[..., T]) -> Callable[..., T]:
+    log.debug(f"Checking upgrade for {func.__module__}.{func.__name__}")
+
     @wraps(func)
     def async_wrapper(*args: Any, **kwargs: Any) -> Any:
        with PerfTimer() as timer:
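
The new `_safe_version_stats` helper above exists because `packaging.version.Version` raises `InvalidVersion` on strings that are not PEP 440 compliant, which a server-supplied default CLI version could plausibly be. A quick standalone illustration of that behavior:

    from packaging.version import InvalidVersion, Version

    for raw in ["1.2.0.1rc1", "v1.2.0", "not-a-version"]:
        try:
            print(raw, "->", Version(raw))  # valid strings parse (a leading "v" is allowed)
        except InvalidVersion:
            print(raw, "-> invalid; the helper would log a warning and return None")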

datahub/utilities/server_config_util.py

@@ -183,6 +183,14 @@ class RestServiceConfig:
        managed_ingestion = self.raw_config.get("managedIngestion") or {}
        return managed_ingestion.get("enabled", False)
 
+    @property
+    def default_cli_version(self) -> Optional[str]:
+        """
+        Get the default CLI version.
+        """
+        managed_ingestion = self.raw_config.get("managedIngestion") or {}
+        return managed_ingestion.get("defaultCliVersion")
+
     @property
     def is_datahub_cloud(self) -> bool:
        """

datahub/utilities/sqlalchemy_query_combiner.py

@@ -272,8 +272,11 @@ class SQLAlchemyQueryCombiner:
            self.report.uncombined_queries_issued += 1
            return _sa_execute_underlying_method(conn, query, *args, **kwargs)
 
-        with _sa_execute_method_patching_lock, unittest.mock.patch(
-            "sqlalchemy.engine.Connection.execute", _sa_execute_fake
+        with (
+            _sa_execute_method_patching_lock,
+            unittest.mock.patch(
+                "sqlalchemy.engine.Connection.execute", _sa_execute_fake
+            ),
        ):
            yield self
 

datahub/utilities/stats_collections.py

@@ -56,3 +56,7 @@ class TopKDict(DefaultDict[_KT, _VT]):
 
 def int_top_k_dict() -> TopKDict[str, int]:
     return TopKDict(int)
+
+
+def float_top_k_dict() -> TopKDict[str, float]:
+    return TopKDict(float)