acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub has been flagged as potentially problematic; consult the package registry's advisory page for details.

Files changed (78):
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/api/report.py +183 -35
  26. datahub/ingestion/autogenerated/capability_summary.json +3431 -0
  27. datahub/ingestion/autogenerated/lineage.json +401 -0
  28. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  29. datahub/ingestion/extractor/schema_util.py +13 -4
  30. datahub/ingestion/graph/client.py +2 -2
  31. datahub/ingestion/run/pipeline.py +47 -1
  32. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  33. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  34. datahub/ingestion/source/common/subtypes.py +1 -1
  35. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  37. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  38. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  39. datahub/ingestion/source/ge_data_profiler.py +28 -20
  40. datahub/ingestion/source/identity/okta.py +0 -13
  41. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  42. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  43. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  44. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  45. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  46. datahub/ingestion/source/redshift/usage.py +4 -3
  47. datahub/ingestion/source/s3/source.py +19 -3
  48. datahub/ingestion/source/sigma/sigma.py +6 -1
  49. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  50. datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
  51. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  52. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  53. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  54. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  55. datahub/ingestion/source/sql/sql_common.py +4 -0
  56. datahub/ingestion/source/sql/vertica.py +0 -4
  57. datahub/ingestion/source/sql_queries.py +2 -2
  58. datahub/ingestion/source/superset.py +56 -1
  59. datahub/ingestion/source/tableau/tableau.py +40 -34
  60. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  61. datahub/ingestion/source/unity/proxy.py +4 -3
  62. datahub/ingestion/source/unity/source.py +19 -9
  63. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  64. datahub/metadata/_internal_schema_classes.py +85 -4
  65. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  66. datahub/metadata/schema.avsc +54 -1
  67. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  68. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  69. datahub/sdk/lineage_client.py +2 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
  71. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  72. datahub/upgrade/upgrade.py +46 -13
  73. datahub/utilities/server_config_util.py +8 -0
  74. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  75. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
  76. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
  77. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
  78. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
@@ -14127,26 +14127,39 @@ class CorpUserHomePageSettingsClass(DictWrapper):
14127
14127
 
14128
14128
  RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.identity.CorpUserHomePageSettings")
14129
14129
  def __init__(self,
14130
- pageTemplate: str,
14130
+ pageTemplate: Union[None, str]=None,
14131
+ dismissedAnnouncements: Union[None, List[str]]=None,
14131
14132
  ):
14132
14133
  super().__init__()
14133
14134
 
14134
14135
  self.pageTemplate = pageTemplate
14136
+ self.dismissedAnnouncements = dismissedAnnouncements
14135
14137
 
14136
14138
  def _restore_defaults(self) -> None:
14137
- self.pageTemplate = str()
14139
+ self.pageTemplate = self.RECORD_SCHEMA.fields_dict["pageTemplate"].default
14140
+ self.dismissedAnnouncements = self.RECORD_SCHEMA.fields_dict["dismissedAnnouncements"].default
14138
14141
 
14139
14142
 
14140
14143
  @property
14141
- def pageTemplate(self) -> str:
14144
+ def pageTemplate(self) -> Union[None, str]:
14142
14145
  """The page template that will be rendered in the UI by default for this user"""
14143
14146
  return self._inner_dict.get('pageTemplate') # type: ignore
14144
14147
 
14145
14148
  @pageTemplate.setter
14146
- def pageTemplate(self, value: str) -> None:
14149
+ def pageTemplate(self, value: Union[None, str]) -> None:
14147
14150
  self._inner_dict['pageTemplate'] = value
14148
14151
 
14149
14152
 
14153
+ @property
14154
+ def dismissedAnnouncements(self) -> Union[None, List[str]]:
14155
+ """The list of announcement urns that have been dismissed by the user"""
14156
+ return self._inner_dict.get('dismissedAnnouncements') # type: ignore
14157
+
14158
+ @dismissedAnnouncements.setter
14159
+ def dismissedAnnouncements(self, value: Union[None, List[str]]) -> None:
14160
+ self._inner_dict['dismissedAnnouncements'] = value
14161
+
14162
+
14150
14163
  class CorpUserInfoClass(_Aspect):
14151
14164
  """Linkedin corp user information"""
14152
14165
 
@@ -24394,6 +24407,59 @@ class DataHubSecretValueClass(_Aspect):
24394
24407
  self._inner_dict['created'] = value
24395
24408
 
24396
24409
 
24410
+ class ApplicationsSettingsClass(DictWrapper):
24411
+ # No docs available.
24412
+
24413
+ RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.settings.global.ApplicationsSettings")
24414
+ def __init__(self,
24415
+ enabled: bool,
24416
+ config: Union[None, str]=None,
24417
+ configVersion: Union[None, str]=None,
24418
+ ):
24419
+ super().__init__()
24420
+
24421
+ self.enabled = enabled
24422
+ self.config = config
24423
+ self.configVersion = configVersion
24424
+
24425
+ def _restore_defaults(self) -> None:
24426
+ self.enabled = bool()
24427
+ self.config = self.RECORD_SCHEMA.fields_dict["config"].default
24428
+ self.configVersion = self.RECORD_SCHEMA.fields_dict["configVersion"].default
24429
+
24430
+
24431
+ @property
24432
+ def enabled(self) -> bool:
24433
+ # No docs available.
24434
+ return self._inner_dict.get('enabled') # type: ignore
24435
+
24436
+ @enabled.setter
24437
+ def enabled(self, value: bool) -> None:
24438
+ self._inner_dict['enabled'] = value
24439
+
24440
+
24441
+ @property
24442
+ def config(self) -> Union[None, str]:
24443
+ """The configuration for the feature, in JSON format."""
24444
+ return self._inner_dict.get('config') # type: ignore
24445
+
24446
+ @config.setter
24447
+ def config(self, value: Union[None, str]) -> None:
24448
+ self._inner_dict['config'] = value
24449
+
24450
+
24451
+ @property
24452
+ def configVersion(self) -> Union[None, str]:
24453
+ """The version of the configuration schema that has been used to serialize
24454
+ the config.
24455
+ If not provided, the version is assumed to be the latest version."""
24456
+ return self._inner_dict.get('configVersion') # type: ignore
24457
+
24458
+ @configVersion.setter
24459
+ def configVersion(self, value: Union[None, str]) -> None:
24460
+ self._inner_dict['configVersion'] = value
24461
+
24462
+
24397
24463
  class DocPropagationFeatureSettingsClass(DictWrapper):
24398
24464
  # No docs available.
24399
24465
 
@@ -24502,6 +24568,7 @@ class GlobalSettingsInfoClass(_Aspect):
24502
24568
  views: Union[None, "GlobalViewsSettingsClass"]=None,
24503
24569
  docPropagation: Optional[Union["DocPropagationFeatureSettingsClass", None]]=None,
24504
24570
  homePage: Union[None, "GlobalHomePageSettingsClass"]=None,
24571
+ applications: Union[None, "ApplicationsSettingsClass"]=None,
24505
24572
  ):
24506
24573
  super().__init__()
24507
24574
 
@@ -24513,12 +24580,14 @@ class GlobalSettingsInfoClass(_Aspect):
24513
24580
  else:
24514
24581
  self.docPropagation = docPropagation
24515
24582
  self.homePage = homePage
24583
+ self.applications = applications
24516
24584
 
24517
24585
  def _restore_defaults(self) -> None:
24518
24586
  self.sso = self.RECORD_SCHEMA.fields_dict["sso"].default
24519
24587
  self.views = self.RECORD_SCHEMA.fields_dict["views"].default
24520
24588
  self.docPropagation = _json_converter.from_json_object(self.RECORD_SCHEMA.fields_dict["docPropagation"].default, writers_schema=self.RECORD_SCHEMA.fields_dict["docPropagation"].type)
24521
24589
  self.homePage = self.RECORD_SCHEMA.fields_dict["homePage"].default
24590
+ self.applications = self.RECORD_SCHEMA.fields_dict["applications"].default
24522
24591
 
24523
24592
 
24524
24593
  @property
@@ -24561,6 +24630,16 @@ class GlobalSettingsInfoClass(_Aspect):
24561
24630
  self._inner_dict['homePage'] = value
24562
24631
 
24563
24632
 
24633
+ @property
24634
+ def applications(self) -> Union[None, "ApplicationsSettingsClass"]:
24635
+ """Settings related to applications. If not enabled, applications won't show up in navigation"""
24636
+ return self._inner_dict.get('applications') # type: ignore
24637
+
24638
+ @applications.setter
24639
+ def applications(self, value: Union[None, "ApplicationsSettingsClass"]) -> None:
24640
+ self._inner_dict['applications'] = value
24641
+
24642
+
24564
24643
  class GlobalViewsSettingsClass(DictWrapper):
24565
24644
  """Settings for DataHub Views feature."""
24566
24645
 
@@ -27086,6 +27165,7 @@ __SCHEMA_TYPES = {
27086
27165
  'com.linkedin.pegasus2avro.schemafield.SchemaFieldAliases': SchemaFieldAliasesClass,
27087
27166
  'com.linkedin.pegasus2avro.schemafield.SchemaFieldInfo': SchemaFieldInfoClass,
27088
27167
  'com.linkedin.pegasus2avro.secret.DataHubSecretValue': DataHubSecretValueClass,
27168
+ 'com.linkedin.pegasus2avro.settings.global.ApplicationsSettings': ApplicationsSettingsClass,
27089
27169
  'com.linkedin.pegasus2avro.settings.global.DocPropagationFeatureSettings': DocPropagationFeatureSettingsClass,
27090
27170
  'com.linkedin.pegasus2avro.settings.global.GlobalHomePageSettings': GlobalHomePageSettingsClass,
27091
27171
  'com.linkedin.pegasus2avro.settings.global.GlobalSettingsInfo': GlobalSettingsInfoClass,
@@ -27592,6 +27672,7 @@ __SCHEMA_TYPES = {
27592
27672
  'SchemaFieldAliases': SchemaFieldAliasesClass,
27593
27673
  'SchemaFieldInfo': SchemaFieldInfoClass,
27594
27674
  'DataHubSecretValue': DataHubSecretValueClass,
27675
+ 'ApplicationsSettings': ApplicationsSettingsClass,
27595
27676
  'DocPropagationFeatureSettings': DocPropagationFeatureSettingsClass,
27596
27677
  'GlobalHomePageSettings': GlobalHomePageSettingsClass,
27597
27678
  'GlobalSettingsInfo': GlobalSettingsInfoClass,
@@ -7,6 +7,7 @@
7
7
  # pylint: skip-file
8
8
  # fmt: off
9
9
  # isort: skip_file
10
+ from ......schema_classes import ApplicationsSettingsClass
10
11
  from ......schema_classes import DocPropagationFeatureSettingsClass
11
12
  from ......schema_classes import GlobalHomePageSettingsClass
12
13
  from ......schema_classes import GlobalSettingsInfoClass
@@ -15,6 +16,7 @@ from ......schema_classes import OidcSettingsClass
15
16
  from ......schema_classes import SsoSettingsClass
16
17
 
17
18
 
19
+ ApplicationsSettings = ApplicationsSettingsClass
18
20
  DocPropagationFeatureSettings = DocPropagationFeatureSettingsClass
19
21
  GlobalHomePageSettings = GlobalHomePageSettingsClass
20
22
  GlobalSettingsInfo = GlobalSettingsInfoClass
@@ -751,9 +751,25 @@
751
751
  "class": "com.linkedin.pegasus2avro.common.urn.Urn"
752
752
  },
753
753
  "Urn": "Urn",
754
- "type": "string",
754
+ "type": [
755
+ "null",
756
+ "string"
757
+ ],
755
758
  "name": "pageTemplate",
759
+ "default": null,
756
760
  "doc": "The page template that will be rendered in the UI by default for this user"
761
+ },
762
+ {
763
+ "type": [
764
+ "null",
765
+ {
766
+ "type": "array",
767
+ "items": "string"
768
+ }
769
+ ],
770
+ "name": "dismissedAnnouncements",
771
+ "default": null,
772
+ "doc": "The list of announcement urns that have been dismissed by the user"
757
773
  }
758
774
  ],
759
775
  "doc": "Settings related to the home page for a user"
@@ -18531,6 +18547,43 @@
18531
18547
  "name": "homePage",
18532
18548
  "default": null,
18533
18549
  "doc": "Global settings related to the home page for an instance"
18550
+ },
18551
+ {
18552
+ "type": [
18553
+ "null",
18554
+ {
18555
+ "type": "record",
18556
+ "name": "ApplicationsSettings",
18557
+ "namespace": "com.linkedin.pegasus2avro.settings.global",
18558
+ "fields": [
18559
+ {
18560
+ "type": "boolean",
18561
+ "name": "enabled"
18562
+ },
18563
+ {
18564
+ "type": [
18565
+ "null",
18566
+ "string"
18567
+ ],
18568
+ "name": "config",
18569
+ "default": null,
18570
+ "doc": "The configuration for the feature, in JSON format."
18571
+ },
18572
+ {
18573
+ "type": [
18574
+ "null",
18575
+ "string"
18576
+ ],
18577
+ "name": "configVersion",
18578
+ "default": null,
18579
+ "doc": "The version of the configuration schema that has been used to serialize\n the config.\nIf not provided, the version is assumed to be the latest version."
18580
+ }
18581
+ ]
18582
+ }
18583
+ ],
18584
+ "name": "applications",
18585
+ "default": null,
18586
+ "doc": "Settings related to applications. If not enabled, applications won't show up in navigation"
18534
18587
  }
18535
18588
  ],
18536
18589
  "doc": "DataHub Global platform settings. Careful - these should not be modified by the outside world!"
@@ -172,10 +172,26 @@
172
172
  "java": {
173
173
  "class": "com.linkedin.pegasus2avro.common.urn.Urn"
174
174
  },
175
- "type": "string",
175
+ "type": [
176
+ "null",
177
+ "string"
178
+ ],
176
179
  "name": "pageTemplate",
180
+ "default": null,
177
181
  "doc": "The page template that will be rendered in the UI by default for this user",
178
182
  "Urn": "Urn"
183
+ },
184
+ {
185
+ "type": [
186
+ "null",
187
+ {
188
+ "type": "array",
189
+ "items": "string"
190
+ }
191
+ ],
192
+ "name": "dismissedAnnouncements",
193
+ "default": null,
194
+ "doc": "The list of announcement urns that have been dismissed by the user"
179
195
  }
180
196
  ],
181
197
  "doc": "Settings related to the home page for a user"
@@ -307,6 +307,43 @@
307
307
  "name": "homePage",
308
308
  "default": null,
309
309
  "doc": "Global settings related to the home page for an instance"
310
+ },
311
+ {
312
+ "type": [
313
+ "null",
314
+ {
315
+ "type": "record",
316
+ "name": "ApplicationsSettings",
317
+ "namespace": "com.linkedin.pegasus2avro.settings.global",
318
+ "fields": [
319
+ {
320
+ "type": "boolean",
321
+ "name": "enabled"
322
+ },
323
+ {
324
+ "type": [
325
+ "null",
326
+ "string"
327
+ ],
328
+ "name": "config",
329
+ "default": null,
330
+ "doc": "The configuration for the feature, in JSON format."
331
+ },
332
+ {
333
+ "type": [
334
+ "null",
335
+ "string"
336
+ ],
337
+ "name": "configVersion",
338
+ "default": null,
339
+ "doc": "The version of the configuration schema that has been used to serialize\n the config.\nIf not provided, the version is assumed to be the latest version."
340
+ }
341
+ ]
342
+ }
343
+ ],
344
+ "name": "applications",
345
+ "default": null,
346
+ "doc": "Settings related to applications. If not enabled, applications won't show up in navigation"
310
347
  }
311
348
  ],
312
349
  "doc": "DataHub Global platform settings. Careful - these should not be modified by the outside world!"
@@ -478,6 +478,7 @@ class LineageClient:
478
478
  env: str = "PROD",
479
479
  default_db: Optional[str] = None,
480
480
  default_schema: Optional[str] = None,
481
+ override_dialect: Optional[str] = None,
481
482
  ) -> None:
482
483
  """Add lineage by parsing a SQL query."""
483
484
  from datahub.sql_parsing.sqlglot_lineage import (
@@ -493,6 +494,7 @@ class LineageClient:
493
494
  platform_instance=platform_instance,
494
495
  env=env,
495
496
  graph=self._client._graph,
497
+ override_dialect=override_dialect,
496
498
  )
497
499
 
498
500
  if parsed_result.debug_info.table_error:
@@ -1494,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
1494
1494
  return
1495
1495
 
1496
1496
  # If a query doesn't involve any allowed tables, skip it.
1497
- if downstream_urn is None and not any(
1498
- self.is_allowed_table(urn) for urn in query.upstreams
1499
- ):
1497
+ if (
1498
+ downstream_urn is None or not self.is_allowed_table(downstream_urn)
1499
+ ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
1500
1500
  self.report.num_queries_skipped_due_to_filters += 1
1501
1501
  return
1502
1502
 
@@ -1577,27 +1577,33 @@ class SqlParsingAggregator(Closeable):
1577
1577
 
1578
1578
  @dataclasses.dataclass
1579
1579
  class QueryLineageInfo:
1580
- upstreams: List[UrnStr] # this is direct upstreams, with *no temp tables*
1581
- column_lineage: List[ColumnLineageInfo]
1580
+ upstreams: OrderedSet[
1581
+ UrnStr
1582
+ ] # this is direct upstreams, with *no temp tables*
1583
+ column_lineage: OrderedSet[ColumnLineageInfo]
1582
1584
  confidence_score: float
1583
1585
 
1584
1586
  def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
1585
- self.upstreams += other_query.upstreams
1586
- self.column_lineage += other_query.column_lineage
1587
+ self.upstreams.update(other_query.upstreams)
1588
+ self.column_lineage.update(other_query.column_lineage)
1587
1589
  self.confidence_score = min(
1588
1590
  self.confidence_score, other_query.confidence_score
1589
1591
  )
1590
1592
 
1593
+ cache: Dict[str, QueryLineageInfo] = {}
1594
+
1591
1595
  def _recurse_into_query(
1592
1596
  query: QueryMetadata, recursion_path: List[QueryId]
1593
1597
  ) -> QueryLineageInfo:
1594
1598
  if query.query_id in recursion_path:
1595
1599
  # This is a cycle, so we just return the query as-is.
1596
1600
  return QueryLineageInfo(
1597
- upstreams=query.upstreams,
1598
- column_lineage=query.column_lineage,
1601
+ upstreams=OrderedSet(query.upstreams),
1602
+ column_lineage=OrderedSet(query.column_lineage),
1599
1603
  confidence_score=query.confidence_score,
1600
1604
  )
1605
+ if query.query_id in cache:
1606
+ return cache[query.query_id]
1601
1607
  recursion_path = [*recursion_path, query.query_id]
1602
1608
  composed_of_queries.add(query.query_id)
1603
1609
 
@@ -1612,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
1612
1618
  upstream_query = self._query_map.get(upstream_query_id)
1613
1619
  if (
1614
1620
  upstream_query
1615
- and upstream_query.query_id not in composed_of_queries
1621
+ and upstream_query.query_id not in recursion_path
1616
1622
  ):
1617
1623
  temp_query_lineage_info = _recurse_into_query(
1618
1624
  upstream_query, recursion_path
@@ -1672,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
1672
1678
  ]
1673
1679
  )
1674
1680
 
1675
- return QueryLineageInfo(
1676
- upstreams=list(new_upstreams),
1677
- column_lineage=new_cll,
1681
+ ret = QueryLineageInfo(
1682
+ upstreams=new_upstreams,
1683
+ column_lineage=OrderedSet(new_cll),
1678
1684
  confidence_score=new_confidence_score,
1679
1685
  )
1686
+ cache[query.query_id] = ret
1687
+
1688
+ return ret
1680
1689
 
1681
1690
  resolved_lineage_info = _recurse_into_query(base_query, [])
1682
1691
 
@@ -1716,8 +1725,8 @@ class SqlParsingAggregator(Closeable):
1716
1725
  base_query,
1717
1726
  query_id=composite_query_id,
1718
1727
  formatted_query_string=merged_query_text,
1719
- upstreams=resolved_lineage_info.upstreams,
1720
- column_lineage=resolved_lineage_info.column_lineage,
1728
+ upstreams=list(resolved_lineage_info.upstreams),
1729
+ column_lineage=list(resolved_lineage_info.column_lineage),
1721
1730
  confidence_score=resolved_lineage_info.confidence_score,
1722
1731
  )
1723
1732
 
@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
56
56
  QueryTypeProps,
57
57
  )
58
58
  from datahub.sql_parsing.sqlglot_utils import (
59
+ DialectOrStr,
59
60
  get_dialect,
60
61
  get_query_fingerprint_debug,
61
62
  is_dialect_instance,
@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
124
125
 
125
126
 
126
127
  class DownstreamColumnRef(_ParserBaseModel):
128
+ """
129
+ TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
130
+ What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
131
+ auto-generated class from .pdl model files. We need generic solution allowing us to either:
132
+ 1. Implement hashing for .pdl model objects
133
+ 2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
134
+ hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
135
+ to understand that instruction as well.
136
+ Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
137
+ """
138
+
127
139
  table: Optional[Urn] = None
128
140
  column: str
129
141
  column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
139
151
  return v
140
152
  return SchemaFieldDataTypeClass.from_obj(v)
141
153
 
154
+ def __hash__(self) -> int:
155
+ return hash((self.table, self.column, self.native_column_type))
142
156
 
143
- class ColumnTransformation(_ParserBaseModel):
157
+
158
+ class ColumnTransformation(_FrozenModel):
144
159
  is_direct_copy: bool
145
160
  column_logic: str
146
161
 
@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):
153
168
 
154
169
 
155
170
  class ColumnLineageInfo(_ParserBaseModel):
171
+ """
172
+ TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
173
+ To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
174
+ depending on it.
175
+ Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
176
+ """
177
+
156
178
  downstream: DownstreamColumnRef
157
179
  upstreams: List[ColumnRef]
158
180
 
159
181
  logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
160
182
 
183
+ def __hash__(self) -> int:
184
+ return hash((self.downstream, tuple(self.upstreams), self.logic))
185
+
161
186
 
162
187
  class _JoinInfo(_ParserBaseModel):
163
188
  join_type: str
@@ -1231,12 +1256,12 @@ def _sqlglot_lineage_inner(
1231
1256
  schema_resolver: SchemaResolverInterface,
1232
1257
  default_db: Optional[str] = None,
1233
1258
  default_schema: Optional[str] = None,
1234
- default_dialect: Optional[str] = None,
1259
+ override_dialect: Optional[DialectOrStr] = None,
1235
1260
  ) -> SqlParsingResult:
1236
- if not default_dialect:
1237
- dialect = get_dialect(schema_resolver.platform)
1261
+ if override_dialect:
1262
+ dialect = get_dialect(override_dialect)
1238
1263
  else:
1239
- dialect = get_dialect(default_dialect)
1264
+ dialect = get_dialect(schema_resolver.platform)
1240
1265
 
1241
1266
  default_db = _normalize_db_or_schema(default_db, dialect)
1242
1267
  default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1423,7 +1448,7 @@ def _sqlglot_lineage_nocache(
1423
1448
  schema_resolver: SchemaResolverInterface,
1424
1449
  default_db: Optional[str] = None,
1425
1450
  default_schema: Optional[str] = None,
1426
- default_dialect: Optional[str] = None,
1451
+ override_dialect: Optional[DialectOrStr] = None,
1427
1452
  ) -> SqlParsingResult:
1428
1453
  """Parse a SQL statement and generate lineage information.
1429
1454
 
@@ -1441,8 +1466,8 @@ def _sqlglot_lineage_nocache(
1441
1466
  can be brittle with respect to missing schema information and complex
1442
1467
  SQL logic like UNNESTs.
1443
1468
 
1444
- The SQL dialect can be given as an argument called default_dialect or it can
1445
- be inferred from the schema_resolver's platform.
1469
+ The SQL dialect will be inferred from the schema_resolver's platform.
1470
+ That inference can be overridden by passing an override_dialect argument.
1446
1471
  The set of supported dialects is the same as sqlglot's. See their
1447
1472
  `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
1448
1473
  for the full list.
@@ -1457,7 +1482,7 @@ def _sqlglot_lineage_nocache(
1457
1482
  schema_resolver: The schema resolver to use for resolving table schemas.
1458
1483
  default_db: The default database to use for unqualified table names.
1459
1484
  default_schema: The default schema to use for unqualified table names.
1460
- default_dialect: A default dialect to override the dialect provided by 'schema_resolver'.
1485
+ override_dialect: Override the dialect provided by 'schema_resolver'.
1461
1486
 
1462
1487
  Returns:
1463
1488
  A SqlParsingResult object containing the parsed lineage information.
@@ -1482,7 +1507,7 @@ def _sqlglot_lineage_nocache(
1482
1507
  schema_resolver=schema_resolver,
1483
1508
  default_db=default_db,
1484
1509
  default_schema=default_schema,
1485
- default_dialect=default_dialect,
1510
+ override_dialect=override_dialect,
1486
1511
  )
1487
1512
  except Exception as e:
1488
1513
  return SqlParsingResult.make_from_error(e)
@@ -1520,15 +1545,15 @@ def sqlglot_lineage(
1520
1545
  schema_resolver: SchemaResolverInterface,
1521
1546
  default_db: Optional[str] = None,
1522
1547
  default_schema: Optional[str] = None,
1523
- default_dialect: Optional[str] = None,
1548
+ override_dialect: Optional[DialectOrStr] = None,
1524
1549
  ) -> SqlParsingResult:
1525
1550
  if schema_resolver.includes_temp_tables():
1526
1551
  return _sqlglot_lineage_nocache(
1527
- sql, schema_resolver, default_db, default_schema, default_dialect
1552
+ sql, schema_resolver, default_db, default_schema, override_dialect
1528
1553
  )
1529
1554
  else:
1530
1555
  return _sqlglot_lineage_cached(
1531
- sql, schema_resolver, default_db, default_schema, default_dialect
1556
+ sql, schema_resolver, default_db, default_schema, override_dialect
1532
1557
  )
1533
1558
 
1534
1559
 
@@ -1580,6 +1605,7 @@ def create_lineage_sql_parsed_result(
1580
1605
  default_schema: Optional[str] = None,
1581
1606
  graph: Optional[DataHubGraph] = None,
1582
1607
  schema_aware: bool = True,
1608
+ override_dialect: Optional[DialectOrStr] = None,
1583
1609
  ) -> SqlParsingResult:
1584
1610
  schema_resolver = create_schema_resolver(
1585
1611
  platform=platform,
@@ -1599,6 +1625,7 @@ def create_lineage_sql_parsed_result(
1599
1625
  schema_resolver=schema_resolver,
1600
1626
  default_db=default_db,
1601
1627
  default_schema=default_schema,
1628
+ override_dialect=override_dialect,
1602
1629
  )
1603
1630
  except Exception as e:
1604
1631
  return SqlParsingResult.make_from_error(e)