acryl-datahub 1.1.0.5rc7__py3-none-any.whl → 1.1.0.5rc9__py3-none-any.whl

This diff shows the content changes between publicly available versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (59)
  1. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/METADATA +2620 -2622
  2. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/RECORD +59 -59
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/autogenerated/capability_summary.json +88 -23
  26. datahub/ingestion/extractor/schema_util.py +13 -4
  27. datahub/ingestion/graph/client.py +2 -2
  28. datahub/ingestion/run/pipeline.py +43 -0
  29. datahub/ingestion/source/bigquery_v2/bigquery.py +9 -1
  30. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  31. datahub/ingestion/source/dremio/dremio_source.py +1 -4
  32. datahub/ingestion/source/gcs/gcs_source.py +9 -1
  33. datahub/ingestion/source/identity/okta.py +0 -13
  34. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  36. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  37. datahub/ingestion/source/sigma/sigma.py +6 -1
  38. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  39. datahub/ingestion/source/snowflake/snowflake_queries.py +100 -58
  40. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  41. datahub/ingestion/source/snowflake/stored_proc_lineage.py +1 -1
  42. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  43. datahub/ingestion/source/sql/sql_common.py +8 -0
  44. datahub/ingestion/source/sql/teradata.py +993 -234
  45. datahub/ingestion/source/sql/vertica.py +0 -4
  46. datahub/ingestion/source/sql_queries.py +2 -2
  47. datahub/ingestion/source/superset.py +56 -1
  48. datahub/ingestion/source/tableau/tableau.py +40 -34
  49. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  50. datahub/ingestion/source/unity/source.py +9 -1
  51. datahub/sdk/lineage_client.py +2 -2
  52. datahub/sql_parsing/sql_parsing_aggregator.py +21 -12
  53. datahub/sql_parsing/sqlglot_lineage.py +40 -15
  54. datahub/upgrade/upgrade.py +46 -13
  55. datahub/utilities/server_config_util.py +8 -0
  56. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/WHEEL +0 -0
  57. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/entry_points.txt +0 -0
  58. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/licenses/LICENSE +0 -0
  59. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/sql/vertica.py

@@ -45,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -501,10 +500,7 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()

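Several hunks in this release (here and in the Tableau source below) drop the explicit entityType, changeType, and aspectName arguments when building MetadataChangeProposalWrapper. A minimal sketch of why that is safe, assuming only the wrapper's inference behavior: the entity type is derived from the URN, the aspect name from the aspect object, and the change type defaults to UPSERT. The URN and subtype string below are illustrative placeholders, not values from the package.

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import SubTypesClass

    # Hypothetical dataset URN, for illustration only.
    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:vertica,db.schema.my_projection,PROD)"

    # entityType, aspectName, and changeType are filled in by the wrapper itself.
    mcp = MetadataChangeProposalWrapper(
        entityUrn=dataset_urn,
        aspect=SubTypesClass(typeNames=["Projections"]),  # stands in for DatasetSubTypes.PROJECTIONS
    )
    workunit = mcp.as_workunit()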

datahub/ingestion/source/sql_queries.py

@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         description="The default schema to use for unqualified table names",
         default=None,
     )
-    default_dialect: Optional[str] = Field(
+    override_dialect: Optional[str] = Field(
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
         default=None,
     )
@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
             schema_resolver=self.schema_resolver,
             default_db=self.config.default_db,
             default_schema=self.config.default_schema,
-            default_dialect=self.config.default_dialect,
+            override_dialect=self.config.override_dialect,
         )
         if result.debug_info.table_error:
             logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
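The sql-queries source option is renamed from default_dialect to override_dialect, matching the renamed parameter in the SQL parsing code further down. A hedged before/after sketch of the affected recipe fragment, written as the Python dict the source config deserializes from; the other keys shown are assumed typical fields of this source and are not part of this diff.

    # Before (rc7-era field name):
    source_config = {
        "platform": "snowflake",         # assumed field, shown only for context
        "query_file": "./queries.json",  # assumed field, shown only for context
        "default_dialect": "snowflake",
    }

    # After (this release):
    source_config = {
        "platform": "snowflake",
        "query_file": "./queries.json",
        "override_dialect": "snowflake",
    }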

datahub/ingestion/source/superset.py

@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         if datasource_id:
             dataset_info = self.get_dataset_info(datasource_id).get("result", {})
             dataset_column_info = dataset_info.get("columns", [])
+            dataset_metric_info = dataset_info.get("metrics", [])

             for column in dataset_column_info:
                 col_name = column.get("column_name", "")
@@ -671,6 +672,17 @@ class SupersetSource(StatefulIngestionSourceBase):
                     continue

                 dataset_columns.append((col_name, col_type, col_description))
+
+            for metric in dataset_metric_info:
+                metric_name = metric.get("metric_name", "")
+                metric_type = metric.get("metric_type", "")
+                metric_description = metric.get("description", "")
+
+                if metric_name == "" or metric_type == "":
+                    logger.info(f"could not construct metric lineage for {metric}")
+                    continue
+
+                dataset_columns.append((metric_name, metric_type, metric_description))
         else:
             # if no datasource id, cannot build cll, just return
             logger.warning(
@@ -972,19 +984,44 @@ class SupersetSource(StatefulIngestionSourceBase):
             schema_fields.append(field)
         return schema_fields

+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
     def gen_schema_metadata(
         self,
         dataset_response: dict,
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=self.gen_schema_fields(column_data),
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
         fine_grained_lineages: List[FineGrainedLineageClass] = []

         for column in columns:
@@ -1067,6 +1106,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )

+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(
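The metric handling mirrors the existing column handling: each Superset metric becomes a schema field and a field-level lineage edge keyed by the metric name. A small sketch of the URN shapes involved, assuming make_schema_field_urn's usual formatting; both dataset URNs are placeholders.

    from datahub.emitter.mce_builder import make_schema_field_urn

    # Placeholder URNs, for illustration only.
    datasource_urn = "urn:li:dataset:(urn:li:dataPlatform:superset,examples.sales,PROD)"
    upstream_dataset = "urn:li:dataset:(urn:li:dataPlatform:postgres,examples.public.sales,PROD)"

    metric_name = "total_revenue"
    # Both ends of the fine-grained lineage edge reuse the metric name, e.g.
    # urn:li:schemaField:(urn:li:dataset:(...),total_revenue)
    downstream_field = make_schema_field_urn(datasource_urn, metric_name)
    upstream_field = make_schema_field_urn(upstream_dataset, metric_name)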

datahub/ingestion/source/tableau/tableau.py

@@ -149,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     ChartInfoClass,
     ChartUsageStatisticsClass,
     DashboardInfoClass,
@@ -529,6 +528,14 @@ class TableauConfig(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )
+    emit_all_published_datasources: bool = Field(
+        default=False,
+        description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
+    )
+    emit_all_embedded_datasources: bool = Field(
+        default=False,
+        description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
+    )

     env: str = Field(
         default=builder.DEFAULT_ENV,
@@ -2180,32 +2187,32 @@ class TableauSiteSource:
                 else []
             )

-            # The Tableau SQL parser much worse than our sqlglot based parser,
-            # so relying on metadata parsed by Tableau from SQL queries can be
-            # less accurate. This option allows us to ignore Tableau's parser and
-            # only use our own.
-            if self.config.force_extraction_of_lineage_from_custom_sql_queries:
-                logger.debug("Extracting TLL & CLL from custom sql (forced)")
+            tableau_table_list = csql.get(c.TABLES, [])
+            if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
+                not tableau_table_list
+                and self.config.extract_lineage_from_unsupported_custom_sql_queries
+            ):
+                if not tableau_table_list:
+                    # custom sql tables may contain unsupported sql, causing incomplete lineage
+                    # we extract the lineage from the raw queries
+                    logger.debug(
+                        "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
+                    )
+                else:
+                    # The Tableau SQL parser is much worse than our sqlglot based parser,
+                    # so relying on metadata parsed by Tableau from SQL queries can be
+                    # less accurate. This option allows us to ignore Tableau's parser and
+                    # only use our own.
+                    logger.debug("Parsing TLL & CLL from custom sql (forced)")
+
                 yield from self._create_lineage_from_unsupported_csql(
                     csql_urn, csql, columns
                 )
             else:
-                tables = csql.get(c.TABLES, [])
-
-                if tables:
-                    # lineage from custom sql -> datasets/tables #
-                    yield from self._create_lineage_to_upstream_tables(
-                        csql_urn, tables, datasource
-                    )
-                elif (
-                    self.config.extract_lineage_from_unsupported_custom_sql_queries
-                ):
-                    logger.debug("Extracting TLL & CLL from custom sql")
-                    # custom sql tables may contain unsupported sql, causing incomplete lineage
-                    # we extract the lineage from the raw queries
-                    yield from self._create_lineage_from_unsupported_csql(
-                        csql_urn, csql, columns
-                    )
+                # lineage from custom sql -> datasets/tables #
+                yield from self._create_lineage_to_upstream_tables(
+                    csql_urn, tableau_table_list, datasource
+                )

             # Schema Metadata
             schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
@@ -2243,7 +2250,6 @@ class TableauSiteSource:
             yield self.get_metadata_change_event(dataset_snapshot)
             yield self.get_metadata_change_proposal(
                 dataset_snapshot.urn,
-                aspect_name=c.SUB_TYPES,
                 aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
             )

@@ -2408,7 +2414,6 @@ class TableauSiteSource:
             upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
             yield self.get_metadata_change_proposal(
                 csql_urn,
-                aspect_name=c.UPSTREAM_LINEAGE,
                 aspect=upstream_lineage,
             )
             self.report.num_tables_with_upstream_lineage += 1
@@ -2594,7 +2599,6 @@ class TableauSiteSource:
             )
             yield self.get_metadata_change_proposal(
                 csql_urn,
-                aspect_name=c.UPSTREAM_LINEAGE,
                 aspect=upstream_lineage,
             )
             self.report.num_tables_with_upstream_lineage += 1
@@ -2640,14 +2644,10 @@ class TableauSiteSource:
     def get_metadata_change_proposal(
         self,
         urn: str,
-        aspect_name: str,
         aspect: Union["UpstreamLineage", "SubTypesClass"],
     ) -> MetadataWorkUnit:
         return MetadataChangeProposalWrapper(
-            entityType=c.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=urn,
-            aspectName=aspect_name,
             aspect=aspect,
         ).as_workunit()

@@ -2755,7 +2755,6 @@ class TableauSiteSource:
             )
             yield self.get_metadata_change_proposal(
                 datasource_urn,
-                aspect_name=c.UPSTREAM_LINEAGE,
                 aspect=upstream_lineage,
             )
             self.report.num_tables_with_upstream_lineage += 1
@@ -2774,7 +2773,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(
                 typeNames=(
                     ["Embedded Data Source"]
@@ -2860,7 +2858,11 @@ class TableauSiteSource:
         return datasource

     def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_published_datasources
+            else {c.ID_WITH_IN: self.datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=published_datasource_graphql_query,
@@ -3553,7 +3555,11 @@ class TableauSiteSource:
         return browse_paths

     def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_embedded_datasources
+            else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=embedded_datasource_graphql_query,

datahub/ingestion/source/tableau/tableau_constant.py

@@ -50,7 +50,6 @@ TABLES = "tables"
 DESCRIPTION = "description"
 SQL = "SQL"
 QUERY = "query"
-SUB_TYPES = "subTypes"
 VIEW = "view"
 CUSTOM_SQL = "Custom SQL"
 REMOTE_TYPE = "remoteType"
@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
 PUBLISHED_DATA_SOURCE = "PublishedDatasource"
 LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
-UPSTREAM_LINEAGE = "upstreamLineage"
 OWNER = "owner"
 USERNAME = "username"
 HAS_EXTRACTS = "hasExtracts"

datahub/ingestion/source/unity/source.py

@@ -56,6 +56,7 @@ from datahub.ingestion.source.aws.s3_util import (
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -152,7 +153,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @capability(SourceCapability.USAGE_STATS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.CATALOG,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
 @capability(
     SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"

datahub/sdk/lineage_client.py

@@ -478,7 +478,7 @@ class LineageClient:
         env: str = "PROD",
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
-        default_dialect: Optional[str] = None,
+        override_dialect: Optional[str] = None,
     ) -> None:
         """Add lineage by parsing a SQL query."""
         from datahub.sql_parsing.sqlglot_lineage import (
@@ -494,7 +494,7 @@ class LineageClient:
             platform_instance=platform_instance,
             env=env,
             graph=self._client._graph,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
         )

         if parsed_result.debug_info.table_error:

datahub/sql_parsing/sql_parsing_aggregator.py

@@ -1577,27 +1577,33 @@ class SqlParsingAggregator(Closeable):

         @dataclasses.dataclass
         class QueryLineageInfo:
-            upstreams: List[UrnStr]  # this is direct upstreams, with *no temp tables*
-            column_lineage: List[ColumnLineageInfo]
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
             confidence_score: float

             def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams += other_query.upstreams
-                self.column_lineage += other_query.column_lineage
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                 self.confidence_score = min(
                     self.confidence_score, other_query.confidence_score
                 )

+        cache: Dict[str, QueryLineageInfo] = {}
+
         def _recurse_into_query(
             query: QueryMetadata, recursion_path: List[QueryId]
         ) -> QueryLineageInfo:
             if query.query_id in recursion_path:
                 # This is a cycle, so we just return the query as-is.
                 return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                     confidence_score=query.confidence_score,
                 )
+            if query.query_id in cache:
+                return cache[query.query_id]
             recursion_path = [*recursion_path, query.query_id]
             composed_of_queries.add(query.query_id)

@@ -1612,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
                 upstream_query = self._query_map.get(upstream_query_id)
                 if (
                     upstream_query
-                    and upstream_query.query_id not in composed_of_queries
+                    and upstream_query.query_id not in recursion_path
                 ):
                     temp_query_lineage_info = _recurse_into_query(
                         upstream_query, recursion_path
@@ -1672,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
                 ]
             )

-            return QueryLineageInfo(
-                upstreams=list(new_upstreams),
-                column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                 confidence_score=new_confidence_score,
             )
+            cache[query.query_id] = ret
+
+            return ret

         resolved_lineage_info = _recurse_into_query(base_query, [])

@@ -1716,8 +1725,8 @@ class SqlParsingAggregator(Closeable):
             base_query,
             query_id=composite_query_id,
             formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
             confidence_score=resolved_lineage_info.confidence_score,
         )

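Taken together, the aggregator hunks change composite lineage resolution in two ways: results are memoized in a cache keyed by query id so each upstream query is resolved at most once, and the cycle check now uses the current recursion path rather than the global composed_of_queries set. A stripped-down sketch of that pattern, independent of the DataHub types (all names below are illustrative, not from the package):

    from typing import Dict, List, Set

    # Illustrative graph: query/table id -> direct upstream ids.
    EDGES: Dict[str, List[str]] = {
        "q3": ["q2", "table_a"],
        "q2": ["q1"],
        "q1": ["table_b"],
    }

    def resolve_upstreams(node: str) -> Set[str]:
        cache: Dict[str, Set[str]] = {}

        def _recurse(current: str, path: List[str]) -> Set[str]:
            if current in path:
                return set()  # cycle guard: stop instead of recursing forever
            if current in cache:
                return cache[current]  # memoized: each node is resolved only once
            path = [*path, current]
            result: Set[str] = set()
            for upstream in EDGES.get(current, []):
                if upstream in EDGES:
                    result |= _recurse(upstream, path)  # another query: expand it
                else:
                    result.add(upstream)  # a real table: keep it as a direct upstream
            cache[current] = result
            return result

        return _recurse(node, [])

    # resolve_upstreams("q3") == {"table_a", "table_b"}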

datahub/sql_parsing/sqlglot_lineage.py

@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
     QueryTypeProps,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     get_dialect,
     get_query_fingerprint_debug,
     is_dialect_instance,
@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):


 class DownstreamColumnRef(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+    auto-generated class from .pdl model files. We need generic solution allowing us to either:
+    1. Implement hashing for .pdl model objects
+    2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+       hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+       to understand that instruction as well.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     table: Optional[Urn] = None
     column: str
     column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
             return v
         return SchemaFieldDataTypeClass.from_obj(v)

+    def __hash__(self) -> int:
+        return hash((self.table, self.column, self.native_column_type))

-class ColumnTransformation(_ParserBaseModel):
+
+class ColumnTransformation(_FrozenModel):
     is_direct_copy: bool
     column_logic: str

@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):


 class ColumnLineageInfo(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+    depending on it.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     downstream: DownstreamColumnRef
     upstreams: List[ColumnRef]

     logic: Optional[ColumnTransformation] = pydantic.Field(default=None)

+    def __hash__(self) -> int:
+        return hash((self.downstream, tuple(self.upstreams), self.logic))
+

 class _JoinInfo(_ParserBaseModel):
     join_type: str
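Both __hash__ additions above exist so that these models can be collected in the set-like containers (OrderedSet) introduced in the aggregator change: set membership requires __hash__ over hashable components, which is why lists are converted to tuples and the unhashable column_type field is left out. A tiny illustration of the requirement, using a plain class rather than the package's pydantic models:

    class Ref:
        def __init__(self, table: str, column: str) -> None:
            self.table = table
            self.column = column

        def __hash__(self) -> int:
            # Hash only hashable, identity-defining fields; unhashable members
            # (lists, generated aspect classes) must be converted or skipped.
            return hash((self.table, self.column))

        def __eq__(self, other: object) -> bool:
            return isinstance(other, Ref) and (self.table, self.column) == (other.table, other.column)

    # Set-based deduplication relies on __hash__ together with __eq__:
    assert len({Ref("t", "c"), Ref("t", "c")}) == 1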
@@ -1231,12 +1256,12 @@ def _sqlglot_lineage_inner(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
-    if not default_dialect:
-        dialect = get_dialect(schema_resolver.platform)
+    if override_dialect:
+        dialect = get_dialect(override_dialect)
     else:
-        dialect = get_dialect(default_dialect)
+        dialect = get_dialect(schema_resolver.platform)

     default_db = _normalize_db_or_schema(default_db, dialect)
     default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1423,7 +1448,7 @@ def _sqlglot_lineage_nocache(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     """Parse a SQL statement and generate lineage information.

@@ -1441,8 +1466,8 @@ def _sqlglot_lineage_nocache(
     can be brittle with respect to missing schema information and complex
     SQL logic like UNNESTs.

-    The SQL dialect can be given as an argument called default_dialect or it can
-    be inferred from the schema_resolver's platform.
+    The SQL dialect will be inferred from the schema_resolver's platform.
+    That inference can be overridden by passing an override_dialect argument.
     The set of supported dialects is the same as sqlglot's. See their
     `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
     for the full list.
@@ -1457,7 +1482,7 @@ def _sqlglot_lineage_nocache(
         schema_resolver: The schema resolver to use for resolving table schemas.
         default_db: The default database to use for unqualified table names.
         default_schema: The default schema to use for unqualified table names.
-        default_dialect: A default dialect to override the dialect provided by 'schema_resolver'.
+        override_dialect: Override the dialect provided by 'schema_resolver'.

     Returns:
         A SqlParsingResult object containing the parsed lineage information.
@@ -1482,7 +1507,7 @@ def _sqlglot_lineage_nocache(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)
@@ -1520,15 +1545,15 @@ def sqlglot_lineage(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     if schema_resolver.includes_temp_tables():
         return _sqlglot_lineage_nocache(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )
     else:
         return _sqlglot_lineage_cached(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )


@@ -1580,7 +1605,7 @@ def create_lineage_sql_parsed_result(
     default_schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     schema_resolver = create_schema_resolver(
         platform=platform,
@@ -1600,7 +1625,7 @@ def create_lineage_sql_parsed_result(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)
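With the rename applied end to end, callers that previously forced a dialect via default_dialect now pass override_dialect; when it is omitted, the dialect is still inferred from the schema resolver's platform. A hedged usage sketch of the standalone helper: override_dialect and the nearby keyword names visible in this diff are real, but the exact set of remaining required parameters is an assumption about the function's full signature.

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="INSERT INTO sales.daily SELECT * FROM sales.raw",  # illustrative query
        platform="snowflake",
        platform_instance=None,
        env="PROD",
        default_db="ANALYTICS",
        default_schema="PUBLIC",
        graph=None,            # no DataHub connection: parse without server-side schemas
        schema_aware=False,
        override_dialect="snowflake",  # force the sqlglot dialect instead of inferring it
    )
    print(result.in_tables, result.out_tables)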