acryl-datahub 1.2.0.7rc3__py3-none-any.whl → 1.2.0.8rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (43) hide show
  1. {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/METADATA +2634 -2633
  2. {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/RECORD +43 -40
  3. datahub/_version.py +1 -1
  4. datahub/cli/delete_cli.py +1 -0
  5. datahub/ingestion/api/report.py +4 -0
  6. datahub/ingestion/autogenerated/capability_summary.json +2 -2
  7. datahub/ingestion/graph/client.py +8 -1
  8. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  9. datahub/ingestion/source/datahub/config.py +4 -0
  10. datahub/ingestion/source/datahub/datahub_database_reader.py +6 -1
  11. datahub/ingestion/source/metadata/lineage.py +8 -8
  12. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  13. datahub/ingestion/source/redshift/redshift.py +1 -1
  14. datahub/ingestion/source/sql/athena.py +95 -18
  15. datahub/ingestion/source/sql/athena_properties_extractor.py +43 -25
  16. datahub/ingestion/source/superset.py +3 -2
  17. datahub/ingestion/source/tableau/tableau.py +8 -5
  18. datahub/ingestion/source/unity/config.py +65 -11
  19. datahub/ingestion/source/unity/proxy.py +90 -5
  20. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  21. datahub/ingestion/source/unity/source.py +12 -0
  22. datahub/ingestion/source/usage/usage_common.py +1 -0
  23. datahub/metadata/_internal_schema_classes.py +207 -12
  24. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  25. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
  26. datahub/metadata/schema.avsc +160 -12
  27. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  28. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -1
  29. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
  30. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  31. datahub/metadata/schemas/DomainKey.avsc +2 -1
  32. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  33. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  34. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  35. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +0 -3
  36. datahub/sdk/chart.py +36 -22
  37. datahub/sdk/dashboard.py +38 -62
  38. datahub/sql_parsing/sqlglot_lineage.py +121 -28
  39. datahub/sql_parsing/sqlglot_utils.py +12 -1
  40. {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/WHEEL +0 -0
  41. {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/entry_points.txt +0 -0
  42. {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/licenses/LICENSE +0 -0
  43. {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/top_level.txt +0 -0
datahub/sdk/dashboard.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from datetime import datetime
4
4
  from typing import Dict, List, Optional, Type, Union
5
5
 
6
+ from deprecated.sphinx import deprecated
6
7
  from typing_extensions import Self
7
8
 
8
9
  import datahub.metadata.schema_classes as models
@@ -24,12 +25,14 @@ from datahub.sdk._shared import (
24
25
  HasTerms,
25
26
  LinksInputType,
26
27
  OwnersInputType,
28
+ ParentContainerInputType,
27
29
  TagsInputType,
28
30
  TermsInputType,
29
31
  )
30
32
  from datahub.sdk.chart import Chart
31
33
  from datahub.sdk.dataset import Dataset
32
34
  from datahub.sdk.entity import Entity, ExtraAspectsType
35
+ from datahub.utilities.sentinels import Unset, unset
33
36
 
34
37
 
35
38
  class Dashboard(
@@ -64,7 +67,7 @@ class Dashboard(
64
67
  display_name: Optional[str] = None,
65
68
  platform_instance: Optional[DataPlatformInstanceUrnOrStr] = None,
66
69
  # Dashboard properties.
67
- description: str = "",
70
+ description: Optional[str] = None,
68
71
  external_url: Optional[str] = None,
69
72
  dashboard_url: Optional[str] = None,
70
73
  custom_properties: Optional[Dict[str, str]] = None,
@@ -74,6 +77,7 @@ class Dashboard(
74
77
  charts: Optional[List[Union[ChartUrnOrStr, Chart]]] = None,
75
78
  dashboards: Optional[List[Union[DashboardUrnOrStr, Dashboard]]] = None,
76
79
  # Standard aspects.
80
+ parent_container: ParentContainerInputType | Unset = unset,
77
81
  subtype: Optional[str] = None,
78
82
  owners: Optional[OwnersInputType] = None,
79
83
  links: Optional[LinksInputType] = None,
@@ -94,18 +98,7 @@ class Dashboard(
94
98
  self._set_platform_instance(platform, platform_instance)
95
99
 
96
100
  # Initialize DashboardInfoClass with default values
97
- dashboard_info = models.DashboardInfoClass(
98
- title=display_name or name,
99
- description=description or "",
100
- lastModified=models.ChangeAuditStampsClass(
101
- lastModified=None,
102
- ),
103
- customProperties={},
104
- chartEdges=[],
105
- datasetEdges=[],
106
- dashboards=[],
107
- )
108
-
101
+ dashboard_info = self._ensure_dashboard_props(display_name=display_name)
109
102
  if last_modified:
110
103
  dashboard_info.lastModified = models.ChangeAuditStampsClass(
111
104
  lastModified=models.AuditStampClass(
@@ -114,7 +107,6 @@ class Dashboard(
114
107
  ),
115
108
  )
116
109
 
117
- # Set additional properties
118
110
  if description is not None:
119
111
  self.set_description(description)
120
112
  if display_name is not None:
@@ -129,6 +121,15 @@ class Dashboard(
129
121
  self.set_last_modified(last_modified)
130
122
  if last_refreshed is not None:
131
123
  self.set_last_refreshed(last_refreshed)
124
+ if input_datasets is not None:
125
+ self.set_input_datasets(input_datasets)
126
+ if charts is not None:
127
+ self.set_charts(charts)
128
+ if dashboards is not None:
129
+ self.set_dashboards(dashboards)
130
+
131
+ if parent_container is not unset:
132
+ self._set_container(parent_container)
132
133
  if subtype is not None:
133
134
  self.set_subtype(subtype)
134
135
  if owners is not None:
@@ -141,12 +142,6 @@ class Dashboard(
141
142
  self.set_terms(terms)
142
143
  if domain is not None:
143
144
  self.set_domain(domain)
144
- if input_datasets is not None:
145
- self.set_input_datasets(input_datasets)
146
- if charts is not None:
147
- self.set_charts(charts)
148
- if dashboards is not None:
149
- self.set_dashboards(dashboards)
150
145
 
151
146
  @classmethod
152
147
  def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
@@ -162,11 +157,13 @@ class Dashboard(
162
157
  assert isinstance(self._urn, DashboardUrn)
163
158
  return self._urn
164
159
 
165
- def _ensure_dashboard_props(self) -> models.DashboardInfoClass:
160
+ def _ensure_dashboard_props(
161
+ self, display_name: Optional[str] = None
162
+ ) -> models.DashboardInfoClass:
166
163
  """Get the dashboard properties safely."""
167
164
  return self._setdefault_aspect(
168
165
  models.DashboardInfoClass(
169
- title=self.urn.dashboard_id,
166
+ title=display_name or self.urn.dashboard_id,
170
167
  description="",
171
168
  lastModified=models.ChangeAuditStampsClass(
172
169
  lastModified=models.AuditStampClass(
@@ -186,60 +183,52 @@ class Dashboard(
186
183
  return self.urn.dashboard_id
187
184
 
188
185
  @property
186
+ @deprecated("Use display_name instead", version="1.2.0.7")
189
187
  def title(self) -> str:
190
- """Get the title of the dashboard."""
191
- return self._ensure_dashboard_props().title
188
+ """Get the display name of the dashboard."""
189
+ return self.display_name
192
190
 
191
+ @deprecated("Use set_display_name instead", version="1.2.0.7")
193
192
  def set_title(self, title: str) -> None:
194
- """Set the title of the dashboard."""
195
- props = self._ensure_dashboard_props()
196
- props.title = title
197
- self._set_aspect(props)
193
+ """Set the display name of the dashboard."""
194
+ self.set_display_name(title)
198
195
 
199
196
  @property
200
197
  def description(self) -> Optional[str]:
201
198
  """Get the description of the dashboard."""
202
- props = self._ensure_dashboard_props()
203
- return props.description
199
+ # Because description is a required field, we treat "" as None.
200
+ return self._ensure_dashboard_props().description or None
204
201
 
205
202
  def set_description(self, description: str) -> None:
206
203
  """Set the description of the dashboard."""
207
- props = self._ensure_dashboard_props()
208
- props.description = description
209
- self._set_aspect(props)
204
+ self._ensure_dashboard_props().description = description
210
205
 
211
206
  @property
212
- def display_name(self) -> Optional[str]:
207
+ def display_name(self) -> str:
213
208
  """Get the display name of the dashboard."""
214
- return self.title
209
+ return self._ensure_dashboard_props().title
215
210
 
216
211
  def set_display_name(self, display_name: str) -> None:
217
212
  """Set the display name of the dashboard."""
218
- self.set_title(display_name)
213
+ self._ensure_dashboard_props().title = display_name
219
214
 
220
215
  @property
221
216
  def external_url(self) -> Optional[str]:
222
217
  """Get the external URL of the dashboard."""
223
- props = self._ensure_dashboard_props()
224
- return props.externalUrl
218
+ return self._ensure_dashboard_props().externalUrl
225
219
 
226
220
  def set_external_url(self, external_url: str) -> None:
227
221
  """Set the external URL of the dashboard."""
228
- props = self._ensure_dashboard_props()
229
- props.externalUrl = external_url
230
- self._set_aspect(props)
222
+ self._ensure_dashboard_props().externalUrl = external_url
231
223
 
232
224
  @property
233
225
  def dashboard_url(self) -> Optional[str]:
234
226
  """Get the dashboard URL."""
235
- props = self._ensure_dashboard_props()
236
- return props.dashboardUrl
227
+ return self._ensure_dashboard_props().dashboardUrl
237
228
 
238
229
  def set_dashboard_url(self, dashboard_url: str) -> None:
239
230
  """Set the dashboard URL."""
240
- props = self._ensure_dashboard_props()
241
- props.dashboardUrl = dashboard_url
242
- self._set_aspect(props)
231
+ self._ensure_dashboard_props().dashboardUrl = dashboard_url
243
232
 
244
233
  @property
245
234
  def custom_properties(self) -> Dict[str, str]:
@@ -249,9 +238,7 @@ class Dashboard(
249
238
 
250
239
  def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
251
240
  """Set the custom properties of the dashboard."""
252
- props = self._ensure_dashboard_props()
253
- props.customProperties = custom_properties
254
- self._set_aspect(props)
241
+ self._ensure_dashboard_props().customProperties = custom_properties
255
242
 
256
243
  @property
257
244
  def last_modified(self) -> Optional[datetime]:
@@ -263,14 +250,12 @@ class Dashboard(
263
250
 
264
251
  def set_last_modified(self, last_modified: datetime) -> None:
265
252
  """Set the last modification timestamp of the dashboard."""
266
- props = self._ensure_dashboard_props()
267
- props.lastModified = models.ChangeAuditStampsClass(
253
+ self._ensure_dashboard_props().lastModified = models.ChangeAuditStampsClass(
268
254
  lastModified=models.AuditStampClass(
269
255
  time=int(last_modified.timestamp()),
270
256
  actor="urn:li:corpuser:datahub",
271
257
  ),
272
258
  )
273
- self._set_aspect(props)
274
259
 
275
260
  @property
276
261
  def last_refreshed(self) -> Optional[datetime]:
@@ -284,9 +269,7 @@ class Dashboard(
284
269
 
285
270
  def set_last_refreshed(self, last_refreshed: datetime) -> None:
286
271
  """Set the last refresh timestamp of the dashboard."""
287
- props = self._ensure_dashboard_props()
288
- props.lastRefreshed = int(last_refreshed.timestamp())
289
- self._set_aspect(props)
272
+ self._ensure_dashboard_props().lastRefreshed = int(last_refreshed.timestamp())
290
273
 
291
274
  @property
292
275
  def input_datasets(self) -> List[DatasetUrn]:
@@ -310,7 +293,6 @@ class Dashboard(
310
293
  dataset_urn = DatasetUrn.from_string(dataset)
311
294
  dataset_edges.append(models.EdgeClass(destinationUrn=str(dataset_urn)))
312
295
  props.datasetEdges = dataset_edges
313
- self._set_aspect(props)
314
296
 
315
297
  def add_input_dataset(self, input_dataset: Union[DatasetUrnOrStr, Dataset]) -> None:
316
298
  """Add an input dataset to the dashboard."""
@@ -326,7 +308,6 @@ class Dashboard(
326
308
  models.EdgeClass(destinationUrn=str(input_dataset_urn))
327
309
  )
328
310
  props.datasetEdges = dataset_edges
329
- self._set_aspect(props)
330
311
 
331
312
  def remove_input_dataset(
332
313
  self, input_dataset: Union[DatasetUrnOrStr, Dataset]
@@ -342,7 +323,6 @@ class Dashboard(
342
323
  for edge in (props.datasetEdges or [])
343
324
  if edge.destinationUrn != str(input_dataset_urn)
344
325
  ]
345
- self._set_aspect(props)
346
326
 
347
327
  @property
348
328
  def charts(self) -> List[ChartUrn]:
@@ -363,7 +343,6 @@ class Dashboard(
363
343
  chart_urn = ChartUrn.from_string(chart)
364
344
  chart_edges.append(models.EdgeClass(destinationUrn=str(chart_urn)))
365
345
  props.chartEdges = chart_edges
366
- self._set_aspect(props)
367
346
 
368
347
  def add_chart(self, chart: Union[ChartUrnOrStr, Chart]) -> None:
369
348
  """Add a chart to the dashboard."""
@@ -381,7 +360,6 @@ class Dashboard(
381
360
  if str(chart_urn) not in existing_urns:
382
361
  chart_edges.append(models.EdgeClass(destinationUrn=str(chart_urn)))
383
362
  props.chartEdges = chart_edges
384
- self._set_aspect(props)
385
363
 
386
364
  def remove_chart(self, chart: Union[ChartUrnOrStr, Chart]) -> None:
387
365
  """Remove a chart from the dashboard."""
@@ -395,7 +373,6 @@ class Dashboard(
395
373
  for edge in (props.chartEdges or [])
396
374
  if edge.destinationUrn != str(chart_urn)
397
375
  ]
398
- self._set_aspect(props)
399
376
 
400
377
  @property
401
378
  def dashboards(self) -> List[DashboardUrn]:
@@ -417,7 +394,6 @@ class Dashboard(
417
394
  else:
418
395
  dashboard_urn = DashboardUrn.from_string(dashboard)
419
396
  props.dashboards.append(models.EdgeClass(destinationUrn=str(dashboard_urn)))
420
- self._set_aspect(props)
421
397
 
422
398
  def add_dashboard(self, dashboard: Union[DashboardUrnOrStr, Dashboard]) -> None:
423
399
  """Add a dashboard to the dashboard."""
@@ -543,16 +543,19 @@ def _select_statement_cll(
543
543
  root_scope: sqlglot.optimizer.Scope,
544
544
  column_resolver: _ColumnResolver,
545
545
  output_table: Optional[_TableName],
546
+ table_name_schema_mapping: Dict[_TableName, SchemaInfo],
547
+ default_db: Optional[str] = None,
548
+ default_schema: Optional[str] = None,
546
549
  ) -> List[_ColumnLineageInfo]:
547
550
  column_lineage: List[_ColumnLineageInfo] = []
548
551
 
549
552
  try:
550
- # List output columns.
551
553
  output_columns = [
552
554
  (select_col.alias_or_name, select_col) for select_col in statement.selects
553
555
  ]
554
556
  logger.debug("output columns: %s", [col[0] for col in output_columns])
555
- for output_col, original_col_expression in output_columns:
557
+
558
+ for output_col, _original_col_expression in output_columns:
556
559
  if not output_col or output_col == "*":
557
560
  # If schema information is available, the * will be expanded to the actual columns.
558
561
  # Otherwise, we can't process it.
@@ -576,13 +579,14 @@ def _select_statement_cll(
576
579
  trim_selects=False,
577
580
  # We don't need to pass the schema in here, since we've already qualified the columns.
578
581
  )
579
- # import pathlib
580
- # pathlib.Path("sqlglot.html").write_text(
581
- # str(lineage_node.to_html(dialect=dialect))
582
- # )
583
582
 
584
583
  # Generate SELECT lineage.
585
- direct_raw_col_upstreams = _get_direct_raw_col_upstreams(lineage_node)
584
+ direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
585
+ lineage_node,
586
+ dialect,
587
+ default_db,
588
+ default_schema,
589
+ )
586
590
 
587
591
  # Fuzzy resolve the output column.
588
592
  original_col_expression = lineage_node.expression
@@ -601,7 +605,7 @@ def _select_statement_cll(
601
605
  if original_col_expression.type:
602
606
  output_col_type = original_col_expression.type
603
607
 
604
- # Fuzzy resolve upstream columns.
608
+ # Resolve upstream columns - table names should already be qualified from placeholder processing
605
609
  direct_resolved_col_upstreams = {
606
610
  _ColumnRef(
607
611
  table=edge.table,
@@ -706,6 +710,9 @@ def _column_level_lineage(
706
710
  root_scope=root_scope,
707
711
  column_resolver=column_resolver,
708
712
  output_table=downstream_table,
713
+ table_name_schema_mapping=table_name_schema_mapping,
714
+ default_db=default_db,
715
+ default_schema=default_schema,
709
716
  )
710
717
 
711
718
  joins: Optional[List[_JoinInfo]] = None
@@ -726,6 +733,9 @@ def _column_level_lineage(
726
733
 
727
734
  def _get_direct_raw_col_upstreams(
728
735
  lineage_node: sqlglot.lineage.Node,
736
+ dialect: Optional[sqlglot.Dialect] = None,
737
+ default_db: Optional[str] = None,
738
+ default_schema: Optional[str] = None,
729
739
  ) -> OrderedSet[_ColumnRef]:
730
740
  # Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
731
741
  direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
@@ -755,6 +765,53 @@ def _get_direct_raw_col_upstreams(
755
765
  direct_raw_col_upstreams.add(
756
766
  _ColumnRef(table=table_ref, column=normalized_col)
757
767
  )
768
+ elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
769
+ # Handle placeholder expressions from lateral joins.
770
+ #
771
+ # In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
772
+ # expressions instead of regular table references. This is critical for lateral join column lineage.
773
+ #
774
+ # Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
775
+ # The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
776
+ # which we need to parse to establish the lineage: output.value <- my_table2.value
777
+ #
778
+ # Without this handling, lateral join column lineage would be incomplete/missing.
779
+ try:
780
+ parsed = sqlglot.parse_one(node.name, dialect=dialect)
781
+ if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
782
+ table_ref = _TableName.from_sqlglot_table(
783
+ sqlglot.parse_one(
784
+ parsed.table, into=sqlglot.exp.Table, dialect=dialect
785
+ )
786
+ )
787
+
788
+ # SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
789
+ # Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
790
+ # We need this runtime qualification to ensure proper lineage resolution.
791
+ # Only qualify if this appears to be a real table reference (not a temporary construct)
792
+ if (
793
+ not (table_ref.database or table_ref.db_schema)
794
+ and dialect is not None
795
+ ):
796
+ table_ref = table_ref.qualified(
797
+ dialect=dialect,
798
+ default_db=default_db,
799
+ default_schema=default_schema,
800
+ )
801
+
802
+ # Extract column name using proper isinstance check
803
+ if isinstance(parsed.this, sqlglot.exp.Identifier):
804
+ column_name = parsed.this.name
805
+ else:
806
+ column_name = str(parsed.this)
807
+ direct_raw_col_upstreams.add(
808
+ _ColumnRef(table=table_ref, column=column_name)
809
+ )
810
+ except Exception as e:
811
+ logger.debug(
812
+ f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
813
+ exc_info=True,
814
+ )
758
815
  else:
759
816
  # This branch doesn't matter. For example, a count(*) column would go here, and
760
817
  # we don't get any column-level lineage for that.
@@ -857,7 +914,7 @@ def _get_raw_col_upstreams_for_expression(
857
914
  trim_selects=False,
858
915
  )
859
916
 
860
- return _get_direct_raw_col_upstreams(node)
917
+ return _get_direct_raw_col_upstreams(node, dialect, None, None)
861
918
  finally:
862
919
  scope.expression = original_expression
863
920
 
@@ -872,8 +929,9 @@ def _list_joins(
872
929
 
873
930
  scope: sqlglot.optimizer.Scope
874
931
  for scope in root_scope.traverse():
932
+ # PART 1: Handle regular explicit JOINs (updated API)
875
933
  join: sqlglot.exp.Join
876
- for join in scope.find_all(sqlglot.exp.Join):
934
+ for join in scope.expression.find_all(sqlglot.exp.Join):
877
935
  left_side_tables: OrderedSet[_TableName] = OrderedSet()
878
936
  from_clause: sqlglot.exp.From
879
937
  for from_clause in scope.find_all(sqlglot.exp.From):
@@ -949,6 +1007,36 @@ def _list_joins(
949
1007
  )
950
1008
  )
951
1009
 
1010
+ # Handle LATERAL constructs
1011
+ for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
1012
+ # Get tables from non-lateral FROM clauses
1013
+ qualified_left: OrderedSet[_TableName] = OrderedSet()
1014
+ for from_clause in scope.find_all(sqlglot.exp.From):
1015
+ if not isinstance(from_clause.this, sqlglot.exp.Lateral):
1016
+ qualified_left.update(
1017
+ _get_join_side_tables(from_clause.this, dialect, scope)
1018
+ )
1019
+
1020
+ # Get tables from lateral subquery
1021
+ qualified_right: OrderedSet[_TableName] = OrderedSet()
1022
+ if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
1023
+ qualified_right.update(
1024
+ _TableName.from_sqlglot_table(t)
1025
+ for t in lateral.this.find_all(sqlglot.exp.Table)
1026
+ )
1027
+ qualified_right.update(qualified_left)
1028
+
1029
+ if qualified_left and qualified_right:
1030
+ joins.append(
1031
+ _JoinInfo(
1032
+ join_type="LATERAL JOIN",
1033
+ left_tables=list(qualified_left),
1034
+ right_tables=list(qualified_right),
1035
+ on_clause=None,
1036
+ columns_involved=[],
1037
+ )
1038
+ )
1039
+
952
1040
  return joins
953
1041
 
954
1042
 
@@ -1186,25 +1274,30 @@ def _translate_internal_joins(
1186
1274
  ) -> List[JoinInfo]:
1187
1275
  joins = []
1188
1276
  for raw_join in raw_joins:
1189
- joins.append(
1190
- JoinInfo(
1191
- join_type=raw_join.join_type,
1192
- left_tables=[
1193
- table_name_urn_mapping[table] for table in raw_join.left_tables
1194
- ],
1195
- right_tables=[
1196
- table_name_urn_mapping[table] for table in raw_join.right_tables
1197
- ],
1198
- on_clause=raw_join.on_clause,
1199
- columns_involved=[
1200
- ColumnRef(
1201
- table=table_name_urn_mapping[col.table],
1202
- column=col.column,
1203
- )
1204
- for col in raw_join.columns_involved
1205
- ],
1277
+ try:
1278
+ joins.append(
1279
+ JoinInfo(
1280
+ join_type=raw_join.join_type,
1281
+ left_tables=[
1282
+ table_name_urn_mapping[table] for table in raw_join.left_tables
1283
+ ],
1284
+ right_tables=[
1285
+ table_name_urn_mapping[table] for table in raw_join.right_tables
1286
+ ],
1287
+ on_clause=raw_join.on_clause,
1288
+ columns_involved=[
1289
+ ColumnRef(
1290
+ table=table_name_urn_mapping[col.table],
1291
+ column=col.column,
1292
+ )
1293
+ for col in raw_join.columns_involved
1294
+ ],
1295
+ )
1206
1296
  )
1207
- )
1297
+ except KeyError as e:
1298
+ # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
1299
+ logger.debug(f"Skipping join with unresolvable table: {e}")
1300
+ continue
1208
1301
  return joins
1209
1302
 
1210
1303
 
@@ -115,6 +115,8 @@ def _expression_to_string(
115
115
  return expression.sql(dialect=get_dialect(platform))
116
116
 
117
117
 
118
+ PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")
119
+
118
120
  _BASIC_NORMALIZATION_RULES = {
119
121
  # Remove /* */ comments.
120
122
  re.compile(r"/\*.*?\*/", re.DOTALL): "",
@@ -130,7 +132,9 @@ _BASIC_NORMALIZATION_RULES = {
130
132
  re.compile(r"'[^']*'"): "?",
131
133
  # Replace sequences of IN/VALUES with a single placeholder.
132
134
  # The r" ?" makes it more robust to uneven spacing.
133
- re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
135
+ re.compile(
136
+ r"\b(IN|VALUES)\s*\( ?(?:%s|\$\d|\?)(?:, ?(?:%s|\$\d|\?))* ?\)", re.IGNORECASE
137
+ ): r"\1 (?)",
134
138
  # Normalize parenthesis spacing.
135
139
  re.compile(r"\( "): "(",
136
140
  re.compile(r" \)"): ")",
@@ -139,6 +143,9 @@ _BASIC_NORMALIZATION_RULES = {
139
143
  # e.g. "col1,col2" -> "col1, col2"
140
144
  re.compile(r"\b ,"): ",",
141
145
  re.compile(r"\b,\b"): ", ",
146
+ # MAKE SURE THAT THIS IS AFTER THE ABOVE REPLACEMENT
147
+ # Replace all versions of placeholders with generic ? placeholder.
148
+ PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION: "?",
142
149
  }
143
150
  _TABLE_NAME_NORMALIZATION_RULES = {
144
151
  # Replace UUID-like strings with a placeholder (both - and _ variants).
@@ -262,6 +269,10 @@ def get_query_fingerprint_debug(
262
269
  if not fast:
263
270
  dialect = get_dialect(platform)
264
271
  expression_sql = generalize_query(expression, dialect=dialect)
272
+ # Normalize placeholders for consistent fingerprinting. This only needs to be backward compatible with earlier sqlglot-generated generalized queries, where the placeholders were always ?
273
+ expression_sql = PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub(
274
+ "?", expression_sql
275
+ )
265
276
  else:
266
277
  expression_sql = generalize_query_fast(expression, dialect=platform)
267
278
  except (ValueError, sqlglot.errors.SqlglotError) as e: