acryl-datahub 1.2.0.7rc3__py3-none-any.whl → 1.2.0.8rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/METADATA +2634 -2633
- {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/RECORD +43 -40
- datahub/_version.py +1 -1
- datahub/cli/delete_cli.py +1 -0
- datahub/ingestion/api/report.py +4 -0
- datahub/ingestion/autogenerated/capability_summary.json +2 -2
- datahub/ingestion/graph/client.py +8 -1
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +6 -1
- datahub/ingestion/source/metadata/lineage.py +8 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +1 -1
- datahub/ingestion/source/sql/athena.py +95 -18
- datahub/ingestion/source/sql/athena_properties_extractor.py +43 -25
- datahub/ingestion/source/superset.py +3 -2
- datahub/ingestion/source/tableau/tableau.py +8 -5
- datahub/ingestion/source/unity/config.py +65 -11
- datahub/ingestion/source/unity/proxy.py +90 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/source.py +12 -0
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +207 -12
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
- datahub/metadata/schema.avsc +160 -12
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -1
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +0 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sql_parsing/sqlglot_lineage.py +121 -28
- datahub/sql_parsing/sqlglot_utils.py +12 -1
- {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.7rc3.dist-info → acryl_datahub-1.2.0.8rc1.dist-info}/top_level.txt +0 -0
datahub/sdk/dashboard.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 from datetime import datetime
 from typing import Dict, List, Optional, Type, Union
 
+from deprecated.sphinx import deprecated
 from typing_extensions import Self
 
 import datahub.metadata.schema_classes as models
@@ -24,12 +25,14 @@ from datahub.sdk._shared import (
     HasTerms,
     LinksInputType,
     OwnersInputType,
+    ParentContainerInputType,
     TagsInputType,
     TermsInputType,
 )
 from datahub.sdk.chart import Chart
 from datahub.sdk.dataset import Dataset
 from datahub.sdk.entity import Entity, ExtraAspectsType
+from datahub.utilities.sentinels import Unset, unset
 
 
 class Dashboard(
@@ -64,7 +67,7 @@ class Dashboard(
         display_name: Optional[str] = None,
         platform_instance: Optional[DataPlatformInstanceUrnOrStr] = None,
         # Dashboard properties.
-        description: str =
+        description: Optional[str] = None,
         external_url: Optional[str] = None,
         dashboard_url: Optional[str] = None,
         custom_properties: Optional[Dict[str, str]] = None,
@@ -74,6 +77,7 @@ class Dashboard(
         charts: Optional[List[Union[ChartUrnOrStr, Chart]]] = None,
         dashboards: Optional[List[Union[DashboardUrnOrStr, Dashboard]]] = None,
         # Standard aspects.
+        parent_container: ParentContainerInputType | Unset = unset,
         subtype: Optional[str] = None,
         owners: Optional[OwnersInputType] = None,
         links: Optional[LinksInputType] = None,
@@ -94,18 +98,7 @@ class Dashboard(
         self._set_platform_instance(platform, platform_instance)
 
         # Initialize DashboardInfoClass with default values
-        dashboard_info =
-            title=display_name or name,
-            description=description or "",
-            lastModified=models.ChangeAuditStampsClass(
-                lastModified=None,
-            ),
-            customProperties={},
-            chartEdges=[],
-            datasetEdges=[],
-            dashboards=[],
-        )
-
+        dashboard_info = self._ensure_dashboard_props(display_name=display_name)
         if last_modified:
             dashboard_info.lastModified = models.ChangeAuditStampsClass(
                 lastModified=models.AuditStampClass(
@@ -114,7 +107,6 @@ class Dashboard(
                 ),
             )
 
-        # Set additional properties
         if description is not None:
             self.set_description(description)
         if display_name is not None:
@@ -129,6 +121,15 @@ class Dashboard(
             self.set_last_modified(last_modified)
         if last_refreshed is not None:
             self.set_last_refreshed(last_refreshed)
+        if input_datasets is not None:
+            self.set_input_datasets(input_datasets)
+        if charts is not None:
+            self.set_charts(charts)
+        if dashboards is not None:
+            self.set_dashboards(dashboards)
+
+        if parent_container is not unset:
+            self._set_container(parent_container)
         if subtype is not None:
             self.set_subtype(subtype)
         if owners is not None:
@@ -141,12 +142,6 @@ class Dashboard(
             self.set_terms(terms)
         if domain is not None:
             self.set_domain(domain)
-        if input_datasets is not None:
-            self.set_input_datasets(input_datasets)
-        if charts is not None:
-            self.set_charts(charts)
-        if dashboards is not None:
-            self.set_dashboards(dashboards)
 
     @classmethod
     def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
@@ -162,11 +157,13 @@ class Dashboard(
         assert isinstance(self._urn, DashboardUrn)
         return self._urn
 
-    def _ensure_dashboard_props(
+    def _ensure_dashboard_props(
+        self, display_name: Optional[str] = None
+    ) -> models.DashboardInfoClass:
         """Get the dashboard properties safely."""
         return self._setdefault_aspect(
             models.DashboardInfoClass(
-                title=self.urn.dashboard_id,
+                title=display_name or self.urn.dashboard_id,
                 description="",
                 lastModified=models.ChangeAuditStampsClass(
                     lastModified=models.AuditStampClass(
@@ -186,60 +183,52 @@ class Dashboard(
         return self.urn.dashboard_id
 
     @property
+    @deprecated("Use display_name instead", version="1.2.0.7")
     def title(self) -> str:
-        """Get the
-        return self.
+        """Get the display name of the dashboard."""
+        return self.display_name
 
+    @deprecated("Use set_display_name instead", version="1.2.0.7")
     def set_title(self, title: str) -> None:
-        """Set the
-
-        props.title = title
-        self._set_aspect(props)
+        """Set the display name of the dashboard."""
+        self.set_display_name(title)
 
     @property
     def description(self) -> Optional[str]:
         """Get the description of the dashboard."""
-
-        return
+        # Because description is a required field, we treat "" as None.
+        return self._ensure_dashboard_props().description or None
 
     def set_description(self, description: str) -> None:
         """Set the description of the dashboard."""
-
-        props.description = description
-        self._set_aspect(props)
+        self._ensure_dashboard_props().description = description
 
     @property
-    def display_name(self) ->
+    def display_name(self) -> str:
         """Get the display name of the dashboard."""
-        return self.title
+        return self._ensure_dashboard_props().title
 
     def set_display_name(self, display_name: str) -> None:
         """Set the display name of the dashboard."""
-        self.
+        self._ensure_dashboard_props().title = display_name
 
     @property
     def external_url(self) -> Optional[str]:
         """Get the external URL of the dashboard."""
-
-        return props.externalUrl
+        return self._ensure_dashboard_props().externalUrl
 
     def set_external_url(self, external_url: str) -> None:
         """Set the external URL of the dashboard."""
-
-        props.externalUrl = external_url
-        self._set_aspect(props)
+        self._ensure_dashboard_props().externalUrl = external_url
 
     @property
     def dashboard_url(self) -> Optional[str]:
         """Get the dashboard URL."""
-
-        return props.dashboardUrl
+        return self._ensure_dashboard_props().dashboardUrl
 
     def set_dashboard_url(self, dashboard_url: str) -> None:
         """Set the dashboard URL."""
-
-        props.dashboardUrl = dashboard_url
-        self._set_aspect(props)
+        self._ensure_dashboard_props().dashboardUrl = dashboard_url
 
     @property
     def custom_properties(self) -> Dict[str, str]:
@@ -249,9 +238,7 @@ class Dashboard(
 
     def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
         """Set the custom properties of the dashboard."""
-
-        props.customProperties = custom_properties
-        self._set_aspect(props)
+        self._ensure_dashboard_props().customProperties = custom_properties
 
     @property
     def last_modified(self) -> Optional[datetime]:
@@ -263,14 +250,12 @@ class Dashboard(
 
     def set_last_modified(self, last_modified: datetime) -> None:
         """Set the last modification timestamp of the dashboard."""
-
-        props.lastModified = models.ChangeAuditStampsClass(
+        self._ensure_dashboard_props().lastModified = models.ChangeAuditStampsClass(
             lastModified=models.AuditStampClass(
                 time=int(last_modified.timestamp()),
                 actor="urn:li:corpuser:datahub",
            ),
         )
-        self._set_aspect(props)
 
     @property
     def last_refreshed(self) -> Optional[datetime]:
@@ -284,9 +269,7 @@ class Dashboard(
 
     def set_last_refreshed(self, last_refreshed: datetime) -> None:
         """Set the last refresh timestamp of the dashboard."""
-
-        props.lastRefreshed = int(last_refreshed.timestamp())
-        self._set_aspect(props)
+        self._ensure_dashboard_props().lastRefreshed = int(last_refreshed.timestamp())
 
     @property
     def input_datasets(self) -> List[DatasetUrn]:
@@ -310,7 +293,6 @@ class Dashboard(
             dataset_urn = DatasetUrn.from_string(dataset)
             dataset_edges.append(models.EdgeClass(destinationUrn=str(dataset_urn)))
         props.datasetEdges = dataset_edges
-        self._set_aspect(props)
 
     def add_input_dataset(self, input_dataset: Union[DatasetUrnOrStr, Dataset]) -> None:
         """Add an input dataset to the dashboard."""
@@ -326,7 +308,6 @@ class Dashboard(
                 models.EdgeClass(destinationUrn=str(input_dataset_urn))
             )
         props.datasetEdges = dataset_edges
-        self._set_aspect(props)
 
     def remove_input_dataset(
         self, input_dataset: Union[DatasetUrnOrStr, Dataset]
@@ -342,7 +323,6 @@ class Dashboard(
             for edge in (props.datasetEdges or [])
             if edge.destinationUrn != str(input_dataset_urn)
         ]
-        self._set_aspect(props)
 
     @property
     def charts(self) -> List[ChartUrn]:
@@ -363,7 +343,6 @@ class Dashboard(
             chart_urn = ChartUrn.from_string(chart)
             chart_edges.append(models.EdgeClass(destinationUrn=str(chart_urn)))
         props.chartEdges = chart_edges
-        self._set_aspect(props)
 
     def add_chart(self, chart: Union[ChartUrnOrStr, Chart]) -> None:
         """Add a chart to the dashboard."""
@@ -381,7 +360,6 @@ class Dashboard(
         if str(chart_urn) not in existing_urns:
             chart_edges.append(models.EdgeClass(destinationUrn=str(chart_urn)))
         props.chartEdges = chart_edges
-        self._set_aspect(props)
 
     def remove_chart(self, chart: Union[ChartUrnOrStr, Chart]) -> None:
         """Remove a chart from the dashboard."""
@@ -395,7 +373,6 @@ class Dashboard(
             for edge in (props.chartEdges or [])
             if edge.destinationUrn != str(chart_urn)
         ]
-        self._set_aspect(props)
 
     @property
     def dashboards(self) -> List[DashboardUrn]:
@@ -417,7 +394,6 @@ class Dashboard(
         else:
             dashboard_urn = DashboardUrn.from_string(dashboard)
         props.dashboards.append(models.EdgeClass(destinationUrn=str(dashboard_urn)))
-        self._set_aspect(props)
 
     def add_dashboard(self, dashboard: Union[DashboardUrnOrStr, Dashboard]) -> None:
         """Add a dashboard to the dashboard."""
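The dashboard.py changes above route every property accessor through _ensure_dashboard_props(), make description optional, move the charts/dashboards/input_datasets wiring into __init__, and add a parent_container argument. A minimal usage sketch of the updated constructor follows; the platform, name, and chart URN values are illustrative assumptions, not taken from this diff.

# Hypothetical usage of the updated Dashboard SDK entity (values are illustrative).
from datahub.sdk.dashboard import Dashboard

dashboard = Dashboard(
    platform="looker",                          # assumed platform
    name="sales_overview",                      # assumed dashboard id
    description="Quarterly sales KPIs",         # now Optional[str]; defaults to None
    charts=["urn:li:chart:(looker,chart_1)"],   # applied via set_charts() during __init__
)

# title is now a deprecated alias for display_name.
assert dashboard.title == dashboard.display_name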
datahub/sql_parsing/sqlglot_lineage.py
CHANGED
@@ -543,16 +543,19 @@ def _select_statement_cll(
     root_scope: sqlglot.optimizer.Scope,
     column_resolver: _ColumnResolver,
     output_table: Optional[_TableName],
+    table_name_schema_mapping: Dict[_TableName, SchemaInfo],
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []
 
     try:
-        # List output columns.
        output_columns = [
            (select_col.alias_or_name, select_col) for select_col in statement.selects
        ]
        logger.debug("output columns: %s", [col[0] for col in output_columns])
-
+
+        for output_col, _original_col_expression in output_columns:
            if not output_col or output_col == "*":
                # If schema information is available, the * will be expanded to the actual columns.
                # Otherwise, we can't process it.
@@ -576,13 +579,14 @@ def _select_statement_cll(
                trim_selects=False,
                # We don't need to pass the schema in here, since we've already qualified the columns.
            )
-            # import pathlib
-            # pathlib.Path("sqlglot.html").write_text(
-            #     str(lineage_node.to_html(dialect=dialect))
-            # )
 
            # Generate SELECT lineage.
-            direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
+            direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
+                lineage_node,
+                dialect,
+                default_db,
+                default_schema,
+            )
 
            # Fuzzy resolve the output column.
            original_col_expression = lineage_node.expression
@@ -601,7 +605,7 @@ def _select_statement_cll(
            if original_col_expression.type:
                output_col_type = original_col_expression.type
 
-            #
+            # Resolve upstream columns - table names should already be qualified from placeholder processing
            direct_resolved_col_upstreams = {
                _ColumnRef(
                    table=edge.table,
@@ -706,6 +710,9 @@ def _column_level_lineage(
        root_scope=root_scope,
        column_resolver=column_resolver,
        output_table=downstream_table,
+        table_name_schema_mapping=table_name_schema_mapping,
+        default_db=default_db,
+        default_schema=default_schema,
    )
 
    joins: Optional[List[_JoinInfo]] = None
@@ -726,6 +733,9 @@ def _column_level_lineage(
 
 def _get_direct_raw_col_upstreams(
     lineage_node: sqlglot.lineage.Node,
+    dialect: Optional[sqlglot.Dialect] = None,
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
 ) -> OrderedSet[_ColumnRef]:
     # Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
     direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
@@ -755,6 +765,53 @@ def _get_direct_raw_col_upstreams(
            direct_raw_col_upstreams.add(
                _ColumnRef(table=table_ref, column=normalized_col)
            )
+        elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
+            # Handle placeholder expressions from lateral joins.
+            #
+            # In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
+            # expressions instead of regular table references. This is critical for lateral join column lineage.
+            #
+            # Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
+            # The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
+            # which we need to parse to establish the lineage: output.value <- my_table2.value
+            #
+            # Without this handling, lateral join column lineage would be incomplete/missing.
+            try:
+                parsed = sqlglot.parse_one(node.name, dialect=dialect)
+                if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
+                    table_ref = _TableName.from_sqlglot_table(
+                        sqlglot.parse_one(
+                            parsed.table, into=sqlglot.exp.Table, dialect=dialect
+                        )
+                    )
+
+                    # SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
+                    # Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
+                    # We need this runtime qualification to ensure proper lineage resolution.
+                    # Only qualify if this appears to be a real table reference (not a temporary construct)
+                    if (
+                        not (table_ref.database or table_ref.db_schema)
+                        and dialect is not None
+                    ):
+                        table_ref = table_ref.qualified(
+                            dialect=dialect,
+                            default_db=default_db,
+                            default_schema=default_schema,
+                        )
+
+                    # Extract column name using proper isinstance check
+                    if isinstance(parsed.this, sqlglot.exp.Identifier):
+                        column_name = parsed.this.name
+                    else:
+                        column_name = str(parsed.this)
+                    direct_raw_col_upstreams.add(
+                        _ColumnRef(table=table_ref, column=column_name)
+                    )
+            except Exception as e:
+                logger.debug(
+                    f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
+                    exc_info=True,
+                )
        else:
            # This branch doesn't matter. For example, a count(*) column would go here, and
            # we don't get any column-level lineage for that.
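The comment block in the hunk above describes the placeholder shape that lateral-join columns take in newer SQLGlot versions. As a standalone illustration (not part of the package), the same parsing steps can be reproduced directly with sqlglot; the placeholder string below is the example shape from the comment.

# Illustrative: parse a lateral-join placeholder name the way the new branch does.
import sqlglot

placeholder_name = '"my_table2"."value"'   # example shape from the comment above

parsed = sqlglot.parse_one(placeholder_name)
assert isinstance(parsed, sqlglot.exp.Column) and parsed.table

table = sqlglot.parse_one(parsed.table, into=sqlglot.exp.Table)
if isinstance(parsed.this, sqlglot.exp.Identifier):
    column_name = parsed.this.name
else:
    column_name = str(parsed.this)

print(table.name, column_name)   # expected: my_table2 value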
@@ -857,7 +914,7 @@ def _get_raw_col_upstreams_for_expression(
            trim_selects=False,
        )
 
-        return _get_direct_raw_col_upstreams(node)
+        return _get_direct_raw_col_upstreams(node, dialect, None, None)
    finally:
        scope.expression = original_expression
 
@@ -872,8 +929,9 @@ def _list_joins(
 
    scope: sqlglot.optimizer.Scope
    for scope in root_scope.traverse():
+        # PART 1: Handle regular explicit JOINs (updated API)
        join: sqlglot.exp.Join
-        for join in scope.find_all(sqlglot.exp.Join):
+        for join in scope.expression.find_all(sqlglot.exp.Join):
            left_side_tables: OrderedSet[_TableName] = OrderedSet()
            from_clause: sqlglot.exp.From
            for from_clause in scope.find_all(sqlglot.exp.From):
@@ -949,6 +1007,36 @@ def _list_joins(
                )
            )
 
+        # Handle LATERAL constructs
+        for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
+            # Get tables from non-lateral FROM clauses
+            qualified_left: OrderedSet[_TableName] = OrderedSet()
+            for from_clause in scope.find_all(sqlglot.exp.From):
+                if not isinstance(from_clause.this, sqlglot.exp.Lateral):
+                    qualified_left.update(
+                        _get_join_side_tables(from_clause.this, dialect, scope)
+                    )
+
+            # Get tables from lateral subquery
+            qualified_right: OrderedSet[_TableName] = OrderedSet()
+            if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
+                qualified_right.update(
+                    _TableName.from_sqlglot_table(t)
+                    for t in lateral.this.find_all(sqlglot.exp.Table)
+                )
+            qualified_right.update(qualified_left)
+
+            if qualified_left and qualified_right:
+                joins.append(
+                    _JoinInfo(
+                        join_type="LATERAL JOIN",
+                        left_tables=list(qualified_left),
+                        right_tables=list(qualified_right),
+                        on_clause=None,
+                        columns_involved=[],
+                    )
+                )
+
    return joins
 
 
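The new loop above relies on SQLGlot exposing LATERAL subqueries as sqlglot.exp.Lateral nodes. A short standalone check (illustrative only; the query and dialect are assumptions, not taken from the package) shows the nodes that loop would walk.

# Illustrative: locate Lateral nodes and the tables referenced inside them.
import sqlglot

sql = (
    "SELECT t2.value FROM my_table1 AS t1, "
    "LATERAL (SELECT value FROM my_table2 WHERE my_table2.id = t1.id) AS t2"
)
statement = sqlglot.parse_one(sql, dialect="postgres")

for lateral in statement.find_all(sqlglot.exp.Lateral):
    tables = [t.name for t in lateral.find_all(sqlglot.exp.Table)]
    # Expect the lateral subquery to reference my_table2.
    print("lateral references:", tables)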
@@ -1186,25 +1274,30 @@ def _translate_internal_joins(
 ) -> List[JoinInfo]:
     joins = []
     for raw_join in raw_joins:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            joins.append(
+                JoinInfo(
+                    join_type=raw_join.join_type,
+                    left_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.left_tables
+                    ],
+                    right_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.right_tables
+                    ],
+                    on_clause=raw_join.on_clause,
+                    columns_involved=[
+                        ColumnRef(
+                            table=table_name_urn_mapping[col.table],
+                            column=col.column,
+                        )
+                        for col in raw_join.columns_involved
+                    ],
+                )
            )
-
+        except KeyError as e:
+            # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
+            logger.debug(f"Skipping join with unresolvable table: {e}")
+            continue
    return joins
 
 
datahub/sql_parsing/sqlglot_utils.py
CHANGED
@@ -115,6 +115,8 @@ def _expression_to_string(
     return expression.sql(dialect=get_dialect(platform))
 
 
+PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")
+
 _BASIC_NORMALIZATION_RULES = {
     # Remove /* */ comments.
     re.compile(r"/\*.*?\*/", re.DOTALL): "",
@@ -130,7 +132,9 @@ _BASIC_NORMALIZATION_RULES = {
     re.compile(r"'[^']*'"): "?",
     # Replace sequences of IN/VALUES with a single placeholder.
     # The r" ?" makes it more robust to uneven spacing.
-    re.compile(
+    re.compile(
+        r"\b(IN|VALUES)\s*\( ?(?:%s|\$\d|\?)(?:, ?(?:%s|\$\d|\?))* ?\)", re.IGNORECASE
+    ): r"\1 (?)",
     # Normalize parenthesis spacing.
     re.compile(r"\( "): "(",
     re.compile(r" \)"): ")",
@@ -139,6 +143,9 @@ _BASIC_NORMALIZATION_RULES = {
     # e.g. "col1,col2" -> "col1, col2"
     re.compile(r"\b ,"): ",",
     re.compile(r"\b,\b"): ", ",
+    # MAKE SURE THAT THIS IS AFTER THE ABOVE REPLACEMENT
+    # Replace all versions of placeholders with generic ? placeholder.
+    PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION: "?",
 }
 _TABLE_NAME_NORMALIZATION_RULES = {
     # Replace UUID-like strings with a placeholder (both - and _ variants).
@@ -262,6 +269,10 @@ def get_query_fingerprint_debug(
        if not fast:
            dialect = get_dialect(platform)
            expression_sql = generalize_query(expression, dialect=dialect)
+            # Normalize placeholders for consistent fingerprinting -> this only needs to be backward compatible with earlier sqglot generated generalized queries where the placeholders were always ?
+            expression_sql = PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub(
+                "?", expression_sql
+            )
        else:
            expression_sql = generalize_query_fast(expression, dialect=platform)
    except (ValueError, sqlglot.errors.SqlglotError) as e:
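The new PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION rule collapses %s, $1-style, and ? placeholders into a single ?, so fingerprints of parameterized queries stay comparable with fingerprints produced by earlier versions where the generalized placeholder was always ?. A quick standalone check of that regex, reproduced here only for illustration:

# Illustrative: the placeholder-normalization regex added in sqlglot_utils.py.
import re

PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")

queries = [
    "SELECT name FROM users WHERE id = %s",
    "SELECT name FROM users WHERE id = $1",
    "SELECT name FROM users WHERE id = ?",
]
normalized = {
    PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub("?", q) for q in queries
}

# All three variants collapse to the same string, so they fingerprint identically.
print(normalized)   # {'SELECT name FROM users WHERE id = ?'}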