acryl-datahub 1.0.0.3rc6__py3-none-any.whl → 1.0.0.3rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/METADATA +2517 -2517
- {acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/RECORD +22 -20
- {acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/emitter/mcp.py +15 -4
- datahub/errors.py +4 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +8 -62
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/sdk/_shared.py +3 -5
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +4 -1
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +2 -6
- {acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/top_level.txt +0 -0
datahub/sdk/lineage_client.py
ADDED

@@ -0,0 +1,235 @@
+from __future__ import annotations
+
+import difflib
+import logging
+from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
+
+import datahub.metadata.schema_classes as models
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.errors import SdkUsageError
+from datahub.metadata.schema_classes import SchemaMetadataClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn
+from datahub.sdk._shared import DatasetUrnOrStr
+from datahub.sdk._utils import DEFAULT_ACTOR_URN
+from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
+from datahub.specific.dataset import DatasetPatchBuilder
+from datahub.sql_parsing.fingerprint_utils import generate_hash
+from datahub.utilities.ordered_set import OrderedSet
+
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+
+logger = logging.getLogger(__name__)
+
+_empty_audit_stamp = models.AuditStampClass(
+    time=0,
+    actor=DEFAULT_ACTOR_URN,
+)
+
+
+class LineageClient:
+    def __init__(self, client: DataHubClient):
+        self._client = client
+
+    def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
+        schema_metadata = self._client._graph.get_aspect(
+            str(dataset_urn), SchemaMetadataClass
+        )
+        if schema_metadata is None:
+            return set()
+
+        return {field.fieldPath for field in schema_metadata.fields}
+
+    @classmethod
+    def _get_strict_column_lineage(
+        cls,
+        upstream_fields: Set[str],
+        downstream_fields: Set[str],
+    ) -> ColumnLineageMapping:
+        """Find matches between upstream and downstream fields with case-insensitive matching."""
+        strict_column_lineage: ColumnLineageMapping = {}
+
+        # Create case-insensitive mapping of upstream fields
+        case_insensitive_map = {field.lower(): field for field in upstream_fields}
+
+        # Match downstream fields using case-insensitive comparison
+        for downstream_field in downstream_fields:
+            lower_field = downstream_field.lower()
+            if lower_field in case_insensitive_map:
+                # Use the original case of the upstream field
+                strict_column_lineage[downstream_field] = [
+                    case_insensitive_map[lower_field]
+                ]
+
+        return strict_column_lineage
+
+    @classmethod
+    def _get_fuzzy_column_lineage(
+        cls,
+        upstream_fields: Set[str],
+        downstream_fields: Set[str],
+    ) -> ColumnLineageMapping:
+        """Generate fuzzy matches between upstream and downstream fields."""
+
+        # Simple normalization function for better matching
+        def normalize(s: str) -> str:
+            return s.lower().replace("_", "")
+
+        # Create normalized lookup for upstream fields
+        normalized_upstream = {normalize(field): field for field in upstream_fields}
+
+        fuzzy_column_lineage = {}
+        for downstream_field in downstream_fields:
+            # Try exact match first
+            if downstream_field in upstream_fields:
+                fuzzy_column_lineage[downstream_field] = [downstream_field]
+                continue
+
+            # Try normalized match
+            norm_downstream = normalize(downstream_field)
+            if norm_downstream in normalized_upstream:
+                fuzzy_column_lineage[downstream_field] = [
+                    normalized_upstream[norm_downstream]
+                ]
+                continue
+
+            # If no direct match, find closest match using similarity
+            matches = difflib.get_close_matches(
+                norm_downstream,
+                normalized_upstream.keys(),
+                n=1,  # Return only the best match
+                cutoff=0.8,  # Adjust cutoff for sensitivity
+            )
+
+            if matches:
+                fuzzy_column_lineage[downstream_field] = [
+                    normalized_upstream[matches[0]]
+                ]
+
+        return fuzzy_column_lineage
+
+    def add_dataset_copy_lineage(
+        self,
+        *,
+        upstream: DatasetUrnOrStr,
+        downstream: DatasetUrnOrStr,
+        column_lineage: Union[
+            None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
+        ] = "auto_fuzzy",
+    ) -> None:
+        upstream = DatasetUrn.from_string(upstream)
+        downstream = DatasetUrn.from_string(downstream)
+
+        if column_lineage is None:
+            cll = None
+        elif column_lineage in ["auto_fuzzy", "auto_strict"]:
+            upstream_schema = self._get_fields_from_dataset_urn(upstream)
+            downstream_schema = self._get_fields_from_dataset_urn(downstream)
+            if column_lineage == "auto_fuzzy":
+                mapping = self._get_fuzzy_column_lineage(
+                    upstream_schema, downstream_schema
+                )
+            else:
+                mapping = self._get_strict_column_lineage(
+                    upstream_schema, downstream_schema
+                )
+            cll = parse_cll_mapping(
+                upstream=upstream,
+                downstream=downstream,
+                cll_mapping=mapping,
+            )
+        elif isinstance(column_lineage, dict):
+            cll = parse_cll_mapping(
+                upstream=upstream,
+                downstream=downstream,
+                cll_mapping=column_lineage,
+            )
+
+        updater = DatasetPatchBuilder(str(downstream))
+        updater.add_upstream_lineage(
+            models.UpstreamClass(
+                dataset=str(upstream),
+                type=models.DatasetLineageTypeClass.COPY,
+            )
+        )
+        for cl in cll or []:
+            updater.add_fine_grained_upstream_lineage(cl)
+
+        self._client.entities.update(updater)
+
+    def add_dataset_transform_lineage(
+        self,
+        *,
+        upstream: DatasetUrnOrStr,
+        downstream: DatasetUrnOrStr,
+        column_lineage: Optional[ColumnLineageMapping] = None,
+        query_text: Optional[str] = None,
+    ) -> None:
+        upstream = DatasetUrn.from_string(upstream)
+        downstream = DatasetUrn.from_string(downstream)
+
+        cll = None
+        if column_lineage is not None:
+            cll = parse_cll_mapping(
+                upstream=upstream,
+                downstream=downstream,
+                cll_mapping=column_lineage,
+            )
+
+        fields_involved = OrderedSet([str(upstream), str(downstream)])
+        if cll is not None:
+            for c in cll:
+                for field in c.upstreams or []:
+                    fields_involved.add(field)
+                for field in c.downstreams or []:
+                    fields_involved.add(field)
+
+        query_urn = None
+        query_entity = None
+        if query_text:
+            # Eventually we might want to use our regex-based fingerprinting instead.
+            fingerprint = generate_hash(query_text)
+            query_urn = QueryUrn(fingerprint).urn()
+
+            from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects
+
+            query_entity = MetadataChangeProposalWrapper.construct_many(
+                query_urn,
+                aspects=[
+                    models.QueryPropertiesClass(
+                        statement=models.QueryStatementClass(
+                            value=query_text, language=models.QueryLanguageClass.SQL
+                        ),
+                        source=models.QuerySourceClass.SYSTEM,
+                        created=_empty_audit_stamp,
+                        lastModified=_empty_audit_stamp,
+                    ),
+                    make_query_subjects(list(fields_involved)),
+                ],
+            )
+
+        updater = DatasetPatchBuilder(str(downstream))
+        updater.add_upstream_lineage(
+            models.UpstreamClass(
+                dataset=str(upstream),
+                type=models.DatasetLineageTypeClass.TRANSFORMED,
+                query=query_urn,
+            )
+        )
+        for cl in cll or []:
+            cl.query = query_urn
+            updater.add_fine_grained_upstream_lineage(cl)
+
+        # Throw if the dataset does not exist.
+        # We need to manually call .build() instead of reusing client.update()
+        # so that we make just one emit_mcps call.
+        if not self._client._graph.exists(updater.urn):
+            raise SdkUsageError(
+                f"Dataset {updater.urn} does not exist, and hence cannot be updated."
+            )
+        mcps: List[
+            Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
+        ] = list(updater.build())
+        if query_entity:
+            mcps.extend(query_entity)
+        self._client._graph.emit_mcps(mcps)
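In `_get_fuzzy_column_lineage` above, matching proceeds in three passes: exact field name, then a lowercased and underscore-stripped comparison, then a `difflib.get_close_matches` lookup with a 0.8 similarity cutoff. A self-contained sketch of that strategy (the standalone function name and sample columns are illustrative, not part of the package):

    import difflib
    from typing import Dict, List, Set

    def fuzzy_match(upstream: Set[str], downstream: Set[str]) -> Dict[str, List[str]]:
        def normalize(s: str) -> str:
            return s.lower().replace("_", "")

        normalized_upstream = {normalize(f): f for f in upstream}
        mapping: Dict[str, List[str]] = {}
        for field in downstream:
            if field in upstream:
                # Pass 1: exact name match.
                mapping[field] = [field]
            elif normalize(field) in normalized_upstream:
                # Pass 2: case- and underscore-insensitive match.
                mapping[field] = [normalized_upstream[normalize(field)]]
            else:
                # Pass 3: closest normalized match above the similarity cutoff.
                close = difflib.get_close_matches(
                    normalize(field), normalized_upstream.keys(), n=1, cutoff=0.8
                )
                if close:
                    mapping[field] = [normalized_upstream[close[0]]]
        return mapping

    # "userId" ~ "user_id" and "created_at" ~ "CreatedAt" after normalization;
    # "zip" has no sufficiently close upstream, so it is left unmapped.
    print(fuzzy_match({"user_id", "CreatedAt"}, {"userId", "created_at", "zip"}))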
datahub/sdk/main_client.py
CHANGED

@@ -6,6 +6,7 @@ from datahub.errors import SdkUsageError
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.sdk.entity_client import EntityClient
+from datahub.sdk.lineage_client import LineageClient
 from datahub.sdk.resolver_client import ResolverClient
 from datahub.sdk.search_client import SearchClient

@@ -99,4 +100,6 @@ class DataHubClient:
     def search(self) -> SearchClient:
         return SearchClient(self)

-
+    @property
+    def lineage(self) -> LineageClient:
+        return LineageClient(self)
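With the `lineage` accessor wired into `DataHubClient`, the new methods are reachable directly from a client instance. A rough usage sketch, assuming the keyword-style constructor implied by main_client.py; the server address, dataset URNs, column names, and SQL are all placeholders:

    from datahub.sdk.main_client import DataHubClient

    client = DataHubClient(server="http://localhost:8080")  # placeholder address

    upstream = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders_raw,PROD)"
    downstream = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)"

    # Copy lineage; column_lineage defaults to "auto_fuzzy" schema matching.
    client.lineage.add_dataset_copy_lineage(upstream=upstream, downstream=downstream)

    # Transform lineage with an explicit downstream -> upstream column mapping,
    # plus a query entity fingerprinted from the SQL text.
    client.lineage.add_dataset_transform_lineage(
        upstream=upstream,
        downstream=downstream,
        column_lineage={"total": ["amount", "tax"]},
        query_text="INSERT INTO orders SELECT amount + tax AS total FROM orders_raw",
    )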
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED

@@ -32,6 +32,7 @@ from datahub.metadata.urns import (
     SchemaFieldUrn,
     Urn,
 )
+from datahub.sql_parsing.fingerprint_utils import generate_hash
 from datahub.sql_parsing.schema_resolver import (
     SchemaResolver,
     SchemaResolverInterface,
@@ -49,7 +50,6 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.sql_parsing.sqlglot_utils import (
     _parse_statement,
-    generate_hash,
     get_query_fingerprint,
     try_format_query,
 )
@@ -155,6 +155,47 @@ class QueryMetadata:
             actor=(self.actor or _DEFAULT_USER_URN).urn(),
         )

+    def get_subjects(
+        self,
+        downstream_urn: Optional[str],
+        include_fields: bool,
+    ) -> List[UrnStr]:
+        query_subject_urns = OrderedSet[UrnStr]()
+        for upstream in self.upstreams:
+            query_subject_urns.add(upstream)
+            if include_fields:
+                for column in sorted(self.column_usage.get(upstream, [])):
+                    query_subject_urns.add(
+                        builder.make_schema_field_urn(upstream, column)
+                    )
+        if downstream_urn:
+            query_subject_urns.add(downstream_urn)
+            if include_fields:
+                for column_lineage in self.column_lineage:
+                    query_subject_urns.add(
+                        builder.make_schema_field_urn(
+                            downstream_urn, column_lineage.downstream.column
+                        )
+                    )
+        return list(query_subject_urns)
+
+    def make_query_properties(self) -> models.QueryPropertiesClass:
+        return models.QueryPropertiesClass(
+            statement=models.QueryStatementClass(
+                value=self.formatted_query_string,
+                language=models.QueryLanguageClass.SQL,
+            ),
+            source=models.QuerySourceClass.SYSTEM,
+            created=self.make_created_audit_stamp(),
+            lastModified=self.make_last_modified_audit_stamp(),
+        )
+
+
+def make_query_subjects(urns: List[UrnStr]) -> models.QuerySubjectsClass:
+    return models.QuerySubjectsClass(
+        subjects=[models.QuerySubjectClass(entity=urn) for urn in urns]
+    )
+

 @dataclasses.dataclass
 class KnownQueryLineageInfo:
@@ -1440,42 +1481,15 @@ class SqlParsingAggregator(Closeable):
             self.report.num_queries_skipped_due_to_filters += 1
             return

-        query_subject_urns = OrderedSet[UrnStr]()
-        for upstream in query.upstreams:
-            query_subject_urns.add(upstream)
-            if self.generate_query_subject_fields:
-                for column in sorted(query.column_usage.get(upstream, [])):
-                    query_subject_urns.add(
-                        builder.make_schema_field_urn(upstream, column)
-                    )
-        if downstream_urn:
-            query_subject_urns.add(downstream_urn)
-            if self.generate_query_subject_fields:
-                for column_lineage in query.column_lineage:
-                    query_subject_urns.add(
-                        builder.make_schema_field_urn(
-                            downstream_urn, column_lineage.downstream.column
-                        )
-                    )
-
         yield from MetadataChangeProposalWrapper.construct_many(
             entityUrn=self._query_urn(query_id),
             aspects=[
-                models.QueryPropertiesClass(
-                    statement=models.QueryStatementClass(
-                        value=query.formatted_query_string,
-                        language=models.QueryLanguageClass.SQL,
-                    ),
-                    source=models.QuerySourceClass.SYSTEM,
-                    created=query.make_created_audit_stamp(),
-                    lastModified=query.make_last_modified_audit_stamp(),
-                    origin=query.origin.urn() if query.origin else None,
-                ),
-                models.QuerySubjectsClass(
-                    subjects=[
-                        models.QuerySubjectClass(entity=urn)
-                        for urn in query_subject_urns
-                    ]
+                query.make_query_properties(),
+                make_query_subjects(
+                    query.get_subjects(
+                        downstream_urn=downstream_urn,
+                        include_fields=self.generate_query_subject_fields,
+                    )
                 ),
                 models.DataPlatformInstanceClass(
                     platform=self.platform.urn(),
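This refactor moves query-subject computation onto `QueryMetadata.get_subjects` and exposes `make_query_subjects` at module level, which is what lets the new lineage client reuse it. A small sketch of what the helper returns (the URNs are illustrative):

    import datahub.metadata.schema_classes as models
    from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects

    dataset = "urn:li:dataset:(urn:li:dataPlatform:hive,db.src,PROD)"
    field = f"urn:li:schemaField:({dataset},id)"

    aspect = make_query_subjects([dataset, field])
    assert isinstance(aspect, models.QuerySubjectsClass)
    # One QuerySubjectClass per input urn, wrapped in a QuerySubjects aspect.
    print([subject.entity for subject in aspect.subjects])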
datahub/sql_parsing/sqlglot_utils.py
CHANGED

@@ -1,7 +1,6 @@
 from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED

 import functools
-import hashlib
 import logging
 import re
 from typing import Dict, Iterable, Optional, Tuple, Union
@@ -10,6 +9,8 @@ import sqlglot
 import sqlglot.errors
 import sqlglot.optimizer.eliminate_ctes

+from datahub.sql_parsing.fingerprint_utils import generate_hash
+
 assert SQLGLOT_PATCHED

 logger = logging.getLogger(__name__)
@@ -251,11 +252,6 @@ def generalize_query(expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr) ->
     return expression.transform(_strip_expression, copy=True).sql(dialect=dialect)


-def generate_hash(text: str) -> str:
-    # Once we move to Python 3.9+, we can set `usedforsecurity=False`.
-    return hashlib.sha256(text.encode("utf-8")).hexdigest()
-
-
 def get_query_fingerprint_debug(
     expression: sqlglot.exp.ExpOrStr,
     platform: DialectOrStr,
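`generate_hash` now lives in `datahub/sql_parsing/fingerprint_utils.py` (the new +6-line file in the summary above), and sqlglot_utils re-imports it so existing callers keep working. A quick sanity check, assuming the relocated implementation is the same SHA-256 hexdigest that was deleted here:

    import hashlib

    from datahub.sql_parsing.fingerprint_utils import generate_hash

    text = "SELECT 1"
    assert generate_hash(text) == hashlib.sha256(text.encode("utf-8")).hexdigest()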
{acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/entry_points.txt
File without changes

{acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/licenses/LICENSE
File without changes

{acryl_datahub-1.0.0.3rc6.dist-info → acryl_datahub-1.0.0.3rc8.dist-info}/top_level.txt
File without changes