acryl-datahub 1.0.0.4rc3__py3-none-any.whl → 1.0.0.4rc5__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.4rc3.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/METADATA +2555 -2555
- {acryl_datahub-1.0.0.4rc3.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/RECORD +13 -12
- datahub/_version.py +1 -1
- datahub/ingestion/source/kafka/kafka.py +11 -67
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/mode.py +12 -5
- datahub/ingestion/source/sql/hive.py +15 -0
- datahub/ingestion/source/superset.py +163 -9
- datahub/sdk/lineage_client.py +134 -7
- {acryl_datahub-1.0.0.4rc3.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.4rc3.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.4rc3.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.4rc3.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/top_level.txt +0 -0
datahub/sdk/lineage_client.py
CHANGED
@@ -4,22 +4,24 @@ import difflib
 import logging
 from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
 
+from typing_extensions import assert_never
+
 import datahub.metadata.schema_classes as models
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.errors import SdkUsageError
-from datahub.metadata.
-from datahub.
-from datahub.sdk._shared import DatasetUrnOrStr
+from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn
+from datahub.sdk._shared import DatajobUrnOrStr, DatasetUrnOrStr
 from datahub.sdk._utils import DEFAULT_ACTOR_URN
 from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
+from datahub.specific.datajob import DataJobPatchBuilder
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.sql_parsing.fingerprint_utils import generate_hash
 from datahub.utilities.ordered_set import OrderedSet
+from datahub.utilities.urns.error import InvalidUrnError
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
 
-logger = logging.getLogger(__name__)
 
 _empty_audit_stamp = models.AuditStampClass(
     time=0,
@@ -27,16 +29,19 @@ _empty_audit_stamp = models.AuditStampClass(
 )
 
 
+logger = logging.getLogger(__name__)
+
+
 class LineageClient:
     def __init__(self, client: DataHubClient):
         self._client = client
 
     def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
         schema_metadata = self._client._graph.get_aspect(
-            str(dataset_urn), SchemaMetadataClass
+            str(dataset_urn), models.SchemaMetadataClass
         )
         if schema_metadata is None:
-            return
+            return set()
 
         return {field.fieldPath for field in schema_metadata.fields}
 
@@ -122,7 +127,7 @@ class LineageClient:
 
         if column_lineage is None:
             cll = None
-        elif column_lineage
+        elif column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
             upstream_schema = self._get_fields_from_dataset_urn(upstream)
             downstream_schema = self._get_fields_from_dataset_urn(downstream)
             if column_lineage == "auto_fuzzy":
@@ -144,6 +149,8 @@
                 downstream=downstream,
                 cll_mapping=column_lineage,
             )
+        else:
+            assert_never(column_lineage)
 
         updater = DatasetPatchBuilder(str(downstream))
         updater.add_upstream_lineage(
@@ -227,9 +234,129 @@ class LineageClient:
             raise SdkUsageError(
                 f"Dataset {updater.urn} does not exist, and hence cannot be updated."
             )
+
         mcps: List[
             Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
         ] = list(updater.build())
         if query_entity:
             mcps.extend(query_entity)
         self._client._graph.emit_mcps(mcps)
+
+    def add_dataset_lineage_from_sql(
+        self,
+        *,
+        query_text: str,
+        platform: str,
+        platform_instance: Optional[str] = None,
+        env: str = "PROD",
+        default_db: Optional[str] = None,
+        default_schema: Optional[str] = None,
+    ) -> None:
+        """Add lineage by parsing a SQL query."""
+        from datahub.sql_parsing.sqlglot_lineage import (
+            create_lineage_sql_parsed_result,
+        )
+
+        # Parse the SQL query to extract lineage information
+        parsed_result = create_lineage_sql_parsed_result(
+            query=query_text,
+            default_db=default_db,
+            default_schema=default_schema,
+            platform=platform,
+            platform_instance=platform_instance,
+            env=env,
+            graph=self._client._graph,
+        )
+
+        if parsed_result.debug_info.table_error:
+            raise SdkUsageError(
+                f"Failed to parse SQL query: {parsed_result.debug_info.error}"
+            )
+        elif parsed_result.debug_info.column_error:
+            logger.warning(
+                f"Failed to parse SQL query: {parsed_result.debug_info.error}",
+            )
+
+        if not parsed_result.out_tables:
+            raise SdkUsageError(
+                "No output tables found in the query. Cannot establish lineage."
+            )
+
+        # Use the first output table as the downstream
+        downstream_urn = parsed_result.out_tables[0]
+
+        # Process all upstream tables found in the query
+        for upstream_table in parsed_result.in_tables:
+            # Skip self-lineage
+            if upstream_table == downstream_urn:
+                continue
+
+            # Extract column-level lineage for this specific upstream table
+            column_mapping = {}
+            if parsed_result.column_lineage:
+                for col_lineage in parsed_result.column_lineage:
+                    if not (col_lineage.downstream and col_lineage.downstream.column):
+                        continue
+
+                    # Filter upstreams to only include columns from current upstream table
+                    upstream_cols = [
+                        ref.column
+                        for ref in col_lineage.upstreams
+                        if ref.table == upstream_table and ref.column
+                    ]
+
+                    if upstream_cols:
+                        column_mapping[col_lineage.downstream.column] = upstream_cols
+
+            # Add lineage, including query text
+            self.add_dataset_transform_lineage(
+                upstream=upstream_table,
+                downstream=downstream_urn,
+                column_lineage=column_mapping or None,
+                query_text=query_text,
+            )
+
+    def add_datajob_lineage(
+        self,
+        *,
+        datajob: DatajobUrnOrStr,
+        upstreams: Optional[List[Union[DatasetUrnOrStr, DatajobUrnOrStr]]] = None,
+        downstreams: Optional[List[DatasetUrnOrStr]] = None,
+    ) -> None:
+        """
+        Add lineage between a datajob and datasets/datajobs.
+
+        Args:
+            datajob: The datajob URN to connect lineage with
+            upstreams: List of upstream datasets or datajobs that serve as inputs to the datajob
+            downstreams: List of downstream datasets that are outputs of the datajob
+        """
+
+        if not upstreams and not downstreams:
+            raise SdkUsageError("No upstreams or downstreams provided")
+
+        datajob_urn = DataJobUrn.from_string(datajob)
+
+        # Initialize the patch builder for the datajob
+        patch_builder = DataJobPatchBuilder(str(datajob_urn))
+
+        # Process upstream connections (inputs to the datajob)
+        if upstreams:
+            for upstream in upstreams:
+                # try converting to dataset urn
+                try:
+                    dataset_urn = DatasetUrn.from_string(upstream)
+                    patch_builder.add_input_dataset(dataset_urn)
+                except InvalidUrnError:
+                    # try converting to datajob urn
+                    datajob_urn = DataJobUrn.from_string(upstream)
+                    patch_builder.add_input_datajob(datajob_urn)
+
+        # Process downstream connections (outputs from the datajob)
+        if downstreams:
+            for downstream in downstreams:
+                downstream_urn = DatasetUrn.from_string(downstream)
+                patch_builder.add_output_dataset(downstream_urn)
+
+        # Apply the changes to the entity
+        self._client.entities.update(patch_builder)
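The new `else: assert_never(column_lineage)` branch makes the `column_lineage` dispatch exhaustive: a type checker flags the `assert_never` call if a variant of the parameter's type has no matching branch, and the call raises at runtime if it is ever reached. A minimal sketch of the pattern, using an illustrative `ColumnLineage` alias that merely stands in for the real parameter type:

from typing import Literal, Union

from typing_extensions import assert_never

# Illustrative stand-in for the real parameter type: a preset mode
# or an explicit per-column mapping.
ColumnLineage = Union[Literal["auto_fuzzy", "auto_strict"], dict]


def describe(column_lineage: ColumnLineage) -> str:
    if column_lineage == "auto_fuzzy":
        return "fuzzy column-name matching between schemas"
    elif column_lineage == "auto_strict":
        return "exact column-name matching between schemas"
    elif isinstance(column_lineage, dict):
        return "explicit column mapping"
    else:
        # Unreachable if every variant is handled above; a type checker
        # errors here when a new variant is added to ColumnLineage, and
        # the call raises AssertionError if reached at runtime.
        assert_never(column_lineage)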
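The new `add_dataset_lineage_from_sql` method derives lineage directly from a SQL statement: it parses the query with `create_lineage_sql_parsed_result`, treats the first output table as the downstream, and emits table- and column-level lineage from each distinct input table. Table-level parse failures and queries with no output table raise `SdkUsageError`; column-level parse failures only log a warning. A minimal usage sketch, assuming a reachable DataHub instance and that `DataHubClient` exposes this client as `client.lineage`; the server URL, token, and table names are placeholders:

from datahub.sdk.main_client import DataHubClient

# Placeholder connection details for a reachable DataHub instance.
client = DataHubClient(server="http://localhost:8080", token="<token>")

# Table- and column-level lineage is inferred from the parsed query:
# raw_orders and raw_customers become upstreams of sales_summary.
client.lineage.add_dataset_lineage_from_sql(
    query_text="""
        CREATE TABLE sales_summary AS
        SELECT o.region, c.segment, SUM(o.amount) AS total_amount
        FROM raw_orders o
        JOIN raw_customers c ON o.customer_id = c.id
        GROUP BY o.region, c.segment
    """,
    platform="snowflake",
    default_db="analytics",
    default_schema="public",
)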
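The new `add_datajob_lineage` method wires a data job into the lineage graph via `DataJobPatchBuilder`: each upstream string is first parsed as a dataset URN and, on `InvalidUrnError`, retried as a datajob URN, while downstreams must be dataset URNs; passing neither list raises `SdkUsageError`. Continuing with the same client, a sketch with illustrative URNs:

client.lineage.add_datajob_lineage(
    datajob="urn:li:dataJob:(urn:li:dataFlow:(airflow,daily_etl,PROD),build_sales_summary)",
    upstreams=[
        # A dataset input, parsed as a dataset URN...
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.public.raw_orders,PROD)",
        # ...and an upstream job, accepted after dataset parsing fails.
        "urn:li:dataJob:(urn:li:dataFlow:(airflow,daily_etl,PROD),extract_orders)",
    ],
    downstreams=[
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.public.sales_summary,PROD)",
    ],
)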
Files without changes: WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt