acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2424 -2424
- {acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +14 -13
- datahub/__init__.py +1 -1
- datahub/emitter/rest_emitter.py +16 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
- datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +23 -28
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- {acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=gbsVKK_ULsM259cMG08Rrx6A9_72Iy7zxyDkQZ37NCw,576
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
 datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
 datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
 datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
-datahub/emitter/rest_emitter.py,sha256=
+datahub/emitter/rest_emitter.py,sha256=d5Zjo3GXDu9rUqlSsK9aOx-yEbHjDFZHelfq_ZFeb5M,16393
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -144,6 +144,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
 datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
 datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
+datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
 datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
 datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -431,19 +432,19 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
 datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
-datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=uMGmMEl4hWEmN7GxMyDBdwlIPAW7WmOnu41kZ0dvCG4,21551
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
-datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=K-KEr3OpwMHye08lXAy-5doUUGoGJP3b-ntJAGU_NBY,42472
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
 datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
 datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
 datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
-datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=ecaTCJNAQ_IJOPInPGXA3jv1dE5lztSU82UhpBygiq0,31654
 datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
 datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
@@ -505,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=2-pYQ-3B9UVUwO1yB9iTdi3DqgqZ2JrpQ
 datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
 datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
 datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
-datahub/ingestion/source/unity/source.py,sha256=
+datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
 datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
@@ -516,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
 datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
 datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source_report/ingestion_stage.py,sha256=
+datahub/ingestion/source_report/ingestion_stage.py,sha256=pJcJeLSjaixlLqQyQtE3bfUcvXEVwrSaWWtU4iU9UEo,1557
 datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
 datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
 datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -980,8 +981,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
+acryl_datahub-0.15.0.1rc2.dist-info/METADATA,sha256=gLix1LBWIrfQF-dcU1JsLeAAFkHuALY9dHVboVmjIJg,173642
+acryl_datahub-0.15.0.1rc2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0.1rc2.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc2.dist-info/RECORD,,
datahub/__init__.py CHANGED
datahub/emitter/rest_emitter.py CHANGED
@@ -291,6 +291,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        logger.debug("Attempting to emit batch mcps")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +304,22 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            logger.debug(
+                f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+            )

             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
+                logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        logger.debug(
+            f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
+        )

         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +346,15 @@ class DataHubRestEmitter(Closeable, Emitter):

     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:
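For context on the hunk above: the emitter packs serialized MCP objects into chunks, opening a new chunk whenever the next object would push the running byte total past the payload budget or the per-batch item cap. The following is a minimal standalone sketch of that rule, not the package's code; the constants and the list initialisation are illustrative assumptions (the real values and setup sit outside the lines shown in this diff).

```python
import json
from typing import Any, Dict, List

MAX_PAYLOAD_BYTES = 1000  # stand-in for INGEST_MAX_PAYLOAD_BYTES (illustrative)
MAX_BATCH_LENGTH = 3      # stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH (illustrative)


def chunk_mcp_objs(mcp_objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = []  # assumed initialisation; not shown in the hunk
    # Starting the counter at the limit forces the first object to open a chunk,
    # mirroring `current_chunk_size = INGEST_MAX_PAYLOAD_BYTES` in the diff.
    current_chunk_size = MAX_PAYLOAD_BYTES
    for obj in mcp_objs:
        obj_size = len(json.dumps(obj))
        if (
            obj_size + current_chunk_size > MAX_PAYLOAD_BYTES
            or len(chunks[-1]) >= MAX_BATCH_LENGTH
        ):
            chunks.append([])
            current_chunk_size = 0
        chunks[-1].append(obj)
        current_chunk_size += obj_size
    return chunks


# Five small objects under a per-batch cap of 3 land in two chunks: prints [3, 2].
print([len(c) for c in chunk_mcp_objs([{"aspectName": "status", "i": i} for i in range(5)])])
```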
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py ADDED
@@ -0,0 +1,96 @@
+import json
+import logging
+from typing import Iterable, List
+
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+
+        schema.fields = accepted_fields
+
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception
+        on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu
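To illustrate the behaviour of the new processor, here is a small hedged sketch of ensure_dataset_profile_size in isolation. The URN, sample values, and the tiny payload_constraint are made-up inputs chosen so the second field overflows the budget; the real default budget is INGEST_MAX_PAYLOAD_BYTES.

```python
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
    EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.source import SourceReport
from datahub.metadata.schema_classes import (
    DatasetFieldProfileClass,
    DatasetProfileClass,
)

report = SourceReport()
profile = DatasetProfileClass(
    timestampMillis=0,
    fieldProfiles=[
        DatasetFieldProfileClass(fieldPath="col_a", sampleValues=["x" * 50]),
        DatasetFieldProfileClass(fieldPath="col_b", sampleValues=["y" * 50]),
    ],
)

# A 60-byte budget (illustrative) admits col_a's samples but not col_b's.
processor = EnsureAspectSizeProcessor(report, payload_constraint=60)
processor.ensure_dataset_profile_size(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)", profile
)

print(profile.fieldProfiles[0].sampleValues)  # kept
print(profile.fieldProfiles[1].sampleValues)  # [] - trimmed, with a structured warning on the report
```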
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py CHANGED
@@ -265,64 +265,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         with PerfTimer() as timer:
             self.report.num_external_table_edges_scanned = 0

-            for (
-
-
-                self.sql_aggregator.add(known_lineage_mapping)
-            logger.info(
-                "Done populating external lineage from copy history. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
-
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_show_query(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-
-            logger.info(
-                "Done populating external lineage from show external tables. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
+            for entry in self._get_copy_history_lineage(discovered_tables):
+                self.sql_aggregator.add(entry)
+            logger.info("Done populating external lineage from copy history. ")

         self.report.external_lineage_queries_secs = timer.elapsed_seconds()

-    # Handles the case for explicitly created external tables.
-    # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_show_query(
-        self, discovered_tables: List[str]
-    ) -> Iterable[KnownLineageMapping]:
-        external_tables_query: str = SnowflakeQuery.show_external_tables()
-        try:
-            for db_row in self.connection.query(external_tables_query):
-                key = self.identifiers.get_dataset_identifier(
-                    db_row["name"], db_row["schema_name"], db_row["database_name"]
-                )
-
-                if key not in discovered_tables:
-                    continue
-                if db_row["location"].startswith("s3://"):
-                    yield KnownLineageMapping(
-                        upstream_urn=make_s3_urn_for_lineage(
-                            db_row["location"], self.config.env
-                        ),
-                        downstream_urn=self.identifiers.gen_dataset_urn(key),
-                    )
-                    self.report.num_external_table_edges_scanned += 1
-
-                self.report.num_external_table_edges_scanned += 1
-        except Exception as e:
-            logger.debug(e, exc_info=e)
-            self.structured_reporter.warning(
-                "Error populating external table lineage from Snowflake",
-                exc=e,
-            )
-            self.report_status(EXTERNAL_LINEAGE, False)
-
     # Handles the case where a table is populated from an external stage/s3 location via copy.
     # Eg: copy into category_english from @external_s3_stage;
     # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
     # NOTE: Snowflake does not log this information to the access_history table.
-    def
+    def _get_copy_history_lineage(
         self, discovered_tables: List[str]
     ) -> Iterable[KnownLineageMapping]:
         query: str = SnowflakeQuery.copy_lineage_history(
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -247,9 +247,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             for entry in self.fetch_copy_history():
                 queries.append(entry)

-        # TODO: Add "show external tables" lineage to the main schema extractor.
-        # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
-
         with self.report.query_log_fetch_timer:
             for entry in self.fetch_query_log():
                 queries.append(entry)
datahub/ingestion/source/snowflake/snowflake_schema_gen.py CHANGED
@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
     classification_workunit_processor,
 )
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
 )
 from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
 from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
+from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
 from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.snowflake.snowflake_schema import (
     SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     get_domain_wu,
 )
 from datahub.ingestion.source_report.ingestion_stage import (
+    EXTERNAL_TABLE_DDL_LINEAGE,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
-from datahub.sql_parsing.sql_parsing_aggregator import
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownLineageMapping,
+    SqlParsingAggregator,
+)
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
-
+
+        self.aggregator = aggregator

     def get_connection(self) -> SnowflakeConnection:
         return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
                 yield from self._process_database(snowflake_db)

+            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
+            discovered_tables: List[str] = [
+                self.identifiers.get_dataset_identifier(
+                    table_name, schema.name, db.name
+                )
+                for db in self.databases
+                for schema in db.schemas
+                for table_name in schema.tables
+            ]
+            if self.aggregator:
+                for entry in self._external_tables_ddl_lineage(discovered_tables):
+                    self.aggregator.add(entry)
+
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
@@ -1082,3 +1102,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # Access to table but none of its constraints - is this possible ?
         return constraints.get(table_name, [])
+
+    # Handles the case for explicitly created external tables.
+    # NOTE: Snowflake does not log this information to the access_history table.
+    def _external_tables_ddl_lineage(
+        self, discovered_tables: List[str]
+    ) -> Iterable[KnownLineageMapping]:
+        external_tables_query: str = SnowflakeQuery.show_external_tables()
+        try:
+            for db_row in self.connection.query(external_tables_query):
+                key = self.identifiers.get_dataset_identifier(
+                    db_row["name"], db_row["schema_name"], db_row["database_name"]
+                )
+
+                if key not in discovered_tables:
+                    continue
+                if db_row["location"].startswith("s3://"):
+                    yield KnownLineageMapping(
+                        upstream_urn=make_s3_urn_for_lineage(
+                            db_row["location"], self.config.env
+                        ),
+                        downstream_urn=self.identifiers.gen_dataset_urn(key),
+                    )
+                    self.report.num_external_table_edges_scanned += 1
+
+                self.report.num_external_table_edges_scanned += 1
+        except Exception as e:
+            self.structured_reporter.warning(
+                "External table ddl lineage extraction failed",
+                exc=e,
+            )
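The relocated helper above yields one KnownLineageMapping per S3-backed external table, linking the stage location to the Snowflake dataset. A hedged sketch of what such an entry looks like, using made-up bucket and dataset URNs:

```python
from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping

# Both URNs below are illustrative; real values come from SHOW EXTERNAL TABLES rows
# and the source's identifier helpers.
mapping = KnownLineageMapping(
    upstream_urn=make_s3_urn_for_lineage("s3://example-bucket/raw/category_english", "PROD"),
    downstream_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.category_english,PROD)",
)
print(mapping.upstream_urn)  # an urn:li:dataset:(urn:li:dataPlatform:s3,...) URN for the stage location
```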
datahub/ingestion/source/snowflake/snowflake_v2.py CHANGED
@@ -161,35 +161,32 @@ class SnowflakeV2Source(
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                generate_operations=False,
-                format_queries=self.config.format_sql_queries,
-            )
+
+        self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
+            SqlParsingAggregator(
+                platform=self.identifiers.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                graph=self.ctx.graph,
+                eager_graph_load=(
+                    # If we're ingestion schema metadata for tables/views, then we will populate
+                    # schemas into the resolver as we go. We only need to do a bulk fetch
+                    # if we're not ingesting schema metadata as part of ingestion.
+                    not (
+                        self.config.include_technical_schema
+                        and self.config.include_tables
+                        and self.config.include_views
+                    )
+                    and not self.config.lazy_schema_resolver
+                ),
+                generate_usage_statistics=False,
+                generate_operations=False,
+                format_queries=self.config.format_sql_queries,
             )
-
+        )
+        self.report.sql_aggregator = self.aggregator.report

         if self.config.include_table_lineage:
-            assert self.aggregator is not None
             redundant_lineage_run_skip_handler: Optional[
                 RedundantLineageRunSkipHandler
             ] = None
@@ -487,8 +484,6 @@ class SnowflakeV2Source(

         databases = schema_extractor.databases

-        # TODO: The checkpoint state for stale entity detection can be committed here.
-
         if self.config.shares:
             yield from SnowflakeSharesHandler(
                 self.config, self.report
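The eager_graph_load expression above decides whether the new aggregator bulk-loads schemas from the DataHub graph up front. A small standalone illustration follows; the plain booleans standing in for the config fields are assumptions made for the example.

```python
def eager_graph_load(
    include_technical_schema: bool,
    include_tables: bool,
    include_views: bool,
    lazy_schema_resolver: bool,
) -> bool:
    # Bulk-fetch only when schemas won't all be populated during this run
    # and lazy, on-demand resolution is disabled.
    return (
        not (include_technical_schema and include_tables and include_views)
        and not lazy_schema_resolver
    )


print(eager_graph_load(True, True, True, False))   # False: schemas arrive as tables/views are ingested
print(eager_graph_load(True, True, False, False))  # True: views skipped, so pre-load from the graph
print(eager_graph_load(True, True, False, True))   # False: the lazy resolver fetches schemas on demand
```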
datahub/ingestion/source/unity/source.py CHANGED
@@ -26,6 +26,9 @@ from datahub.emitter.mcp_builder import (
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -260,6 +263,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source_report/ingestion_stage.py CHANGED
@@ -14,6 +14,7 @@ LINEAGE_EXTRACTION = "Lineage Extraction"
 USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion"
 USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats"
 USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation"
+EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage"
 QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"

{acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL RENAMED
File without changes
{acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt RENAMED
File without changes
{acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt RENAMED
File without changes