acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
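The first block below is the diff of the wheel's RECORD file, which lists every file in the package together with its sha256 digest and size; the per-file source diffs follow. As a side note, this comparison can be reproduced locally with a short script. A minimal sketch, assuming both wheels have already been downloaded (the file names below are illustrative):

# Minimal sketch: compare the RECORD files of two locally downloaded wheels.
import csv
import io
import zipfile

OLD_WHEEL = "acryl_datahub-0.15.0.1rc1-py3-none-any.whl"  # assumed local file name
NEW_WHEEL = "acryl_datahub-0.15.0.1rc2-py3-none-any.whl"  # assumed local file name


def read_record(wheel_path: str) -> dict:
    """Return {path: (sha256, size)} parsed from the wheel's RECORD file."""
    with zipfile.ZipFile(wheel_path) as zf:
        record_name = next(n for n in zf.namelist() if n.endswith(".dist-info/RECORD"))
        text = zf.read(record_name).decode("utf-8")
    return {row[0]: tuple(row[1:]) for row in csv.reader(io.StringIO(text)) if row}


old, new = read_record(OLD_WHEEL), read_record(NEW_WHEEL)
for path in sorted(old.keys() | new.keys()):
    if old.get(path) != new.get(path):
        print(f"{path}: {old.get(path)} -> {new.get(path)}")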

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=gK5aLEGMHMZfg-QUDI5T7mr1ej_5OFVKhCrqqoj_QGk,576
+datahub/__init__.py,sha256=gbsVKK_ULsM259cMG08Rrx6A9_72Iy7zxyDkQZ37NCw,576
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
 datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
 datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
 datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
-datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
+datahub/emitter/rest_emitter.py,sha256=d5Zjo3GXDu9rUqlSsK9aOx-yEbHjDFZHelfq_ZFeb5M,16393
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -144,6 +144,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
 datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
 datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
+datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
 datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
 datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -431,19 +432,19 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
 datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
-datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=suMICPFPvoV6shkjD_14JunLc8jAZBINzlFk2mYldkU,23676
+datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=uMGmMEl4hWEmN7GxMyDBdwlIPAW7WmOnu41kZ0dvCG4,21551
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
-datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
+datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=K-KEr3OpwMHye08lXAy-5doUUGoGJP3b-ntJAGU_NBY,42472
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
 datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
 datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
 datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
-datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=TlS5d1lpEN74ZP0c8UzUhJZIeBMO3ZIUxRler1p7lnA,31998
+datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=ecaTCJNAQ_IJOPInPGXA3jv1dE5lztSU82UhpBygiq0,31654
 datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
 datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
@@ -505,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=2-pYQ-3B9UVUwO1yB9iTdi3DqgqZ2JrpQ
 datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
 datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
 datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
-datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
+datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
 datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
@@ -516,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
 datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
 datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source_report/ingestion_stage.py,sha256=zijPKvJKnB298cMoG6j_w6QRva9t1RWqnmycmbsKdgs,1499
+datahub/ingestion/source_report/ingestion_stage.py,sha256=pJcJeLSjaixlLqQyQtE3bfUcvXEVwrSaWWtU4iU9UEo,1557
 datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
 datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
 datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -980,8 +981,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.1rc1.dist-info/METADATA,sha256=Ll6D3fw03bz2dVmCp9Mcez5IbFKIfXzUnMeSe6Ej4eM,173642
-acryl_datahub-0.15.0.1rc1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-acryl_datahub-0.15.0.1rc1.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
-acryl_datahub-0.15.0.1rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-0.15.0.1rc1.dist-info/RECORD,,
+acryl_datahub-0.15.0.1rc2.dist-info/METADATA,sha256=gLix1LBWIrfQF-dcU1JsLeAAFkHuALY9dHVboVmjIJg,173642
+acryl_datahub-0.15.0.1rc2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0.1rc2.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc2.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "0.15.0.1rc1"
+__version__ = "0.15.0.1rc2"
 
 
 def is_dev_mode() -> bool:
datahub/emitter/rest_emitter.py CHANGED
@@ -291,6 +291,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        logger.debug("Attempting to emit batch mcps")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +304,22 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            logger.debug(
+                f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+            )
 
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
+                logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        logger.debug(
+            f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
+        )
 
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +346,15 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:
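The hunks above add debug logging around the emitter's existing batching logic, which serializes each MCP and greedily packs the results into chunks bounded by a maximum payload size and a maximum batch length. A standalone sketch of that greedy strategy, with illustrative constants and toy payloads rather than the emitter's real values:

# Illustrative sketch of size-bounded greedy chunking, similar in spirit to the
# batching in DataHubRestEmitter; the limits below are assumptions, not the real ones.
import json
from typing import Any, Dict, List

MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # assumed payload limit
MAX_BATCH_LENGTH = 200  # assumed max objects per chunk


def chunk_objects(objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = [[]]
    current_size = 0
    for obj in objs:
        obj_size = len(json.dumps(obj))
        # Open a new chunk when adding this object would break either limit.
        if current_size + obj_size > MAX_PAYLOAD_BYTES or len(chunks[-1]) >= MAX_BATCH_LENGTH:
            chunks.append([])
            current_size = 0
        chunks[-1].append(obj)
        current_size += obj_size
    return chunks


if __name__ == "__main__":
    toy = [{"aspectName": "status", "value": "x" * 100} for _ in range(500)]
    print([len(c) for c in chunk_objects(toy)])  # e.g. [200, 200, 100]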
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py ADDED
@@ -0,0 +1,96 @@
+import json
+import logging
+from typing import Iterable, List
+
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+
+        schema.fields = accepted_fields
+
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception
+        on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu
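The new EnsureAspectSizeProcessor above trims schemaMetadata and datasetProfile aspects that would otherwise exceed the payload limit; the Unity Catalog hunk further down registers it as a work-unit processor. A small illustrative exercise of the profile-truncation path, assuming this release of acryl-datahub is installed (the URN and field values are made up):

# Illustrative only: feed an oversized sample value through the truncation logic.
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
    EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.source import SourceReport
from datahub.metadata.schema_classes import (
    DatasetFieldProfileClass,
    DatasetProfileClass,
)

profile = DatasetProfileClass(
    timestampMillis=0,
    fieldProfiles=[
        DatasetFieldProfileClass(fieldPath="notes", sampleValues=["x" * 1000])
    ],
)

report = SourceReport()
# Use a tiny payload constraint so the single sample value already exceeds it.
processor = EnsureAspectSizeProcessor(report, payload_constraint=100)
processor.ensure_dataset_profile_size(
    "urn:li:dataset:(urn:li:dataPlatform:demo,db.table,PROD)", profile
)

# The oversized sample values were dropped and a structured warning was recorded.
assert profile.fieldProfiles[0].sampleValues == []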
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py CHANGED
@@ -265,64 +265,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         with PerfTimer() as timer:
             self.report.num_external_table_edges_scanned = 0
 
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_copy_history(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-            logger.info(
-                "Done populating external lineage from copy history. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
-
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_show_query(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-
-            logger.info(
-                "Done populating external lineage from show external tables. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
+            for entry in self._get_copy_history_lineage(discovered_tables):
+                self.sql_aggregator.add(entry)
+            logger.info("Done populating external lineage from copy history. ")
 
             self.report.external_lineage_queries_secs = timer.elapsed_seconds()
 
-    # Handles the case for explicitly created external tables.
-    # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_show_query(
-        self, discovered_tables: List[str]
-    ) -> Iterable[KnownLineageMapping]:
-        external_tables_query: str = SnowflakeQuery.show_external_tables()
-        try:
-            for db_row in self.connection.query(external_tables_query):
-                key = self.identifiers.get_dataset_identifier(
-                    db_row["name"], db_row["schema_name"], db_row["database_name"]
-                )
-
-                if key not in discovered_tables:
-                    continue
-                if db_row["location"].startswith("s3://"):
-                    yield KnownLineageMapping(
-                        upstream_urn=make_s3_urn_for_lineage(
-                            db_row["location"], self.config.env
-                        ),
-                        downstream_urn=self.identifiers.gen_dataset_urn(key),
-                    )
-                    self.report.num_external_table_edges_scanned += 1
-
-                self.report.num_external_table_edges_scanned += 1
-        except Exception as e:
-            logger.debug(e, exc_info=e)
-            self.structured_reporter.warning(
-                "Error populating external table lineage from Snowflake",
-                exc=e,
-            )
-            self.report_status(EXTERNAL_LINEAGE, False)
-
     # Handles the case where a table is populated from an external stage/s3 location via copy.
     # Eg: copy into category_english from @external_s3_stage;
     # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
     # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_copy_history(
+    def _get_copy_history_lineage(
         self, discovered_tables: List[str]
     ) -> Iterable[KnownLineageMapping]:
         query: str = SnowflakeQuery.copy_lineage_history(
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -247,9 +247,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
            for entry in self.fetch_copy_history():
                queries.append(entry)
 
-        # TODO: Add "show external tables" lineage to the main schema extractor.
-        # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
-
        with self.report.query_log_fetch_timer:
            for entry in self.fetch_query_log():
                queries.append(entry)
datahub/ingestion/source/snowflake/snowflake_schema_gen.py CHANGED
@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
     classification_workunit_processor,
 )
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
 )
 from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
 from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
+from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
 from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.snowflake.snowflake_schema import (
     SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     get_domain_wu,
 )
 from datahub.ingestion.source_report.ingestion_stage import (
+    EXTERNAL_TABLE_DDL_LINEAGE,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
-from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownLineageMapping,
+    SqlParsingAggregator,
+)
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
-        self.aggregator: Optional[SqlParsingAggregator] = aggregator
+
+        self.aggregator = aggregator
 
     def get_connection(self) -> SnowflakeConnection:
         return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
                 yield from self._process_database(snowflake_db)
 
+            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
+            discovered_tables: List[str] = [
+                self.identifiers.get_dataset_identifier(
+                    table_name, schema.name, db.name
+                )
+                for db in self.databases
+                for schema in db.schemas
+                for table_name in schema.tables
+            ]
+            if self.aggregator:
+                for entry in self._external_tables_ddl_lineage(discovered_tables):
+                    self.aggregator.add(entry)
+
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
@@ -1082,3 +1102,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         # Access to table but none of its constraints - is this possible ?
         return constraints.get(table_name, [])
+
+    # Handles the case for explicitly created external tables.
+    # NOTE: Snowflake does not log this information to the access_history table.
+    def _external_tables_ddl_lineage(
+        self, discovered_tables: List[str]
+    ) -> Iterable[KnownLineageMapping]:
+        external_tables_query: str = SnowflakeQuery.show_external_tables()
+        try:
+            for db_row in self.connection.query(external_tables_query):
+                key = self.identifiers.get_dataset_identifier(
+                    db_row["name"], db_row["schema_name"], db_row["database_name"]
+                )
+
+                if key not in discovered_tables:
+                    continue
+                if db_row["location"].startswith("s3://"):
+                    yield KnownLineageMapping(
+                        upstream_urn=make_s3_urn_for_lineage(
+                            db_row["location"], self.config.env
+                        ),
+                        downstream_urn=self.identifiers.gen_dataset_urn(key),
+                    )
+                    self.report.num_external_table_edges_scanned += 1
+
+                self.report.num_external_table_edges_scanned += 1
+        except Exception as e:
+            self.structured_reporter.warning(
+                "External table ddl lineage extraction failed",
+                exc=e,
+            )
datahub/ingestion/source/snowflake/snowflake_v2.py CHANGED
@@ -161,35 +161,32 @@ class SnowflakeV2Source(
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
-        self.aggregator: Optional[SqlParsingAggregator] = None
-
-        if self.config.use_queries_v2 or self.config.include_table_lineage:
-            self.aggregator = self._exit_stack.enter_context(
-                SqlParsingAggregator(
-                    platform=self.identifiers.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                    graph=self.ctx.graph,
-                    eager_graph_load=(
-                        # If we're ingestion schema metadata for tables/views, then we will populate
-                        # schemas into the resolver as we go. We only need to do a bulk fetch
-                        # if we're not ingesting schema metadata as part of ingestion.
-                        not (
-                            self.config.include_technical_schema
-                            and self.config.include_tables
-                            and self.config.include_views
-                        )
-                        and not self.config.lazy_schema_resolver
-                    ),
-                    generate_usage_statistics=False,
-                    generate_operations=False,
-                    format_queries=self.config.format_sql_queries,
-                )
+
+        self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
+            SqlParsingAggregator(
+                platform=self.identifiers.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                graph=self.ctx.graph,
+                eager_graph_load=(
+                    # If we're ingestion schema metadata for tables/views, then we will populate
+                    # schemas into the resolver as we go. We only need to do a bulk fetch
+                    # if we're not ingesting schema metadata as part of ingestion.
+                    not (
+                        self.config.include_technical_schema
+                        and self.config.include_tables
+                        and self.config.include_views
+                    )
+                    and not self.config.lazy_schema_resolver
+                ),
+                generate_usage_statistics=False,
+                generate_operations=False,
+                format_queries=self.config.format_sql_queries,
             )
-            self.report.sql_aggregator = self.aggregator.report
+        )
+        self.report.sql_aggregator = self.aggregator.report
 
         if self.config.include_table_lineage:
-            assert self.aggregator is not None
             redundant_lineage_run_skip_handler: Optional[
                 RedundantLineageRunSkipHandler
             ] = None
@@ -487,8 +484,6 @@
 
         databases = schema_extractor.databases
 
-        # TODO: The checkpoint state for stale entity detection can be committed here.
-
         if self.config.shares:
             yield from SnowflakeSharesHandler(
                 self.config, self.report
datahub/ingestion/source/unity/source.py CHANGED
@@ -26,6 +26,9 @@ from datahub.emitter.mcp_builder import (
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -260,6 +263,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source_report/ingestion_stage.py CHANGED
@@ -14,6 +14,7 @@ LINEAGE_EXTRACTION = "Lineage Extraction"
 USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion"
 USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats"
 USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation"
+EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage"
 QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"