acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
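The first block below is the diff of the wheel's RECORD file, which lists every file in the package together with its sha256 digest and size; the per-file source diffs follow. As a side note, this comparison can be reproduced locally with a short script. A minimal sketch, assuming both wheels have already been downloaded (the file names below are illustrative):

# Minimal sketch: compare the RECORD files of two locally downloaded wheels.
import csv
import io
import zipfile

OLD_WHEEL = "acryl_datahub-0.15.0.1rc1-py3-none-any.whl"  # assumed local file name
NEW_WHEEL = "acryl_datahub-0.15.0.1rc2-py3-none-any.whl"  # assumed local file name


def read_record(wheel_path: str) -> dict:
    """Return {path: (sha256, size)} parsed from the wheel's RECORD file."""
    with zipfile.ZipFile(wheel_path) as zf:
        record_name = next(n for n in zf.namelist() if n.endswith(".dist-info/RECORD"))
        text = zf.read(record_name).decode("utf-8")
    return {row[0]: tuple(row[1:]) for row in csv.reader(io.StringIO(text)) if row}


old, new = read_record(OLD_WHEEL), read_record(NEW_WHEEL)
for path in sorted(old.keys() | new.keys()):
    if old.get(path) != new.get(path):
        print(f"{path}: {old.get(path)} -> {new.get(path)}")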

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=gK5aLEGMHMZfg-QUDI5T7mr1ej_5OFVKhCrqqoj_QGk,576
+datahub/__init__.py,sha256=gbsVKK_ULsM259cMG08Rrx6A9_72Iy7zxyDkQZ37NCw,576
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
 datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
 datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
 datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
-datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
+datahub/emitter/rest_emitter.py,sha256=d5Zjo3GXDu9rUqlSsK9aOx-yEbHjDFZHelfq_ZFeb5M,16393
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -144,6 +144,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
 datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
 datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
+datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
 datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
 datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -431,19 +432,19 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
 datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
-datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=suMICPFPvoV6shkjD_14JunLc8jAZBINzlFk2mYldkU,23676
+datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=uMGmMEl4hWEmN7GxMyDBdwlIPAW7WmOnu41kZ0dvCG4,21551
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
-datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
+datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=K-KEr3OpwMHye08lXAy-5doUUGoGJP3b-ntJAGU_NBY,42472
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
 datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
 datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
 datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
-datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=TlS5d1lpEN74ZP0c8UzUhJZIeBMO3ZIUxRler1p7lnA,31998
+datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=ecaTCJNAQ_IJOPInPGXA3jv1dE5lztSU82UhpBygiq0,31654
 datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
 datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
@@ -505,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=2-pYQ-3B9UVUwO1yB9iTdi3DqgqZ2JrpQ
 datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
 datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
 datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
-datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
+datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
 datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
@@ -516,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
 datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
 datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source_report/ingestion_stage.py,sha256=zijPKvJKnB298cMoG6j_w6QRva9t1RWqnmycmbsKdgs,1499
+datahub/ingestion/source_report/ingestion_stage.py,sha256=pJcJeLSjaixlLqQyQtE3bfUcvXEVwrSaWWtU4iU9UEo,1557
 datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
 datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
 datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -980,8 +981,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.1rc1.dist-info/METADATA,sha256=Ll6D3fw03bz2dVmCp9Mcez5IbFKIfXzUnMeSe6Ej4eM,173642
-acryl_datahub-0.15.0.1rc1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-acryl_datahub-0.15.0.1rc1.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
-acryl_datahub-0.15.0.1rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-0.15.0.1rc1.dist-info/RECORD,,
+acryl_datahub-0.15.0.1rc2.dist-info/METADATA,sha256=gLix1LBWIrfQF-dcU1JsLeAAFkHuALY9dHVboVmjIJg,173642
+acryl_datahub-0.15.0.1rc2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0.1rc2.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc2.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "0.15.0.1rc1"
+__version__ = "0.15.0.1rc2"
 
 
 def is_dev_mode() -> bool:
datahub/emitter/rest_emitter.py CHANGED
@@ -291,6 +291,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        logger.debug("Attempting to emit batch mcps")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +304,22 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            logger.debug(
+                f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+            )
 
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
+                logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        logger.debug(
+            f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
+        )
 
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +346,15 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:
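The hunks above add debug logging around the emitter's existing batching logic, which serializes each MCP and greedily packs the results into chunks bounded by a maximum payload size and a maximum batch length. A standalone sketch of that greedy strategy, with illustrative constants and toy payloads rather than the emitter's real values:

# Illustrative sketch of size-bounded greedy chunking, similar in spirit to the
# batching in DataHubRestEmitter; the limits below are assumptions, not the real ones.
import json
from typing import Any, Dict, List

MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # assumed payload limit
MAX_BATCH_LENGTH = 200  # assumed max objects per chunk


def chunk_objects(objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = [[]]
    current_size = 0
    for obj in objs:
        obj_size = len(json.dumps(obj))
        # Open a new chunk when adding this object would break either limit.
        if current_size + obj_size > MAX_PAYLOAD_BYTES or len(chunks[-1]) >= MAX_BATCH_LENGTH:
            chunks.append([])
            current_size = 0
        chunks[-1].append(obj)
        current_size += obj_size
    return chunks


if __name__ == "__main__":
    toy = [{"aspectName": "status", "value": "x" * 100} for _ in range(500)]
    print([len(c) for c in chunk_objects(toy)])  # e.g. [200, 200, 100]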
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py ADDED
@@ -0,0 +1,96 @@
+import json
+import logging
+from typing import Iterable, List
+
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+
+        schema.fields = accepted_fields
+
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception
+        on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu
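The new EnsureAspectSizeProcessor above trims schemaMetadata and datasetProfile aspects that would otherwise exceed the payload limit; the Unity Catalog hunk further down registers it as a work-unit processor. A small illustrative exercise of the profile-truncation path, assuming this release of acryl-datahub is installed (the URN and field values are made up):

# Illustrative only: feed an oversized sample value through the truncation logic.
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
    EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.source import SourceReport
from datahub.metadata.schema_classes import (
    DatasetFieldProfileClass,
    DatasetProfileClass,
)

profile = DatasetProfileClass(
    timestampMillis=0,
    fieldProfiles=[
        DatasetFieldProfileClass(fieldPath="notes", sampleValues=["x" * 1000])
    ],
)

report = SourceReport()
# Use a tiny payload constraint so the single sample value already exceeds it.
processor = EnsureAspectSizeProcessor(report, payload_constraint=100)
processor.ensure_dataset_profile_size(
    "urn:li:dataset:(urn:li:dataPlatform:demo,db.table,PROD)", profile
)

# The oversized sample values were dropped and a structured warning was recorded.
assert profile.fieldProfiles[0].sampleValues == []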
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py CHANGED
@@ -265,64 +265,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         with PerfTimer() as timer:
             self.report.num_external_table_edges_scanned = 0
 
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_copy_history(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-            logger.info(
-                "Done populating external lineage from copy history. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
-
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_show_query(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-
-            logger.info(
-                "Done populating external lineage from show external tables. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
+            for entry in self._get_copy_history_lineage(discovered_tables):
+                self.sql_aggregator.add(entry)
+            logger.info("Done populating external lineage from copy history. ")
 
             self.report.external_lineage_queries_secs = timer.elapsed_seconds()
 
-    # Handles the case for explicitly created external tables.
-    # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_show_query(
-        self, discovered_tables: List[str]
-    ) -> Iterable[KnownLineageMapping]:
-        external_tables_query: str = SnowflakeQuery.show_external_tables()
-        try:
-            for db_row in self.connection.query(external_tables_query):
-                key = self.identifiers.get_dataset_identifier(
-                    db_row["name"], db_row["schema_name"], db_row["database_name"]
-                )
-
-                if key not in discovered_tables:
-                    continue
-                if db_row["location"].startswith("s3://"):
-                    yield KnownLineageMapping(
-                        upstream_urn=make_s3_urn_for_lineage(
-                            db_row["location"], self.config.env
-                        ),
-                        downstream_urn=self.identifiers.gen_dataset_urn(key),
-                    )
-                    self.report.num_external_table_edges_scanned += 1
-
-                self.report.num_external_table_edges_scanned += 1
-        except Exception as e:
-            logger.debug(e, exc_info=e)
-            self.structured_reporter.warning(
-                "Error populating external table lineage from Snowflake",
-                exc=e,
-            )
-            self.report_status(EXTERNAL_LINEAGE, False)
-
     # Handles the case where a table is populated from an external stage/s3 location via copy.
     # Eg: copy into category_english from @external_s3_stage;
     # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
     # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_copy_history(
+    def _get_copy_history_lineage(
         self, discovered_tables: List[str]
     ) -> Iterable[KnownLineageMapping]:
         query: str = SnowflakeQuery.copy_lineage_history(
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -247,9 +247,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
            for entry in self.fetch_copy_history():
                queries.append(entry)
 
-        # TODO: Add "show external tables" lineage to the main schema extractor.
-        # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
-
        with self.report.query_log_fetch_timer:
            for entry in self.fetch_query_log():
                queries.append(entry)
datahub/ingestion/source/snowflake/snowflake_schema_gen.py CHANGED
@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
     classification_workunit_processor,
 )
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
 )
 from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
 from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
+from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
 from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.snowflake.snowflake_schema import (
     SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     get_domain_wu,
 )
 from datahub.ingestion.source_report.ingestion_stage import (
+    EXTERNAL_TABLE_DDL_LINEAGE,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
-from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownLineageMapping,
+    SqlParsingAggregator,
+)
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
-        self.aggregator: Optional[SqlParsingAggregator] = aggregator
+
+        self.aggregator = aggregator
 
     def get_connection(self) -> SnowflakeConnection:
         return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
                 yield from self._process_database(snowflake_db)
 
+            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
+            discovered_tables: List[str] = [
+                self.identifiers.get_dataset_identifier(
+                    table_name, schema.name, db.name
+                )
+                for db in self.databases
+                for schema in db.schemas
+                for table_name in schema.tables
+            ]
+            if self.aggregator:
+                for entry in self._external_tables_ddl_lineage(discovered_tables):
+                    self.aggregator.add(entry)
+
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
@@ -1082,3 +1102,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         # Access to table but none of its constraints - is this possible ?
         return constraints.get(table_name, [])
+
+    # Handles the case for explicitly created external tables.
+    # NOTE: Snowflake does not log this information to the access_history table.
+    def _external_tables_ddl_lineage(
+        self, discovered_tables: List[str]
+    ) -> Iterable[KnownLineageMapping]:
+        external_tables_query: str = SnowflakeQuery.show_external_tables()
+        try:
+            for db_row in self.connection.query(external_tables_query):
+                key = self.identifiers.get_dataset_identifier(
+                    db_row["name"], db_row["schema_name"], db_row["database_name"]
+                )
+
+                if key not in discovered_tables:
+                    continue
+                if db_row["location"].startswith("s3://"):
+                    yield KnownLineageMapping(
+                        upstream_urn=make_s3_urn_for_lineage(
+                            db_row["location"], self.config.env
+                        ),
+                        downstream_urn=self.identifiers.gen_dataset_urn(key),
+                    )
+                    self.report.num_external_table_edges_scanned += 1
+
+                self.report.num_external_table_edges_scanned += 1
+        except Exception as e:
+            self.structured_reporter.warning(
+                "External table ddl lineage extraction failed",
+                exc=e,
+            )
datahub/ingestion/source/snowflake/snowflake_v2.py CHANGED
@@ -161,35 +161,32 @@ class SnowflakeV2Source(
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
-        self.aggregator: Optional[SqlParsingAggregator] = None
-
-        if self.config.use_queries_v2 or self.config.include_table_lineage:
-            self.aggregator = self._exit_stack.enter_context(
-                SqlParsingAggregator(
-                    platform=self.identifiers.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                    graph=self.ctx.graph,
-                    eager_graph_load=(
-                        # If we're ingestion schema metadata for tables/views, then we will populate
-                        # schemas into the resolver as we go. We only need to do a bulk fetch
-                        # if we're not ingesting schema metadata as part of ingestion.
-                        not (
-                            self.config.include_technical_schema
-                            and self.config.include_tables
-                            and self.config.include_views
-                        )
-                        and not self.config.lazy_schema_resolver
-                    ),
-                    generate_usage_statistics=False,
-                    generate_operations=False,
-                    format_queries=self.config.format_sql_queries,
-                )
+
+        self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
+            SqlParsingAggregator(
+                platform=self.identifiers.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                graph=self.ctx.graph,
+                eager_graph_load=(
+                    # If we're ingestion schema metadata for tables/views, then we will populate
+                    # schemas into the resolver as we go. We only need to do a bulk fetch
+                    # if we're not ingesting schema metadata as part of ingestion.
+                    not (
+                        self.config.include_technical_schema
+                        and self.config.include_tables
+                        and self.config.include_views
+                    )
+                    and not self.config.lazy_schema_resolver
+                ),
+                generate_usage_statistics=False,
+                generate_operations=False,
+                format_queries=self.config.format_sql_queries,
             )
-            self.report.sql_aggregator = self.aggregator.report
+        )
+        self.report.sql_aggregator = self.aggregator.report
 
         if self.config.include_table_lineage:
-            assert self.aggregator is not None
             redundant_lineage_run_skip_handler: Optional[
                 RedundantLineageRunSkipHandler
             ] = None
@@ -487,8 +484,6 @@
 
         databases = schema_extractor.databases
 
-        # TODO: The checkpoint state for stale entity detection can be committed here.
-
         if self.config.shares:
             yield from SnowflakeSharesHandler(
                 self.config, self.report
datahub/ingestion/source/unity/source.py CHANGED
@@ -26,6 +26,9 @@ from datahub.emitter.mcp_builder import (
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -260,6 +263,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source_report/ingestion_stage.py CHANGED
@@ -14,6 +14,7 @@ LINEAGE_EXTRACTION = "Lineage Extraction"
 USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion"
 USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats"
 USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation"
+EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage"
 QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"