acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (120)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py

@@ -84,13 +84,14 @@ class DataResolverBase(ABC):
         tenant_id: str,
         metadata_api_timeout: int,
     ):
-        self.__access_token: Optional[str] = None
-        self.__access_token_expiry_time: Optional[datetime] = None
-        self.__tenant_id = tenant_id
+        self._access_token: Optional[str] = None
+        self._access_token_expiry_time: Optional[datetime] = None
+
+        self._tenant_id = tenant_id
         # Test connection by generating access token
         logger.info(f"Trying to connect to {self._get_authority_url()}")
         # Power-Bi Auth (Service Principal Auth)
-        self.__msal_client = msal.ConfidentialClientApplication(
+        self._msal_client = msal.ConfidentialClientApplication(
             client_id,
             client_credential=client_secret,
             authority=DataResolverBase.AUTHORITY + tenant_id,
@@ -168,18 +169,18 @@ class DataResolverBase(ABC):
         pass

     def _get_authority_url(self):
-        return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}"
+        return f"{DataResolverBase.AUTHORITY}{self._tenant_id}"

     def get_authorization_header(self):
         return {Constant.Authorization: self.get_access_token()}

-    def get_access_token(self):
-        if self.__access_token is not None and not self._is_access_token_expired():
-            return self.__access_token
+    def get_access_token(self) -> str:
+        if self._access_token is not None and not self._is_access_token_expired():
+            return self._access_token

         logger.info("Generating PowerBi access token")

-        auth_response = self.__msal_client.acquire_token_for_client(
+        auth_response = self._msal_client.acquire_token_for_client(
             scopes=[DataResolverBase.SCOPE]
         )

@@ -193,24 +194,24 @@ class DataResolverBase(ABC):

         logger.info("Generated PowerBi access token")

-        self.__access_token = "Bearer {}".format(
+        self._access_token = "Bearer {}".format(
             auth_response.get(Constant.ACCESS_TOKEN)
         )
         safety_gap = 300
-        self.__access_token_expiry_time = datetime.now() + timedelta(
+        self._access_token_expiry_time = datetime.now() + timedelta(
             seconds=(
                 max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0)
             )
         )

-        logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}")
+        logger.debug(f"{Constant.PBIAccessToken}={self._access_token}")

-        return self.__access_token
+        return self._access_token

     def _is_access_token_expired(self) -> bool:
-        if not self.__access_token_expiry_time:
+        if not self._access_token_expiry_time:
             return True
-        return self.__access_token_expiry_time < datetime.now()
+        return self._access_token_expiry_time < datetime.now()

     def get_dashboards(self, workspace: Workspace) -> List[Dashboard]:
         """
datahub/ingestion/source/s3/source.py

@@ -225,7 +225,7 @@ class S3Source(StatefulIngestionSourceBase):
         self.init_spark()

     def init_spark(self):
-        os.environ.setdefault("SPARK_VERSION", "3.3")
+        os.environ.setdefault("SPARK_VERSION", "3.5")
         spark_version = os.environ["SPARK_VERSION"]

         # Importing here to avoid Deequ dependency for non profiling use cases
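Only the default changes here; os.environ.setdefault never overwrites a value the user has already exported. A small sketch of that behavior (values are illustrative):

    import os

    os.environ["SPARK_VERSION"] = "3.3"            # explicit override set before ingestion
    os.environ.setdefault("SPARK_VERSION", "3.5")  # the new default does not clobber it
    print(os.environ["SPARK_VERSION"])             # "3.3"

    del os.environ["SPARK_VERSION"]
    os.environ.setdefault("SPARK_VERSION", "3.5")  # nothing set, so the default applies
    print(os.environ["SPARK_VERSION"])             # "3.5"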
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig(
         description="Whether to convert dataset urns to lowercase.",
     )

-
-class SnowflakeUsageConfig(BaseUsageConfig):
     email_domain: Optional[str] = pydantic.Field(
         default=None,
         description="Email domain of your organization so users can be displayed on UI appropriately.",
     )
+
+    email_as_user_identifier: bool = Field(
+        default=True,
+        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
+        "provided, generates email addresses for snowflake users with unset emails, based on their "
+        "username.",
+    )
+
+
+class SnowflakeUsageConfig(BaseUsageConfig):
     apply_view_usage_to_tables: bool = pydantic.Field(
         default=False,
         description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.",
@@ -163,26 +171,13 @@ class SnowflakeConfig(
         default=True,
         description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.",
     )
-    include_view_lineage: bool = pydantic.Field(
-        default=True,
-        description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.",
-    )
+
+    _include_view_lineage = pydantic_removed_field("include_view_lineage")
+    _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")

     ignore_start_time_lineage: bool = False
     upstream_lineage_in_report: bool = False

-    @pydantic.root_validator(skip_on_failure=True)
-    def validate_include_view_lineage(cls, values):
-        if (
-            "include_table_lineage" in values
-            and not values.get("include_table_lineage")
-            and values.get("include_view_lineage")
-        ):
-            raise ValueError(
-                "include_table_lineage must be True for include_view_lineage to be set."
-            )
-        return values
-

 class SnowflakeV2Config(
     SnowflakeConfig,
@@ -222,11 +217,6 @@ class SnowflakeV2Config(
         description="Populates table->table and view->table column lineage. Requires appropriate grants given to the role and the Snowflake Enterprise Edition or above.",
     )

-    include_view_column_lineage: bool = Field(
-        default=True,
-        description="Populates view->view and table->view column lineage using DataHub's sql parser.",
-    )
-
     use_queries_v2: bool = Field(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
@@ -285,13 +275,6 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

-    email_as_user_identifier: bool = Field(
-        default=True,
-        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
-        "provided, generates email addresses for snowflake users with unset emails, based on their "
-        "username.",
-    )
-
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"
@@ -355,10 +338,6 @@ class SnowflakeV2Config(
             self, database=database, username=username, password=password, role=role
         )

-    @property
-    def parse_view_ddl(self) -> bool:
-        return self.include_view_column_lineage
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
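The snowflake_config.py hunks retire include_view_lineage and include_view_column_lineage as configuration flags and move email_domain plus the new email_as_user_identifier flag into SnowflakeIdentifierConfig. The sketch below is not DataHub's pydantic_removed_field helper; it is a generic pydantic v1-style validator illustrating the usual pattern for tolerating, and warning about, a removed key in existing recipes (hypothetical class name):

    import warnings

    import pydantic


    class SnowflakeLikeConfig(pydantic.BaseModel):
        include_table_lineage: bool = True

        @pydantic.root_validator(pre=True)
        def _drop_removed_fields(cls, values: dict) -> dict:
            # Stand-in for a "removed field" helper: accept the old keys so
            # existing recipes still parse, but ignore them with a warning.
            for removed in ("include_view_lineage", "include_view_column_lineage"):
                if removed in values:
                    warnings.warn(f"{removed} is no longer used and will be ignored")
                    values.pop(removed)
            return values


    cfg = SnowflakeLikeConfig.parse_obj({"include_view_lineage": True})
    print(cfg.include_table_lineage)  # True; the removed flag was dropped, not rejected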
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -4,11 +4,10 @@ from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type

-from pydantic import BaseModel, validator
+from pydantic import BaseModel, Field, validator

 from datahub.configuration.datetimes import parse_absolute_time
 from datahub.ingestion.api.closeable import Closeable
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.snowflake.constants import (
     LINEAGE_PERMISSION_ERROR,
@@ -41,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime

@@ -72,8 +72,8 @@ class ColumnUpstreamJob(BaseModel):


 class ColumnUpstreamLineage(BaseModel):
-    column_name: str
-    upstreams: List[ColumnUpstreamJob]
+    column_name: Optional[str]
+    upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)


 class UpstreamTableNode(BaseModel):
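ColumnUpstreamLineage is loosened so that responses missing column_name or upstreams still parse instead of failing validation. A standalone sketch of the same pydantic pattern (simplified field types, not the real model):

    from typing import List, Optional

    from pydantic import BaseModel, Field


    class ColumnUpstreamLineageSketch(BaseModel):
        # The real model leans on pydantic v1's implicit None default for an
        # un-defaulted Optional field; spelling it out keeps this sketch portable.
        column_name: Optional[str] = None
        # default_factory=list turns a missing "upstreams" key into a fresh,
        # per-instance empty list instead of a validation error.
        upstreams: List[str] = Field(default_factory=list)


    row = ColumnUpstreamLineageSketch.parse_obj({})
    print(row.column_name, row.upstreams)  # None []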
@@ -163,11 +163,11 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             self.config.end_time,
         )

-    def get_workunits(
+    def add_time_based_lineage_to_aggregator(
         self,
         discovered_tables: List[str],
         discovered_views: List[str],
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> None:
         if not self._should_ingest_lineage():
             return

@@ -177,9 +177,7 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         # snowflake view/table -> snowflake table
         self.populate_table_upstreams(discovered_tables)

-        for mcp in self.sql_aggregator.gen_metadata():
-            yield mcp.as_workunit()
-
+    def update_state(self):
         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.
             self.redundant_run_skip_handler.update_state(
@@ -242,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)

         known_lineage = KnownQueryLineageInfo(
+            query_id=get_query_fingerprint(
+                query.query_text, self.identifiers.platform, fast=True
+            ),
             query_text=query.query_text,
             downstream=downstream_table_urn,
             upstreams=self.map_query_result_upstreams(
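KnownQueryLineageInfo entries now carry a query_id computed with get_query_fingerprint(..., fast=True), so the same query text maps to the same lineage record across runs. A rough usage sketch; the positional platform argument and fast keyword mirror the call in this diff, but treat the exact signature as an assumption:

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    query = "SELECT col_a, col_b FROM db.schema.orders WHERE col_a > 10"

    fp1 = get_query_fingerprint(query, "snowflake", fast=True)
    fp2 = get_query_fingerprint(query, "snowflake", fast=True)

    # The fingerprint is deterministic, so re-ingesting the same query text
    # yields the same id instead of a fresh aggregator-generated one.
    print(fp1 == fp2)  # True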
@@ -265,64 +266,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         with PerfTimer() as timer:
             self.report.num_external_table_edges_scanned = 0

-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_copy_history(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-            logger.info(
-                "Done populating external lineage from copy history. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
-
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_show_query(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-
-            logger.info(
-                "Done populating external lineage from show external tables. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
+            for entry in self._get_copy_history_lineage(discovered_tables):
+                self.sql_aggregator.add(entry)
+            logger.info("Done populating external lineage from copy history. ")

         self.report.external_lineage_queries_secs = timer.elapsed_seconds()

-    # Handles the case for explicitly created external tables.
-    # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_show_query(
-        self, discovered_tables: List[str]
-    ) -> Iterable[KnownLineageMapping]:
-        external_tables_query: str = SnowflakeQuery.show_external_tables()
-        try:
-            for db_row in self.connection.query(external_tables_query):
-                key = self.identifiers.get_dataset_identifier(
-                    db_row["name"], db_row["schema_name"], db_row["database_name"]
-                )
-
-                if key not in discovered_tables:
-                    continue
-                if db_row["location"].startswith("s3://"):
-                    yield KnownLineageMapping(
-                        upstream_urn=make_s3_urn_for_lineage(
-                            db_row["location"], self.config.env
-                        ),
-                        downstream_urn=self.identifiers.gen_dataset_urn(key),
-                    )
-                    self.report.num_external_table_edges_scanned += 1
-
-                self.report.num_external_table_edges_scanned += 1
-        except Exception as e:
-            logger.debug(e, exc_info=e)
-            self.structured_reporter.warning(
-                "Error populating external table lineage from Snowflake",
-                exc=e,
-            )
-            self.report_status(EXTERNAL_LINEAGE, False)
-
     # Handles the case where a table is populated from an external stage/s3 location via copy.
     # Eg: copy into category_english from @external_s3_stage;
     # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
     # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_copy_history(
+    def _get_copy_history_lineage(
         self, discovered_tables: List[str]
     ) -> Iterable[KnownLineageMapping]:
         query: str = SnowflakeQuery.copy_lineage_history(
@@ -384,10 +338,6 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             start_time_millis=int(self.start_time.timestamp() * 1000),
             end_time_millis=int(self.end_time.timestamp() * 1000),
             upstreams_deny_pattern=self.config.temporary_tables_pattern,
-            # The self.config.include_view_lineage setting is about fetching upstreams of views.
-            # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False.
-            # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code.
-            include_view_lineage=True,
             include_column_lineage=self.config.include_column_lineage,
         )
         try:
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -61,11 +61,17 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
 from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

+# Define a type alias
+UserName = str
+UserEmail = str
+UsersMapping = Dict[UserName, UserEmail]
+

 class SnowflakeQueriesExtractorConfig(ConfigModel):
@@ -114,11 +120,13 @@ class SnowflakeQueriesSourceConfig(
 class SnowflakeQueriesExtractorReport(Report):
     copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
     query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

     audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
     sql_aggregator: Optional[SqlAggregatorReport] = None

     num_ddl_queries_dropped: int = 0
+    num_users: int = 0


 @dataclass
@@ -225,6 +233,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
+        with self.report.users_fetch_timer:
+            users = self.fetch_users()
+
         # TODO: Add some logic to check if the cached audit log is stale or not.
         audit_log_file = self.local_temp_path / "audit_log.sqlite"
         use_cached_audit_log = audit_log_file.exists()
@@ -247,11 +258,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 for entry in self.fetch_copy_history():
                     queries.append(entry)

-            # TODO: Add "show external tables" lineage to the main schema extractor.
-            # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
-
             with self.report.query_log_fetch_timer:
-                for entry in self.fetch_query_log():
+                for entry in self.fetch_query_log(users):
                     queries.append(entry)

         with self.report.audit_log_load_timer:
@@ -266,6 +274,25 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             shared_connection.close()
             audit_log_file.unlink(missing_ok=True)

+    def fetch_users(self) -> UsersMapping:
+        users: UsersMapping = dict()
+        with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
+            logger.info("Fetching users from Snowflake")
+            query = SnowflakeQuery.get_all_users()
+            resp = self.connection.query(query)
+
+            for row in resp:
+                try:
+                    users[row["NAME"]] = row["EMAIL"]
+                    self.report.num_users += 1
+                except Exception as e:
+                    self.structured_reporter.warning(
+                        "Error parsing user row",
+                        context=f"{row}",
+                        exc=e,
+                    )
+        return users
+
     def fetch_copy_history(self) -> Iterable[KnownLineageMapping]:
         # Derived from _populate_external_lineage_from_copy_history.

@@ -301,7 +328,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 yield result

     def fetch_query_log(
-        self,
+        self, users: UsersMapping
     ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
@@ -322,7 +349,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

             assert isinstance(row, dict)
             try:
-                entry = self._parse_audit_log_row(row)
+                entry = self._parse_audit_log_row(row, users)
             except Exception as e:
                 self.structured_reporter.warning(
                     "Error parsing query log row",
@@ -334,7 +361,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 yield entry

     def _parse_audit_log_row(
-        self, row: Dict[str, Any]
+        self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
@@ -433,9 +460,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )

-        # TODO: Fetch email addresses from Snowflake to map user -> email
-        # TODO: Support email_domain fallback for generating user urns.
-        user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"]))
+        user = CorpUserUrn(
+            self.identifiers.get_user_identifier(
+                res["user_name"], users.get(res["user_name"])
+            )
+        )

         timestamp: datetime = res["query_start_time"]
         timestamp = timestamp.astimezone(timezone.utc)
@@ -447,10 +476,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
-            # job at eliminating redundant / repetitive queries. As such, we don't include the fingerprint
-            # here so that the aggregator auto-generates one.
-            # query_id=res["query_fingerprint"],
-            query_id=None,
+            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
+            # here
+            query_id=get_query_fingerprint(
+                res["query_text"], self.identifiers.platform, fast=True
+            ),
             query_text=res["query_text"],
             upstreams=upstreams,
             downstream=downstream,
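The queries extractor now pre-fetches a UsersMapping (login name to email) and passes it down to _parse_audit_log_row, where identifiers.get_user_identifier resolves each row's user. A simplified, self-contained sketch of the lookup-with-fallback idea; the resolver below is illustrative, not the DataHub implementation:

    from typing import Dict, Optional

    UsersMapping = Dict[str, str]  # login name -> email, mirroring the new type alias


    def resolve_user(user_name: str, email: Optional[str]) -> str:
        # Illustrative fallback: prefer the email pulled from ACCOUNT_USAGE.USERS,
        # otherwise keep the lowercased login name.
        return (email or user_name).lower()


    users: UsersMapping = {"ALICE_W": "alice@example.com"}

    for row_user in ("ALICE_W", "SVC_LOADER"):
        print(f"urn:li:corpuser:{resolve_user(row_user, users.get(row_user))}")
    # urn:li:corpuser:alice@example.com
    # urn:li:corpuser:svc_loader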
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -376,7 +376,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     def table_to_table_lineage_history_v2(
         start_time_millis: int,
         end_time_millis: int,
-        include_view_lineage: bool = True,
         include_column_lineage: bool = True,
         upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS,
     ) -> str:
@@ -385,14 +384,12 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
                 start_time_millis,
                 end_time_millis,
                 upstreams_deny_pattern,
-                include_view_lineage,
             )
         else:
             return SnowflakeQuery.table_upstreams_only(
                 start_time_millis,
                 end_time_millis,
                 upstreams_deny_pattern,
-                include_view_lineage,
             )

     @staticmethod
@@ -677,12 +674,9 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         start_time_millis: int,
         end_time_millis: int,
         upstreams_deny_pattern: List[str],
-        include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
             SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER
-            if include_view_lineage
-            else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )

         upstream_sql_filter = create_deny_regex_sql_filter(
@@ -847,12 +841,9 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         start_time_millis: int,
         end_time_millis: int,
         upstreams_deny_pattern: List[str],
-        include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
             SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER
-            if include_view_lineage
-            else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )

         upstream_sql_filter = create_deny_regex_sql_filter(
@@ -956,4 +947,8 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
        AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}'
        ORDER BY MEASUREMENT_TIME ASC;

-        """
+        """
+
+    @staticmethod
+    def get_all_users() -> str:
+        return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
     classification_workunit_processor,
 )
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
 )
 from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
 from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
+from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
 from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.snowflake.snowflake_schema import (
     SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     get_domain_wu,
 )
 from datahub.ingestion.source_report.ingestion_stage import (
+    EXTERNAL_TABLE_DDL_LINEAGE,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
-from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownLineageMapping,
+    SqlParsingAggregator,
+)
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
-        self.aggregator: Optional[SqlParsingAggregator] = aggregator
+
+        self.aggregator = aggregator

     def get_connection(self) -> SnowflakeConnection:
         return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
                 yield from self._process_database(snowflake_db)

+            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
+            discovered_tables: List[str] = [
+                self.identifiers.get_dataset_identifier(
+                    table_name, schema.name, db.name
+                )
+                for db in self.databases
+                for schema in db.schemas
+                for table_name in schema.tables
+            ]
+            if self.aggregator:
+                for entry in self._external_tables_ddl_lineage(discovered_tables):
+                    self.aggregator.add(entry)
+
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
@@ -415,11 +435,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         )

         if self.config.include_views:
-            if (
-                self.aggregator
-                and self.config.include_view_lineage
-                and self.config.parse_view_ddl
-            ):
+            if self.aggregator:
                 for view in views:
                     view_identifier = self.identifiers.get_dataset_identifier(
                         view.name, schema_name, db_name
@@ -1082,3 +1098,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # Access to table but none of its constraints - is this possible ?
         return constraints.get(table_name, [])
+
+    # Handles the case for explicitly created external tables.
+    # NOTE: Snowflake does not log this information to the access_history table.
+    def _external_tables_ddl_lineage(
+        self, discovered_tables: List[str]
+    ) -> Iterable[KnownLineageMapping]:
+        external_tables_query: str = SnowflakeQuery.show_external_tables()
+        try:
+            for db_row in self.connection.query(external_tables_query):
+                key = self.identifiers.get_dataset_identifier(
+                    db_row["name"], db_row["schema_name"], db_row["database_name"]
+                )
+
+                if key not in discovered_tables:
+                    continue
+                if db_row["location"].startswith("s3://"):
+                    yield KnownLineageMapping(
+                        upstream_urn=make_s3_urn_for_lineage(
+                            db_row["location"], self.config.env
+                        ),
+                        downstream_urn=self.identifiers.gen_dataset_urn(key),
+                    )
+                    self.report.num_external_table_edges_scanned += 1
+
+                self.report.num_external_table_edges_scanned += 1
+        except Exception as e:
+            self.structured_reporter.warning(
+                "External table ddl lineage extraction failed",
+                exc=e,
+            )
datahub/ingestion/source/snowflake/snowflake_shares.py

@@ -72,7 +72,7 @@ class SnowflakeSharesHandler(SnowflakeCommonMixin):
             assert len(sibling_dbs) == 1
             # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy
             # hence this lineage code is not written in SnowflakeLineageExtractor
-            # also this is not governed by configs include_table_lineage and include_view_lineage
+            # also this is not governed by configs include_table_lineage
             yield self.get_upstream_lineage_with_primary_sibling(
                 db.name, schema.name, table_name, sibling_dbs[0]
             )
datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -342,10 +342,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
             filtered_user_counts.append(
                 DatasetUserUsageCounts(
                     user=make_user_urn(
-                        self.get_user_identifier(
+                        self.identifiers.get_user_identifier(
                             user_count["user_name"],
                             user_email,
-                            self.config.email_as_user_identifier,
                         )
                     ),
                     count=user_count["total"],
@@ -453,9 +452,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         reported_time: int = int(time.time() * 1000)
         last_updated_timestamp: int = int(start_time.timestamp() * 1000)
         user_urn = make_user_urn(
-            self.get_user_identifier(
-                user_name, user_email, self.config.email_as_user_identifier
-            )
+            self.identifiers.get_user_identifier(user_name, user_email)
        )

         # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect
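Both usage code paths now call identifiers.get_user_identifier without passing email_as_user_identifier; that flag, together with email_domain, now lives on the shared identifier config. A conceptual sketch of the selection order described by the config docstrings (hypothetical helper, not the actual method):

    from typing import Optional


    def pick_user_identifier(
        user_name: str,
        user_email: Optional[str],
        email_as_user_identifier: bool = True,
        email_domain: Optional[str] = None,
    ) -> str:
        # 1. Use the Snowflake-provided email when allowed and present.
        if email_as_user_identifier and user_email:
            return user_email.lower()
        # 2. Otherwise synthesize one from the username and email_domain, if configured.
        if email_as_user_identifier and email_domain:
            return f"{user_name}@{email_domain}".lower()
        # 3. Fall back to the bare username.
        return user_name.lower()


    print(pick_user_identifier("ALICE_W", "alice@example.com"))                  # alice@example.com
    print(pick_user_identifier("SVC_LOADER", None, email_domain="example.com"))  # svc_loader@example.com
    print(pick_user_identifier("SVC_LOADER", None))                              # svc_loader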