acryl-datahub 1.1.0rc4__py3-none-any.whl → 1.1.1rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/METADATA +2480 -2480
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/RECORD +19 -16
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +12 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/top_level.txt +0 -0
@@ -4,35 +4,62 @@ GCS_PREFIX = "gs://"
 
 
 def is_gcs_uri(uri: str) -> bool:
+    """
+    Check if a URI is a GCS URI (starts with gs://).
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     return uri.startswith(GCS_PREFIX)
 
 
 def get_gcs_prefix(gcs_uri: str) -> Optional[str]:
+    """
+    Get the GCS prefix (gs://) if the URI is a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     if gcs_uri.startswith(GCS_PREFIX):
         return GCS_PREFIX
     return None
 
 
 def strip_gcs_prefix(gcs_uri: str) -> str:
-
+    """
+    Remove the GCS prefix (gs://) from a GCS URI.
+
+    For more general URI handling, consider using the object_store module.
+
+    Args:
+        gcs_uri: A GCS URI starting with gs://
+
+    Returns:
+        The URI without the gs:// prefix
+
+    Raises:
+        ValueError: If the URI doesn't start with gs://
+    """
     prefix = get_gcs_prefix(gcs_uri)
     if not prefix:
-        raise ValueError(f"Not
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
 
     return gcs_uri[len(GCS_PREFIX) :]
 
 
-def get_gcs_bucket_name(path):
-    if not is_gcs_uri(path):
-        raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
-    return strip_gcs_prefix(path).split("/")[0]
-
-
 def get_gcs_bucket_relative_path(gcs_uri: str) -> str:
+    """
+    Get the path relative to the bucket from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     return "/".join(strip_gcs_prefix(gcs_uri).split("/")[1:])
 
 
 def get_gcs_key_prefix(gcs_uri: str) -> str:
+    """
+    Get the key prefix (first path component after bucket) from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     if not is_gcs_uri(gcs_uri):
-        raise ValueError(f"Not a GCS URI. Must start with
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
     return strip_gcs_prefix(gcs_uri).split("/", maxsplit=1)[1]
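Note: the hunk above only adds docstrings, fixes the error-message typo, and drops the unused get_gcs_bucket_name helper; the behavior of the remaining helpers is unchanged. A minimal usage sketch (the bucket and key are made up for illustration):

from datahub.ingestion.source.gcs.gcs_utils import (
    get_gcs_bucket_relative_path,
    is_gcs_uri,
    strip_gcs_prefix,
)

uri = "gs://my-bucket/raw/events/file.parquet"

assert is_gcs_uri(uri)  # simply checks the gs:// prefix
assert strip_gcs_prefix(uri) == "my-bucket/raw/events/file.parquet"
assert get_gcs_bucket_relative_path(uri) == "raw/events/file.parquet"
# Non-GCS URIs still raise ValueError from strip_gcs_prefix and get_gcs_key_prefix.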
@@ -7,7 +7,7 @@ import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -43,6 +43,9 @@ from datahub.ingestion.source.aws.s3_util import (
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -197,12 +200,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any
 
     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
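Note: this hunk moves the ContainerWUCreator construction into __init__ (it is removed from get_workunits_internal in a later hunk) and selects an object-store adapter up front. The sketch below restates the selection branching as a hypothetical pick_adapter helper for readability; it does not call the real create_object_store_adapter or register_customization APIs, and the helper name and return shape are illustrative only.

from typing import Optional, Tuple

def pick_adapter(
    is_s3_platform: bool,
    platform: Optional[str],
    aws_region: Optional[str],
    aws_endpoint_url: Optional[str],
) -> Tuple[str, Optional[str]]:
    """Return (adapter_kind, effective_aws_region) following the diff's branching."""
    if not is_s3_platform:
        # Local files and other platforms fall back to a default adapter.
        return (platform or "file", None)

    endpoint = (aws_endpoint_url or "").lower()
    # Test endpoints (localstack, GCS's S3-compatibility layer) get a fixed region.
    if "localstack" in endpoint or "storage.googleapis.com" in endpoint:
        aws_region = "us-east-1"

    # GCS reached through the S3 compatibility layer uses the GCS adapter, but the
    # real code keeps s3://bucket/key paths via the create_s3_path customization.
    if "storage.googleapis.com" in endpoint:
        return ("gcs", None)
    return ("s3", aws_region)

assert pick_adapter(True, "s3", "eu-west-1", None) == ("s3", "eu-west-1")
assert pick_adapter(True, "s3", None, "http://localstack:4566") == ("s3", "us-east-1")
assert pick_adapter(True, "s3", "eu-west-1", "https://storage.googleapis.com") == ("gcs", None)
assert pick_adapter(False, "file", None, None) == ("file", None)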
@@ -605,6 +655,19 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )
 
+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,6 +737,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -1082,11 +1146,6 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     sql_aggregator: Optional[SqlAggregatorReport] = None
 
     num_ddl_queries_dropped: int = 0
+    num_stream_queries_observed: int = 0
+    num_create_temp_view_queries_observed: int = 0
     num_users: int = 0
 
 
@@ -373,6 +375,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             if entry:
                 yield entry
 
+    @classmethod
+    def _has_temp_keyword(cls, query_text: str) -> bool:
+        return (
+            re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
+            or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
+        )
+
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
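Note: _has_temp_keyword relies on word boundaries, so TEMP or TEMPORARY must appear as a whole word in the query text. A quick standalone check mirroring the regexes above:

import re

def has_temp_keyword(query_text: str) -> bool:
    # Same logic as SnowflakeQueriesExtractor._has_temp_keyword in the hunk above.
    return (
        re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
        or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
    )

assert has_temp_keyword("CREATE TEMP VIEW v AS SELECT 1")
assert has_temp_keyword("create temporary view v as select 1")
assert not has_temp_keyword("CREATE VIEW tempting_name AS SELECT 1")  # \b avoids substring hits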
@@ -389,6 +398,15 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             key = key.lower()
             res[key] = value
 
+        timestamp: datetime = res["query_start_time"]
+        timestamp = timestamp.astimezone(timezone.utc)
+
+        # TODO need to map snowflake query types to ours
+        query_text: str = res["query_text"]
+        query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
+            res["query_type"], QueryType.UNKNOWN
+        )
+
         direct_objects_accessed = res["direct_objects_accessed"]
         objects_modified = res["objects_modified"]
         object_modified_by_ddl = res["object_modified_by_ddl"]
@@ -399,9 +417,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 "Error fetching ddl lineage from Snowflake"
             ):
                 known_ddl_entry = self.parse_ddl_query(
-
+                    query_text,
                     res["session_id"],
-
+                    timestamp,
                     object_modified_by_ddl,
                     res["query_type"],
                 )
@@ -419,24 +437,38 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )
 
-        #
-        #
+        # There are a couple cases when we'd want to prefer our own SQL parsing
+        # over Snowflake's metadata.
+        # 1. For queries that use a stream, objects_modified returns $SYS_VIEW_X with no mapping.
+        #    We can check direct_objects_accessed to see if there is a stream used, and if so,
+        #    prefer doing SQL parsing over Snowflake's metadata.
+        # 2. For queries that create a view, objects_modified is empty and object_modified_by_ddl
+        #    contains the view name and columns. Because `object_modified_by_ddl` doesn't contain
+        #    source columns e.g. lineage information, we must do our own SQL parsing. We're mainly
+        #    focused on temporary views. It's fine if we parse a couple extra views, but in general
+        #    we want view definitions to come from Snowflake's schema metadata and not from query logs.
+
         has_stream_objects = any(
             obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
         )
+        is_create_view = query_type == QueryType.CREATE_VIEW
+        is_create_temp_view = is_create_view and self._has_temp_keyword(query_text)
+
+        if has_stream_objects or is_create_temp_view:
+            if has_stream_objects:
+                self.report.num_stream_queries_observed += 1
+            elif is_create_temp_view:
+                self.report.num_create_temp_view_queries_observed += 1
 
-        # If a stream is used, default to query parsing.
-        if has_stream_objects:
-            logger.debug("Found matching stream object")
             return ObservedQuery(
-                query=
+                query=query_text,
                 session_id=res["session_id"],
-                timestamp=
+                timestamp=timestamp,
                 user=user,
                 default_db=res["default_db"],
                 default_schema=res["default_schema"],
                 query_hash=get_query_fingerprint(
-
+                    query_text, self.identifiers.platform, fast=True
                 ),
             )
 
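Note: combined with the two previous hunks, an audit-log row is now routed to our own SQL parsing (ObservedQuery) instead of Snowflake's access-history lineage whenever it touches a stream or creates a TEMP/TEMPORARY view. A compact restatement of that predicate, with a stand-in QueryType enum (the real one lives in datahub's sql_parsing module) and the boolean inputs passed in directly:

from enum import Enum

class QueryType(Enum):  # illustrative stand-in, not the datahub enum
    CREATE_VIEW = "CREATE_VIEW"
    UNKNOWN = "UNKNOWN"

def prefers_sql_parsing(direct_objects_accessed, query_type, is_temp_view):
    has_stream_objects = any(
        obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
    )
    return has_stream_objects or (query_type == QueryType.CREATE_VIEW and is_temp_view)

assert prefers_sql_parsing([{"objectDomain": "Stream"}], QueryType.UNKNOWN, False)
assert prefers_sql_parsing([], QueryType.CREATE_VIEW, True)
assert not prefers_sql_parsing([{"objectDomain": "Table"}], QueryType.UNKNOWN, False)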
@@ -502,25 +534,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )
 
-        timestamp: datetime = res["query_start_time"]
-        timestamp = timestamp.astimezone(timezone.utc)
-
-        # TODO need to map snowflake query types to ours
-        query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
-        )
-
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-
+                query_text,
                 self.identifiers.platform,
                 fast=True,
                 secondary_id=res["query_secondary_fingerprint"],
             ),
-            query_text=
+            query_text=query_text,
             upstreams=upstreams,
             downstream=downstream,
             column_lineage=column_lineage,
@@ -543,7 +567,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         object_modified_by_ddl: dict,
         query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
-        timestamp = timestamp.astimezone(timezone.utc)
         if (
             object_modified_by_ddl["operationType"] == "ALTER"
             and query_type == "RENAME_TABLE"
@@ -43,13 +43,6 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
-    ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
-        "("
-        f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
-        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
-        ")"
-    )
 
     @staticmethod
     def current_account() -> str:
@@ -139,7 +139,7 @@ class StoragePathParser:
             path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
 
         elif platform == StoragePlatform.AZURE:
-            if scheme in ("abfs", "abfss"):
+            if scheme in ("abfs", "abfss", "wasbs"):
                 # Format: abfss://container@account.dfs.core.windows.net/path
                 container = parsed.netloc.split("@")[0]
                 path = f"{container}/{parsed.path.lstrip('/')}"
@@ -153,7 +153,7 @@ class StoragePathParser:
 
         elif platform == StoragePlatform.DBFS:
             # For DBFS, use path as-is
-            path = parsed.path.lstrip("/")
+            path = "/" + parsed.path.lstrip("/")
 
         elif platform == StoragePlatform.LOCAL:
             # For local files, use full path
@@ -169,7 +169,6 @@ class StoragePathParser:
         # Clean up the path
         path = path.rstrip("/")  # Remove trailing slashes
         path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
-        path = f"/{path}"
 
         return platform, path
 
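Note: the net effect of the three StoragePathParser hunks is that wasbs:// URIs are handled like abfs/abfss, DBFS paths keep an explicit leading slash, and the unconditional leading "/" is no longer prepended for other platforms. The sketch below is an illustrative re-creation of that normalization, not the datahub class itself (the real method's signature is not shown in this extract):

import re
from urllib.parse import urlparse

def normalize(uri: str) -> str:
    parsed = urlparse(uri)
    if parsed.scheme in ("abfs", "abfss", "wasbs"):
        # container@account.dfs.core.windows.net/path -> container/path
        container = parsed.netloc.split("@")[0]
        path = f"{container}/{parsed.path.lstrip('/')}"
    elif parsed.scheme == "dbfs":
        # DBFS keeps its leading slash (second hunk above)
        path = "/" + parsed.path.lstrip("/")
    else:
        path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
    path = path.rstrip("/")             # remove trailing slashes
    path = re.sub(r"/+", "/", path)     # normalize repeated slashes
    # The old unconditional `path = f"/{path}"` is gone, so non-DBFS paths
    # no longer gain a leading slash.
    return path

assert normalize("wasbs://container@account.blob.core.windows.net/dir/file.csv") == "container/dir/file.csv"
assert normalize("dbfs:/mnt/data/table/") == "/mnt/data/table"
assert normalize("s3://bucket/dir//file.csv") == "bucket/dir/file.csv"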
@@ -109,7 +109,7 @@ class ObservedQuery:
     query_hash: Optional[str] = None
     usage_multiplier: int = 1
 
-    # Use this to store
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
 
 