acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (65)
  1. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/METADATA +2659 -2578
  2. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/RECORD +65 -57
  3. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +45 -5
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/common/subtypes.py +3 -0
  11. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  12. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  13. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  14. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  15. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  16. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  17. datahub/ingestion/source/excel/__init__.py +0 -0
  18. datahub/ingestion/source/excel/config.py +92 -0
  19. datahub/ingestion/source/excel/excel_file.py +539 -0
  20. datahub/ingestion/source/excel/profiling.py +308 -0
  21. datahub/ingestion/source/excel/report.py +49 -0
  22. datahub/ingestion/source/excel/source.py +662 -0
  23. datahub/ingestion/source/excel/util.py +18 -0
  24. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  25. datahub/ingestion/source/openapi.py +1 -1
  26. datahub/ingestion/source/powerbi/config.py +33 -0
  27. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  28. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  29. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  30. datahub/ingestion/source/s3/source.py +65 -59
  31. datahub/ingestion/source/snowflake/constants.py +2 -0
  32. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  33. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  34. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  35. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  36. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  37. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
  38. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  39. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  40. datahub/ingestion/source/snowflake/snowflake_v2.py +5 -1
  41. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  42. datahub/ingestion/source/sql_queries.py +24 -2
  43. datahub/ingestion/source/state/checkpoint.py +3 -28
  44. datahub/metadata/_internal_schema_classes.py +568 -512
  45. datahub/metadata/_urns/urn_defs.py +1748 -1748
  46. datahub/metadata/schema.avsc +18242 -18168
  47. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  48. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  49. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  50. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  51. datahub/metadata/schemas/Ownership.avsc +69 -0
  52. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  53. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  54. datahub/metadata/schemas/__init__.py +3 -3
  55. datahub/sdk/lineage_client.py +6 -26
  56. datahub/sdk/main_client.py +7 -3
  57. datahub/sdk/search_filters.py +16 -0
  58. datahub/specific/aspect_helpers/siblings.py +73 -0
  59. datahub/specific/dataset.py +2 -0
  60. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  61. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  62. datahub/upgrade/upgrade.py +14 -2
  63. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/WHEEL +0 -0
  64. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/licenses/LICENSE +0 -0
  65. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py
@@ -353,6 +353,19 @@ class PowerBiDashboardSourceConfig(
         "For example with an ODBC connection string 'DSN=database' where the database type "
         "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
     )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such a MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified tables names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -614,3 +627,23 @@ class PowerBiDashboardSourceConfig(
             "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
         )
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
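For context, a minimal sketch of how the new dsn_to_database_schema option could sit next to the existing dsn_to_platform_name mapping in a Power BI source config. The DSN, database, and schema names below are hypothetical and the credential fields are placeholders; this fragment is illustrative, not taken from the diff above.

    # Hypothetical Power BI source config fragment:
    powerbi_config = {
        "tenant_id": "<tenant-id>",
        "client_id": "<client-id>",
        "client_secret": "<client-secret>",
        # Existing option: map an ODBC DSN to a DataHub platform name.
        "dsn_to_platform_name": {"sales_dsn": "postgres"},
        # New option: map an ODBC DSN to a database, or to database.schema,
        # so unqualified table names in ODBC SQL queries can be resolved.
        "dsn_to_database_schema": {
            "sales_dsn": "prod",         # platforms that use database.table
            "finance_dsn": "prod.data",  # platforms that use database.schema.table
        },
    }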
datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -76,3 +76,4 @@ class FunctionName(Enum):
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
     MYSQL_DATA_ACCESS = "MySQL.Database"
     ODBC_DATA_ACCESS = "Odbc.DataSource"
+    ODBC_QUERY = "Odbc.Query"
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -3,7 +3,9 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Dict, List, Optional, Tuple, Type, cast
 
+import sqlglot
 from lark import Tree
+from sqlglot import ParseError, expressions as exp
 
 from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
@@ -209,15 +211,34 @@ class AbstractLineage(ABC):
 
         return None
 
+    @staticmethod
+    def is_sql_query(query: Optional[str]) -> bool:
+        if not query:
+            return False
+        query = native_sql_parser.remove_special_characters(query)
+        try:
+            expression = sqlglot.parse_one(query)
+            return isinstance(expression, exp.Select)
+        except (ParseError, Exception):
+            logger.debug(f"Failed to parse query as SQL: {query}")
+            return False
+
     def parse_custom_sql(
-        self, query: str, server: str, database: Optional[str], schema: Optional[str]
+        self,
+        query: str,
+        server: str,
+        database: Optional[str],
+        schema: Optional[str],
+        platform_pair: Optional[DataPlatformPair] = None,
     ) -> Lineage:
         dataplatform_tables: List[DataPlatformTable] = []
+        if not platform_pair:
+            platform_pair = self.get_platform_pair()
 
         platform_detail: PlatformDetail = (
             self.platform_instance_resolver.get_platform_instance(
                 PowerBIPlatformDetail(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     data_platform_server=server,
                 )
             )
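For illustration, the new is_sql_query helper relies on sqlglot and treats only parseable SELECT statements as SQL. A minimal standalone sketch of that check follows; the input strings are made up, and this is not the exact DataHub code path, which also strips special characters first.

    import sqlglot
    from sqlglot import expressions as exp

    def looks_like_select(text: str) -> bool:
        # Anything that does not parse as a SELECT statement is treated as "not SQL".
        try:
            return isinstance(sqlglot.parse_one(text), exp.Select)
        except Exception:
            return False

    print(looks_like_select("SELECT id, name FROM prod.data.users"))  # True
    print(looks_like_select("DSN=sales_dsn"))                         # False (not a SELECT)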
@@ -231,7 +252,7 @@ class AbstractLineage(ABC):
             native_sql_parser.parse_custom_sql(
                 ctx=self.ctx,
                 query=query,
-                platform=self.get_platform_pair().datahub_data_platform_name,
+                platform=platform_pair.datahub_data_platform_name,
                 platform_instance=platform_detail.platform_instance,
                 env=platform_detail.env,
                 database=database,
@@ -258,7 +279,7 @@ class AbstractLineage(ABC):
         for urn in parsed_result.in_tables:
             dataplatform_tables.append(
                 DataPlatformTable(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     urn=urn,
                 )
             )
@@ -956,7 +977,7 @@ class OdbcLineage(AbstractLineage):
             f"data-access function detail {data_access_func_detail}"
         )
 
-        connect_string, _ = self.get_db_detail_from_argument(
+        connect_string, query = self.get_db_detail_from_argument(
             data_access_func_detail.arg_list
         )
 
@@ -972,12 +993,19 @@ class OdbcLineage(AbstractLineage):
         data_platform, powerbi_platform = extract_platform(connect_string)
         server_name = extract_server(connect_string)
 
+        dsn = extract_dsn(connect_string)
+        if not dsn:
+            self.reporter.warning(
+                title="Can not determine ODBC DSN",
+                message="Can not extract DSN from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+        logger.debug(f"Extracted DSN: {dsn}")
+
         if not data_platform:
-            dsn = extract_dsn(connect_string)
-            if dsn:
-                logger.debug(f"Extracted DSN: {dsn}")
-                server_name = dsn
-            if dsn and self.config.dsn_to_platform_name:
+            server_name = dsn
+            if self.config.dsn_to_platform_name:
                 logger.debug(f"Attempting to map DSN {dsn} to platform")
                 name = self.config.dsn_to_platform_name.get(dsn)
                 if name:
@@ -1006,6 +1034,63 @@ class OdbcLineage(AbstractLineage):
         elif not server_name:
             server_name = "unknown"
 
+        if self.is_sql_query(query):
+            return self.query_lineage(query, platform_pair, server_name, dsn)
+        else:
+            return self.expression_lineage(
+                data_access_func_detail, data_platform, platform_pair, server_name
+            )
+
+    def query_lineage(
+        self,
+        query: Optional[str],
+        platform_pair: DataPlatformPair,
+        server_name: str,
+        dsn: str,
+    ) -> Lineage:
+        database = None
+        schema = None
+
+        if not query:
+            # query should never be None as it is checked before calling this function.
+            # however, we need to check just in case.
+            self.reporter.warning(
+                title="ODBC Query is null",
+                message="No SQL to parse. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+
+        if self.config.dsn_to_database_schema:
+            value = self.config.dsn_to_database_schema.get(dsn)
+            if value:
+                parts = value.split(".")
+                if len(parts) == 1:
+                    database = parts[0]
+                elif len(parts) == 2:
+                    database = parts[0]
+                    schema = parts[1]
+
+        logger.debug(
+            f"ODBC query processing: dsn={dsn} mapped to database={database}, schema={schema}"
+        )
+        result = self.parse_custom_sql(
+            query=query,
+            server=server_name,
+            database=database,
+            schema=schema,
+            platform_pair=platform_pair,
+        )
+        logger.debug(f"ODBC query lineage generated {len(result.upstreams)} upstreams")
+        return result
+
+    def expression_lineage(
+        self,
+        data_access_func_detail: DataAccessFunctionDetail,
+        data_platform: str,
+        platform_pair: DataPlatformPair,
+        server_name: str,
+    ) -> Lineage:
         database_name = None
         schema_name = None
         table_name = None
@@ -1144,6 +1229,11 @@ class SupportedPattern(Enum):
         FunctionName.ODBC_DATA_ACCESS,
     )
 
+    ODBC_QUERY = (
+        OdbcLineage,
+        FunctionName.ODBC_QUERY,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
datahub/ingestion/source/powerbi/powerbi.py
@@ -40,6 +40,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIAssetSubTypes,
     BIContainerSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.powerbi.config import (
     Constant,
@@ -1229,6 +1230,10 @@ class Mapper:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.POWERBI_WORKSPACE,
+        SourceCapabilityModifier.POWERBI_DATASET,
+    ],
 )
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/s3/source.py
@@ -34,7 +34,13 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags, list_folders
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders,
+    list_folders_path,
+    list_objects_recursive,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
@@ -84,8 +90,6 @@ if TYPE_CHECKING:
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
@@ -384,7 +388,10 @@ class S3Source(StatefulIngestionSourceBase):
 
     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-        file = file.replace("s3://", "s3a://")
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"
 
         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})
 
@@ -836,29 +843,31 @@ class S3Source(StatefulIngestionSourceBase):
             content_type=browse_path.content_type,
         )
 
-    def resolve_templated_folders(self, bucket_name: str, prefix: str) -> Iterable[str]:
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return
 
-        folders: Iterable[str] = list_folders(
-            bucket_name, folder_split[0], self.source_config.aws_config
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
-            # Ensure proper path joining - folder already includes trailing slash from list_folders
-            # but we need to handle the case where folder_split[1] might start with a slash
+            # Ensure proper path joining - folders from list_folders_path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
            remaining_pattern = folder_split[1]
            if remaining_pattern.startswith("/"):
                remaining_pattern = remaining_pattern[1:]
 
-            # Ensure folder ends with slash for proper path construction
-            if not folder.endswith("/"):
-                folder = folder + "/"
-
            yield from self.resolve_templated_folders(
-                bucket_name, f"{folder}{remaining_pattern}"
+                f"{folder.path}/{remaining_pattern}"
            )
 
     def get_dir_to_process(
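To illustrate the dirname/startswith split that the refactored resolve_templated_folders performs on the literal part of the prefix (everything before the first "*"), here is a small worked example; the bucket and folder names are made up, and the same kind of split is reused in _process_simple_path further below.

    # Everything before the first "*" is split into a directory to list and a name filter.
    prefix = "s3://my-bucket/data/year=2024/part-*/events/"  # portion before "{table}"
    literal = prefix.split("*", 1)[0]                        # "s3://my-bucket/data/year=2024/part-"
    basename_startswith = literal.split("/")[-1]             # "part-"
    dirname = literal.removesuffix(basename_startswith)      # "s3://my-bucket/data/year=2024/"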
@@ -942,7 +951,9 @@ class S3Source(StatefulIngestionSourceBase):
         # Instead of loading all objects into memory, we'll accumulate folder data incrementally
         folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
 
-        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+        for obj in list_objects_recursive(
+            bucket.name, prefix, self.source_config.aws_config
+        ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)
 
             if not _is_allowed_path(path_spec, s3_path):
@@ -1016,13 +1027,6 @@ class S3Source(StatefulIngestionSourceBase):
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
 
-        s3 = self.source_config.aws_config.get_s3_resource(
-            self.source_config.verify_ssl
-        )
-        bucket_name = get_bucket_name(path_spec.include)
-        bucket = s3.Bucket(bucket_name)
-
-        logger.debug(f"Scanning bucket: {bucket_name}")
         logger.info(f"Processing path spec: {path_spec.include}")
 
         # Check if we have {table} template in the path
@@ -1034,16 +1038,14 @@ class S3Source(StatefulIngestionSourceBase):
             logger.info("Using templated path processing")
             # Always use templated processing when {table} is present
             # This groups files under table-level datasets
-            yield from self._process_templated_path(path_spec, bucket, bucket_name)
+            yield from self._process_templated_path(path_spec)
         else:
             logger.info("Using simple path processing")
             # Only use simple processing for non-templated paths
             # This creates individual file-level datasets
-            yield from self._process_simple_path(path_spec, bucket, bucket_name)
+            yield from self._process_simple_path(path_spec)
 
-    def _process_templated_path(
-        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
-    ) -> Iterable[BrowsePath]:
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:  # noqa: C901
         """
         Process S3 paths containing {table} templates to create table-level datasets.
 
@@ -1057,12 +1059,17 @@ class S3Source(StatefulIngestionSourceBase):
 
         Args:
             path_spec: Path specification with {table} template
-            bucket: S3 bucket resource
-            bucket_name: Name of the S3 bucket
 
         Yields:
             BrowsePath: One per table (not per file), containing aggregated metadata
         """
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
         # Find the part before {table}
         table_marker = "{table}"
         if table_marker not in path_spec.include:
@@ -1097,20 +1104,13 @@ class S3Source(StatefulIngestionSourceBase):
 
         # Split the path at {table} to get the prefix that needs wildcard resolution
         prefix_before_table = include.split(table_marker)[0]
-        # Remove the s3:// and bucket name to get the relative path
-        relative_path = get_bucket_relative_path(prefix_before_table)
-
         logger.info(f"Prefix before table: {prefix_before_table}")
-        logger.info(f"Relative path for resolution: {relative_path}")
 
         try:
             # STEP 2: Resolve ALL wildcards in the path up to {table}
-            # This converts patterns like "data/*/logs/" to actual paths like ["data/2023/logs/", "data/2024/logs/"]
-            table_index = include.find(table_marker)
-            folder_prefix = get_bucket_relative_path(include[:table_index])
-
+            # This converts patterns like "s3://data/*/logs/" to actual paths like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
             resolved_prefixes = list(
-                self.resolve_templated_folders(bucket_name, folder_prefix)
+                self.resolve_templated_folders(prefix_before_table)
             )
             logger.info(f"Resolved prefixes: {resolved_prefixes}")
 
@@ -1121,20 +1121,22 @@ class S3Source(StatefulIngestionSourceBase):
                 # Get all folders that could be tables under this resolved prefix
                 # These are the actual table names (e.g., "users", "events", "logs")
                 table_folders = list(
-                    list_folders(
-                        bucket_name, resolved_prefix, self.source_config.aws_config
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
                     )
                 )
                 logger.debug(
-                    f"Found table folders under {resolved_prefix}: {table_folders}"
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
                 )
 
                 # STEP 4: Process each table folder to create a table-level dataset
-                for table_folder in table_folders:
+                for folder in table_folders:
+                    bucket_name = get_bucket_name(folder.path)
+                    table_folder = get_bucket_relative_path(folder.path)
+                    bucket = s3.Bucket(bucket_name)
+
                     # Create the full S3 path for this table
-                    table_s3_path = self.create_s3_path(
-                        bucket_name, table_folder.rstrip("/")
-                    )
+                    table_s3_path = self.create_s3_path(bucket_name, table_folder)
                     logger.info(
                         f"Processing table folder: {table_folder} -> {table_s3_path}"
                     )
@@ -1269,17 +1271,16 @@ class S3Source(StatefulIngestionSourceBase):
                     )
 
         except Exception as e:
-            if "NoSuchBucket" in repr(e):
+            if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
                 self.get_report().report_warning(
-                    "Missing bucket", f"No bucket found {bucket_name}"
+                    "Missing bucket",
+                    f"No bucket found {e.response['Error'].get('BucketName')}",
                 )
                 return
             logger.error(f"Error in _process_templated_path: {e}")
             raise e
 
-    def _process_simple_path(
-        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
-    ) -> Iterable[BrowsePath]:
+    def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         """
         Process simple S3 paths without {table} templates to create file-level datasets.
 
@@ -1295,8 +1296,6 @@ class S3Source(StatefulIngestionSourceBase):
 
         Args:
             path_spec: Path specification without {table} template
-            bucket: S3 bucket resource
-            bucket_name: Name of the S3 bucket
 
         Yields:
             BrowsePath: One per file, containing individual file metadata
@@ -1305,20 +1304,27 @@ class S3Source(StatefulIngestionSourceBase):
             - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
             - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
         """
-        assert self.source_config.aws_config is not None, "aws_config not set"
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
 
         path_spec.sample_files = False  # Disable sampling for simple paths
 
         # Extract the prefix from the path spec (stops at first wildcard)
-        prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
+        prefix = self.get_prefix(path_spec.include)
 
-        # Get s3 resource for content type checking
-        s3 = self.source_config.aws_config.get_s3_resource(
-            self.source_config.verify_ssl
-        )
+        basename_startswith = prefix.split("/")[-1]
+        dirname = prefix.removesuffix(basename_startswith)
 
         # Iterate through all objects in the bucket matching the prefix
-        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+        for obj in list_objects_recursive_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
+        ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)
 
             # Get content type if configured
datahub/ingestion/source/snowflake/constants.py
@@ -9,6 +9,8 @@ class SnowflakeCloudProvider(StrEnum):
 
 SNOWFLAKE_DEFAULT_CLOUD = SnowflakeCloudProvider.AWS
 
+DEFAULT_SNOWFLAKE_DOMAIN = "snowflakecomputing.com"
+
 
 class SnowflakeEdition(StrEnum):
     STANDARD = "Standard"
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -216,6 +216,16 @@ class SnowflakeV2Config(
         description="If enabled, populates the ingested views' definitions.",
     )
 
+    fetch_views_from_information_schema: bool = Field(
+        default=False,
+        description="If enabled, uses information_schema.views to fetch view definitions instead of SHOW VIEWS command. "
+        "This alternative method can be more reliable for databases with large numbers of views (> 10K views), as the "
+        "SHOW VIEWS approach has proven unreliable and can lead to missing views in such scenarios. However, this method "
+        "requires OWNERSHIP privileges on views to retrieve their definitions. For views without ownership permissions "
+        "(where VIEW_DEFINITION is null/empty), the system will automatically fall back to using batched SHOW VIEWS queries "
+        "to populate the missing definitions.",
+    )
+
     include_technical_schema: bool = Field(
         default=True,
         description="If enabled, populates the snowflake technical schema and descriptions.",
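As an illustration, a recipe-style fragment that opts in to the new view-fetching behaviour; the account and credential values below are placeholders, not taken from the diff.

    # Hypothetical Snowflake source config fragment:
    snowflake_config = {
        "account_id": "myorg-myaccount",
        "username": "datahub_user",
        "password": "<password>",
        "warehouse": "COMPUTE_WH",
        # Use information_schema.views for databases with very large view counts;
        # views whose definitions are not readable (no OWNERSHIP) fall back to
        # batched SHOW VIEWS queries, as described above.
        "fetch_views_from_information_schema": True,
    }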
datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -22,6 +22,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.source.snowflake.constants import (
     CLIENT_PREFETCH_THREADS,
     CLIENT_SESSION_KEEP_ALIVE,
+    DEFAULT_SNOWFLAKE_DOMAIN,
 )
 from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthConfiguration,
@@ -47,8 +48,6 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
-_SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
-
 
 class SnowflakePermissionError(MetaError):
     """A permission error has happened"""
@@ -110,6 +109,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         default=None,
         description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
     )
+    snowflake_domain: str = pydantic.Field(
+        default=DEFAULT_SNOWFLAKE_DOMAIN,
+        description="Snowflake domain. Use 'snowflakecomputing.com' for most regions or 'snowflakecomputing.cn' for China (cn-northwest-1) region.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -118,10 +121,13 @@ class SnowflakeConnectionConfig(ConfigModel):
     rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")
 
     @pydantic.validator("account_id")
-    def validate_account_id(cls, account_id: str) -> str:
+    def validate_account_id(cls, account_id: str, values: Dict) -> str:
         account_id = remove_protocol(account_id)
         account_id = remove_trailing_slashes(account_id)
-        account_id = remove_suffix(account_id, _SNOWFLAKE_HOST_SUFFIX)
+        # Get the domain from config, fallback to default
+        domain = values.get("snowflake_domain", DEFAULT_SNOWFLAKE_DOMAIN)
+        snowflake_host_suffix = f".{domain}"
+        account_id = remove_suffix(account_id, snowflake_host_suffix)
         return account_id
 
     @pydantic.validator("authentication_type", always=True)
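A rough, standalone sketch of what the updated validator does for a non-default domain; the account name is made up, and removeprefix/rstrip stand in for the remove_protocol and remove_trailing_slashes helpers.

    # With snowflake_domain set to "snowflakecomputing.cn":
    account_id = "https://myorg-account1.snowflakecomputing.cn/"
    account_id = account_id.removeprefix("https://").rstrip("/")    # ~ remove_protocol + remove_trailing_slashes
    account_id = account_id.removesuffix(".snowflakecomputing.cn")  # suffix derived from snowflake_domain
    # account_id == "myorg-account1"; connections are then made with
    # host = f"{account_id}.snowflakecomputing.cn"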
@@ -311,6 +317,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
 
@@ -324,6 +331,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 role=self.role,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
 
@@ -337,6 +345,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 role=self.role,
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
@@ -348,6 +357,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 role=self.role,
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
@@ -363,6 +373,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 role=self.role,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         else:
@@ -408,7 +419,7 @@ class SnowflakeConnection(Closeable):
         # We often run multiple queries in parallel across multiple threads,
         # so we need to number them to help with log readability.
         query_num = self.get_query_no()
-        logger.info(f"Query #{query_num}: {query}", stacklevel=2)
+        logger.info(f"Query #{query_num}: {query.rstrip()}", stacklevel=2)
         resp = self._connection.cursor(DictCursor).execute(query)
         if resp is not None and resp.rowcount is not None:
             logger.info(
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -266,6 +266,33 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """
 
+    @staticmethod
+    def get_views_for_database(db_name: str) -> str:
+        # We've seen some issues with the `SHOW VIEWS` query,
+        # particularly when it requires pagination.
+        # This is an experimental alternative query that might be more reliable.
+        return f"""\
+SELECT
+    TABLE_CATALOG as "VIEW_CATALOG",
+    TABLE_SCHEMA as "VIEW_SCHEMA",
+    TABLE_NAME as "VIEW_NAME",
+    COMMENT,
+    VIEW_DEFINITION,
+    CREATED,
+    LAST_ALTERED,
+    IS_SECURE
+FROM "{db_name}".information_schema.views
+WHERE TABLE_CATALOG = '{db_name}'
+    AND TABLE_SCHEMA != 'INFORMATION_SCHEMA'
+"""
+
+    @staticmethod
+    def get_views_for_schema(db_name: str, schema_name: str) -> str:
+        return f"""\
+{SnowflakeQuery.get_views_for_database(db_name).rstrip()}
+    AND TABLE_SCHEMA = '{schema_name}'
+"""
+
     @staticmethod
     def get_secure_view_definitions() -> str:
         # https://docs.snowflake.com/en/sql-reference/account-usage/views
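For reference, the schema-level helper just reuses the database-level query and narrows it; with hypothetical names it would be used as:

    # Hypothetical database and schema names:
    sql = SnowflakeQuery.get_views_for_schema("ANALYTICS", "REPORTING")
    # -> the SELECT above against "ANALYTICS".information_schema.views, with an
    #    extra predicate: AND TABLE_SCHEMA = 'REPORTING'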
datahub/ingestion/source/snowflake/snowflake_report.py
@@ -128,6 +128,7 @@ class SnowflakeV2Report(
     # "Information schema query returned too much data. Please repeat query with more selective predicates.""
     # This will result in overall increase in time complexity
     num_get_tables_for_schema_queries: int = 0
+    num_get_views_for_schema_queries: int = 0
 
     # these will be non-zero if the user choses to enable the extract_tags = "with_lineage" option, which requires
     # individual queries per object (database, schema, table) and an extra query per table to get the tags on the columns.