acryl-datahub 1.1.0rc4__py3-none-any.whl → 1.1.0.1rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/METADATA +2609 -2607
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/RECORD +87 -70
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/delete_cli.py +4 -4
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/graph/client.py +3 -3
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/athena.py +1 -0
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/ingestion/source/sql/sql_common.py +98 -34
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +117 -0
- datahub/ingestion/source/unity/source.py +167 -15
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/metadata/_internal_schema_classes.py +667 -522
- datahub/metadata/_urns/urn_defs.py +1804 -1748
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/schema.avsc +17358 -17584
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +1 -0
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
- datahub/metadata/schemas/MLModelKey.avsc +1 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +342 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +681 -82
- datahub/sdk/main_client.py +27 -8
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +18 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/top_level.txt +0 -0
@@ -59,17 +59,21 @@ def request_call(
     username: Optional[str] = None,
     password: Optional[str] = None,
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> requests.Response:
     headers = {"accept": "application/json"}
     if username is not None and password is not None:
         return requests.get(
-            url, headers=headers, auth=HTTPBasicAuth(username, password)
+            url,
+            headers=headers,
+            auth=HTTPBasicAuth(username, password),
+            verify=verify_ssl,
         )
     elif token is not None:
         headers["Authorization"] = f"{token}"
-        return requests.get(url, proxies=proxies, headers=headers)
+        return requests.get(url, proxies=proxies, headers=headers, verify=verify_ssl)
     else:
-        return requests.get(url, headers=headers)
+        return requests.get(url, headers=headers, verify=verify_ssl)


 def get_swag_json(
@@ -79,10 +83,16 @@ def get_swag_json(
     password: Optional[str] = None,
     swagger_file: str = "",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> Dict:
     tot_url = url + swagger_file
     response = request_call(
-        url=tot_url, token=token, username=username, password=password, proxies=proxies
+        url=tot_url,
+        token=token,
+        username=username,
+        password=password,
+        proxies=proxies,
+        verify_ssl=verify_ssl,
     )

     if response.status_code != 200:
@@ -127,37 +137,45 @@ def get_endpoints(sw_dict: dict) -> dict:
     check_sw_version(sw_dict)

     for p_k, p_o in sw_dict["paths"].items():
-        [31 removed lines: the previous "get"-only endpoint handling; full text not captured in this rendering]
+        for method, method_spec in p_o.items():
+            # skip non-method keys like "parameters"
+            if method.lower() not in [
+                "get",
+                "post",
+                "put",
+                "delete",
+                "patch",
+                "options",
+                "head",
+            ]:
+                continue
+
+            responses = method_spec.get("responses", {})
+            base_res = responses.get("200") or responses.get(200)
+            if not base_res:
+                # if there is no 200 response, we skip this method
+                continue
+
+            # if the description is not present, we will use the summary
+            # if both are not present, we will use an empty string
+            desc = method_spec.get("description") or method_spec.get("summary", "")
+
+            # if the tags are not present, we will use an empty list
+            tags = method_spec.get("tags", [])
+
+            url_details[p_k] = {
+                "description": desc,
+                "tags": tags,
+                "method": method.upper(),
+            }
+
+            example_data = check_for_api_example_data(base_res, p_k)
+            if example_data:
+                url_details[p_k]["data"] = example_data
+
+            # checking whether there are defined parameters to execute the call...
+            if "parameters" in p_o[method]:
+                url_details[p_k]["parameters"] = p_o[method]["parameters"]

     return dict(sorted(url_details.items()))

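The reworked loop walks every HTTP method under each path instead of only `get`. A minimal sketch of what this produces, assuming the `get_endpoints` shown above (datahub/ingestion/source/openapi_parser.py per the file list); the pet-store style spec is hypothetical:

```python
# Minimal sketch, assuming the reworked get_endpoints above; the spec is hypothetical.
from datahub.ingestion.source.openapi_parser import get_endpoints

sw_dict = {
    "openapi": "3.0.0",
    "paths": {
        "/pets": {
            "get": {
                "summary": "List pets",
                "tags": ["pets"],
                "responses": {"200": {"description": "ok"}},
            },
            "post": {
                # no 200 response -> this method is skipped
                "responses": {"201": {"description": "created"}},
            },
            # path-level "parameters" is not an HTTP method -> skipped
            "parameters": [],
        }
    },
}

endpoints = get_endpoints(sw_dict)
# Expected shape for "/pets", taken from the GET method:
# {"description": "List pets", "tags": ["pets"], "method": "GET"}
print(endpoints["/pets"])
```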
@@ -358,6 +376,7 @@ def get_tok(
     tok_url: str = "",
     method: str = "post",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> str:
     """
     Trying to post username/password to get auth.

@@ -368,7 +387,7 @@ def get_tok(
         # this will make a POST call with username and password
         data = {"username": username, "password": password, "maxDuration": True}
         # url2post = url + "api/authenticate/"
-        response = requests.post(url4req, proxies=proxies, json=data)
+        response = requests.post(url4req, proxies=proxies, json=data, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             if "token" in cont:  # other authentication scheme

@@ -377,7 +396,7 @@ def get_tok(
                 token = f"Bearer {cont['tokens']['access']}"
     elif method == "get":
         # this will make a GET call with username and password
-        response = requests.get(url4req)
+        response = requests.get(url4req, verify=verify_ssl)
        if response.status_code == 200:
            cont = json.loads(response.content)
            token = cont["token"]
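All three helpers (`request_call`, `get_swag_json`, `get_tok`) now accept a `verify_ssl` flag that is forwarded to the underlying `requests` calls. A hedged usage sketch; the host and credentials below are placeholders, not taken from the package:

```python
# Hedged sketch: fetch an OpenAPI spec from a self-signed endpoint by disabling
# TLS verification via the new verify_ssl flag. Host and credentials are hypothetical.
from datahub.ingestion.source.openapi_parser import get_swag_json

spec = get_swag_json(
    url="https://openapi.internal.example.com/",  # hypothetical endpoint
    username="svc_user",
    password="not-a-real-password",
    swagger_file="openapi.json",
    verify_ssl=False,  # forwarded to requests.get(..., verify=False)
)
print(len(spec.get("paths", {})), "paths discovered")
```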
@@ -7,7 +7,7 @@ import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse

 import smart_open.compression as so_compression

@@ -43,6 +43,9 @@ from datahub.ingestion.source.aws.s3_util import (
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -197,12 +200,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any

     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
@@ -605,6 +655,19 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )

+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,6 +737,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -1082,11 +1146,6 @@ class S3Source(StatefulIngestionSourceBase):
         )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
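The `object_store_adapter` set up in `__init__` is what the new `get_external_url` delegates to when filling `externalUrl` on `DatasetProperties`. A hedged sketch of the factory wiring, limited to the calls actually visible in the hunks above (`create_object_store_adapter`, `register_customization`, `get_external_url`); anything beyond that is an assumption:

```python
# Hedged sketch of the object store adapter wiring used by S3Source above.
# Only calls visible in the diff are used; behaviour beyond them is an assumption.
from datahub.ingestion.source.data_lake_common.object_store import (
    create_object_store_adapter,
)

# Plain S3: the region feeds external (console) URL generation.
s3_adapter = create_object_store_adapter("s3", aws_region="eu-west-1")

# GCS reached through the S3-compatible endpoint (storage.googleapis.com):
# keep emitting s3:// style paths while generating GCS external URLs.
gcs_adapter = create_object_store_adapter("gcs")
gcs_adapter.register_customization(
    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
)

# During ingestion, S3Source.get_external_url simply does:
#     return self.object_store_adapter.get_external_url(table_data)
# where table_data is the TableData built for each discovered table.
```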
@@ -22,6 +22,7 @@ from datahub.ingestion.api.incremental_properties_helper import (
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
+from datahub.ingestion.source.snowflake.constants import SnowflakeEdition
 from datahub.ingestion.source.snowflake.snowflake_connection import (
     SnowflakeConnectionConfig,
 )

@@ -326,6 +327,18 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

+    known_snowflake_edition: Optional[SnowflakeEdition] = Field(
+        default=None,
+        description="Explicitly specify the Snowflake edition (STANDARD or ENTERPRISE). If unset, the edition will be inferred automatically using 'SHOW TAGS'.",
+    )
+
+    # Allows empty containers to be ingested before datasets are added, avoiding permission errors
+    warn_no_datasets: bool = Field(
+        hidden_from_docs=True,
+        default=False,
+        description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
+    )
+
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"
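Two new `SnowflakeV2Config` knobs: `known_snowflake_edition` skips the `SHOW TAGS` edition probe (see `is_standard_edition` further down), and the hidden `warn_no_datasets` downgrades the empty-source failure to a warning. A hedged sketch of how these might be set; connection fields are omitted, so this is an illustration rather than a full recipe:

```python
# Hedged sketch of the two new SnowflakeV2Config fields shown above.
# Only these overrides are shown; account/credential settings are omitted.
from datahub.ingestion.source.snowflake.constants import SnowflakeEdition

snowflake_config_overrides = {
    # Skip the SHOW TAGS probe and treat the account as Standard edition.
    "known_snowflake_edition": SnowflakeEdition.STANDARD,
    # Emit a warning instead of failing when no tables/views/streams are found.
    "warn_no_datasets": True,
}
```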
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     sql_aggregator: Optional[SqlAggregatorReport] = None

     num_ddl_queries_dropped: int = 0
+    num_stream_queries_observed: int = 0
+    num_create_temp_view_queries_observed: int = 0
     num_users: int = 0


@@ -373,6 +375,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             if entry:
                 yield entry

+    @classmethod
+    def _has_temp_keyword(cls, query_text: str) -> bool:
+        return (
+            re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
+            or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
+        )
+
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
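`_has_temp_keyword` is used further down to decide when a `CREATE VIEW` is actually a temporary view worth re-parsing. A standalone illustration of the same word-boundary check (the function name here is local to the sketch):

```python
# Standalone illustration of the TEMP/TEMPORARY word-boundary check used by
# _has_temp_keyword above; has_temp_keyword is a local name for this sketch.
import re


def has_temp_keyword(query_text: str) -> bool:
    return (
        re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
        or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
    )


assert has_temp_keyword("CREATE TEMPORARY VIEW v1 AS SELECT 1")
assert has_temp_keyword("create temp view v2 as select 1")
# \b keeps "temperature_readings" from matching TEMP
assert not has_temp_keyword("CREATE VIEW temperature_readings AS SELECT 1")
```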
@@ -389,6 +398,15 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             key = key.lower()
             res[key] = value

+        timestamp: datetime = res["query_start_time"]
+        timestamp = timestamp.astimezone(timezone.utc)
+
+        # TODO need to map snowflake query types to ours
+        query_text: str = res["query_text"]
+        query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
+            res["query_type"], QueryType.UNKNOWN
+        )
+
         direct_objects_accessed = res["direct_objects_accessed"]
         objects_modified = res["objects_modified"]
         object_modified_by_ddl = res["object_modified_by_ddl"]
@@ -399,9 +417,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             "Error fetching ddl lineage from Snowflake"
         ):
             known_ddl_entry = self.parse_ddl_query(
-                res["query_text"],
+                query_text,
                 res["session_id"],
-                res["query_start_time"],
+                timestamp,
                 object_modified_by_ddl,
                 res["query_type"],
             )
@@ -419,24 +437,38 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )

-        #
-        #
+        # There are a couple cases when we'd want to prefer our own SQL parsing
+        # over Snowflake's metadata.
+        # 1. For queries that use a stream, objects_modified returns $SYS_VIEW_X with no mapping.
+        #    We can check direct_objects_accessed to see if there is a stream used, and if so,
+        #    prefer doing SQL parsing over Snowflake's metadata.
+        # 2. For queries that create a view, objects_modified is empty and object_modified_by_ddl
+        #    contains the view name and columns. Because `object_modified_by_ddl` doesn't contain
+        #    source columns e.g. lineage information, we must do our own SQL parsing. We're mainly
+        #    focused on temporary views. It's fine if we parse a couple extra views, but in general
+        #    we want view definitions to come from Snowflake's schema metadata and not from query logs.
+
         has_stream_objects = any(
             obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
         )
+        is_create_view = query_type == QueryType.CREATE_VIEW
+        is_create_temp_view = is_create_view and self._has_temp_keyword(query_text)
+
+        if has_stream_objects or is_create_temp_view:
+            if has_stream_objects:
+                self.report.num_stream_queries_observed += 1
+            elif is_create_temp_view:
+                self.report.num_create_temp_view_queries_observed += 1

-        # If a stream is used, default to query parsing.
-        if has_stream_objects:
-            logger.debug("Found matching stream object")
             return ObservedQuery(
-                query=res["query_text"],
+                query=query_text,
                 session_id=res["session_id"],
-                timestamp=res["query_start_time"].astimezone(timezone.utc),
+                timestamp=timestamp,
                 user=user,
                 default_db=res["default_db"],
                 default_schema=res["default_schema"],
                 query_hash=get_query_fingerprint(
-                    res["query_text"], self.identifiers.platform, fast=True
+                    query_text, self.identifiers.platform, fast=True
                 ),
             )

@@ -502,25 +534,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )

-        timestamp: datetime = res["query_start_time"]
-        timestamp = timestamp.astimezone(timezone.utc)
-
-        # TODO need to map snowflake query types to ours
-        query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
-        )
-
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-                res["query_text"],
+                query_text,
                 self.identifiers.platform,
                 fast=True,
                 secondary_id=res["query_secondary_fingerprint"],
             ),
-            query_text=res["query_text"],
+            query_text=query_text,
             upstreams=upstreams,
             downstream=downstream,
             column_lineage=column_lineage,
@@ -543,7 +567,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         object_modified_by_ddl: dict,
         query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
-        timestamp = timestamp.astimezone(timezone.utc)
         if (
             object_modified_by_ddl["operationType"] == "ALTER"
             and query_type == "RENAME_TABLE"
@@ -43,13 +43,6 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
-    ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
-        "("
-        f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
-        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
-        ")"
-    )

     @staticmethod
     def current_account() -> str:
@@ -9,6 +9,7 @@ import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Union

+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -551,11 +552,15 @@ class SnowflakeV2Source(
             and len(discovered_views) == 0
             and len(discovered_streams) == 0
         ):
-            [5 removed lines: the previous unconditional structured_reporter failure; full text not captured in this rendering]
+            if self.config.warn_no_datasets:
+                self.structured_reporter.warning(
+                    "No tables/views/streams found. Verify dataset permissions if Snowflake source is not empty.",
+                )
+            else:
+                self.structured_reporter.failure(
+                    GENERIC_PERMISSION_ERROR_KEY,
+                    "No tables/views/streams found. Verify dataset permissions in Snowflake.",
+                )

         self.discovered_datasets = (
             discovered_tables + discovered_views + discovered_streams
@@ -571,7 +576,11 @@ class SnowflakeV2Source(
             queries_extractor = SnowflakeQueriesExtractor(
                 connection=self.connection,
                 config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
+                    window=BaseTimeWindowConfig(
+                        start_time=self.config.start_time,
+                        end_time=self.config.end_time,
+                        bucket_duration=self.config.bucket_duration,
+                    ),
                     temporary_tables_pattern=self.config.temporary_tables_pattern,
                     include_lineage=self.config.include_table_lineage,
                     include_usage_statistics=self.config.include_usage_stats,
@@ -732,6 +741,8 @@ class SnowflakeV2Source(
         return None

     def is_standard_edition(self) -> bool:
+        if self.config.known_snowflake_edition is not None:
+            return self.config.known_snowflake_edition == SnowflakeEdition.STANDARD
         try:
             self.connection.query(SnowflakeQuery.show_tags())
             return False
@@ -323,6 +323,7 @@ class Partitionitem:
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
+@capability(SourceCapability.LINEAGE_FINE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """
@@ -139,7 +139,7 @@ class StoragePathParser:
             path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"

         elif platform == StoragePlatform.AZURE:
-            if scheme in ("abfs", "abfss"):
+            if scheme in ("abfs", "abfss", "wasbs"):
                 # Format: abfss://container@account.dfs.core.windows.net/path
                 container = parsed.netloc.split("@")[0]
                 path = f"{container}/{parsed.path.lstrip('/')}"

@@ -153,7 +153,7 @@ class StoragePathParser:

         elif platform == StoragePlatform.DBFS:
             # For DBFS, use path as-is
-            path = parsed.path.lstrip("/")
+            path = "/" + parsed.path.lstrip("/")

         elif platform == StoragePlatform.LOCAL:
             # For local files, use full path

@@ -169,7 +169,6 @@ class StoragePathParser:
         # Clean up the path
         path = path.rstrip("/")  # Remove trailing slashes
         path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
-        path = f"/{path}"

         return platform, path

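The Azure branch now also accepts `wasbs://` URIs, DBFS paths keep their leading slash, and the global `/` prefix is no longer re-added at the end. A standalone illustration of the container/path split used in the Azure branch; the URL below is hypothetical:

```python
# Standalone illustration of the Azure container/path split shown above,
# applied to a hypothetical wasbs:// URI (same logic as the abfs/abfss case).
from urllib.parse import urlparse

parsed = urlparse(
    "wasbs://raw-data@myaccount.blob.core.windows.net/events/2024/part-0.parquet"
)
container = parsed.netloc.split("@")[0]
path = f"{container}/{parsed.path.lstrip('/')}"
assert container == "raw-data"
assert path == "raw-data/events/2024/part-0.parquet"
```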