acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/tree_function.py

@@ -1,6 +1,6 @@
 import logging
 from functools import partial
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Union
 
 from lark import Token, Tree
 
@@ -58,7 +58,7 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
         if isinstance(node, Token):
             return None
 
-        for child in cast(Tree, node).children:
+        for child in node.children:
             child_node: Optional[Tree] = internal(child)
             if child_node is not None:
                 return child_node
@@ -99,7 +99,7 @@ def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
                 logger.debug(f"Unable to resolve parameter reference to {ref}")
                 values.append(ref)
         elif isinstance(node, Token):
-            values.append(cast(Token, node).value)
+            values.append(node.value)
             return
         else:
             for child in node.children:

datahub/ingestion/source/powerbi/m_query/validator.py

@@ -1,7 +1,7 @@
 import logging
 from typing import Optional, Tuple
 
-from datahub.ingestion.source.powerbi.m_query import resolver
+import datahub.ingestion.source.powerbi.m_query.data_classes
 
 logger = logging.getLogger(__name__)
 
@@ -14,12 +14,18 @@ def validate_parse_tree(
     :param native_query_enabled: Whether user want to extract lineage from native query
     :return: True or False.
     """
-    function_names = [fun.value for fun in resolver.FunctionName]
+    function_names = [
+        fun.value
+        for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName
+    ]
     if not any(fun in expression for fun in function_names):
         return False, "DataAccess function is not present in M-Query expression."
 
     if native_query_enabled is False:
-        if resolver.FunctionName.NATIVE_QUERY.value in function_names:
+        if (
+            datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value
+            in function_names
+        ):
             return (
                 False,
                 "Lineage extraction from native query is disabled. Enable native_query_parsing in recipe",

datahub/ingestion/source/powerbi/powerbi.py

@@ -10,6 +10,7 @@ from typing import Iterable, List, Optional, Tuple, Union
 import more_itertools
 
 import datahub.emitter.mce_builder as builder
+import datahub.ingestion.source.powerbi.m_query.data_classes
 import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
@@ -42,12 +43,13 @@ from datahub.ingestion.source.powerbi.config import (
     Constant,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
+    SupportedDataPlatform,
 )
 from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
     AbstractDataPlatformInstanceResolver,
     create_dataplatform_instance_resolver,
 )
-from datahub.ingestion.source.powerbi.m_query import parser, resolver
+from datahub.ingestion.source.powerbi.m_query import parser
 from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -182,7 +184,9 @@ class Mapper:
         return [schema_mcp]
 
     def make_fine_grained_lineage_class(
-        self, lineage: resolver.Lineage, dataset_urn: str
+        self,
+        lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage,
+        dataset_urn: str,
     ) -> List[FineGrainedLineage]:
         fine_grained_lineages: List[FineGrainedLineage] = []
 
@@ -234,7 +238,9 @@ class Mapper:
         upstream: List[UpstreamClass] = []
         cll_lineage: List[FineGrainedLineage] = []
 
-        upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables(
+        upstream_lineage: List[
+            datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+        ] = parser.get_upstream_tables(
             table=table,
             reporter=self.__reporter,
             platform_instance_resolver=self.__dataplatform_instance_resolver,
@@ -1294,7 +1300,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     def validate_dataset_type_mapping(self):
         powerbi_data_platforms: List[str] = [
             data_platform.value.powerbi_data_platform_name
-            for data_platform in resolver.SupportedDataPlatform
+            for data_platform in SupportedDataPlatform
         ]
 
         for key in self.source_config.dataset_type_mapping.keys():
@@ -1481,7 +1487,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id
-        # This will result in creating checkpoint for each workspace_id
+        # This will result in creating a checkpoint for each workspace_id
         if self.source_config.modified_since:
             return []  # Handle these in get_workunits_internal
         else:
@@ -1492,7 +1498,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
-        Datahub Ingestion framework invoke this method
+        Datahub Ingestion framework invokes this method
         """
         logger.info("PowerBi plugin execution is started")
         # Validate dataset type mapping

datahub/ingestion/source/preset.py

@@ -85,6 +85,7 @@ class PresetSource(SupersetSource):
         super().__init__(ctx, config)
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
+        self.platform = "preset"
 
     def login(self):
         try:

datahub/ingestion/source/pulsar.py

@@ -78,8 +78,27 @@ class PulsarSchema:
     def __init__(self, schema):
         self.schema_version = schema.get("version")
 
-        avro_schema = json.loads(schema.get("data"))
-        self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
+        schema_data = schema.get("data")
+        if not schema_data:
+            logger.warning("Schema data is empty or None. Using default empty schema.")
+            schema_data = "{}"
+
+        try:
+            avro_schema = json.loads(schema_data)
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+            avro_schema = {}
+
+        self.schema_name = "null"
+        if avro_schema.get("namespace") and avro_schema.get("name"):
+            self.schema_name = (
+                avro_schema.get("namespace") + "." + avro_schema.get("name")
+            )
+        elif avro_schema.get("namespace"):
+            self.schema_name = avro_schema.get("namespace")
+        elif avro_schema.get("name"):
+            self.schema_name = avro_schema.get("name")
+
         self.schema_description = avro_schema.get("doc")
         self.schema_type = schema.get("type")
         self.schema_str = schema.get("data")

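A standalone sketch of the new fallback behaviour when deriving schema_name from a Pulsar schema payload (not part of the diff; the payload and names below are made up for illustration):

    import json
    import logging

    logger = logging.getLogger(__name__)

    # Hypothetical schema payload with a name but no namespace and valid JSON data.
    schema = {"version": 1, "type": "AVRO", "data": '{"name": "User", "doc": "user records"}'}

    schema_data = schema.get("data") or "{}"
    try:
        avro_schema = json.loads(schema_data)
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON schema: {schema_data}. Error: {e}")
        avro_schema = {}

    # Mirror of the new fallback chain: namespace.name, then namespace, then name, else "null".
    schema_name = "null"
    if avro_schema.get("namespace") and avro_schema.get("name"):
        schema_name = avro_schema["namespace"] + "." + avro_schema["name"]
    elif avro_schema.get("namespace"):
        schema_name = avro_schema["namespace"]
    elif avro_schema.get("name"):
        schema_name = avro_schema["name"]

    print(schema_name)  # -> "User"
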
datahub/ingestion/source/qlik_sense/data_classes.py

@@ -15,6 +15,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 
+# TODO: Replace with standardized types in sql_types.py
 FIELD_TYPE_MAPPING: Dict[
     str,
     Type[

datahub/ingestion/source/redash.py

@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set, Type
+from typing import Dict, Iterable, List, Optional, Set
 
 import dateutil.parser as dp
 from packaging import version
@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.registry import import_path
 from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
     ChartTypeClass,
     DashboardInfoClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
-from datahub.utilities.sql_parser_base import SQLParser
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
     parse_table_names_from_sql: bool = Field(
         default=False, description="See note below."
     )
-    sql_parser: str = Field(
-        default="datahub.utilities.sql_parser.DefaultSQLParser",
-        description="custom SQL parser. See note below for details.",
-    )
 
     env: str = Field(
         default=DEFAULT_ENV,
@@ -354,7 +349,6 @@ class RedashSource(Source):
         self.api_page_limit = self.config.api_page_limit or math.inf
 
         self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
-        self.sql_parser_path = self.config.sql_parser
 
         logger.info(
             f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"
@@ -380,31 +374,6 @@ class RedashSource(Source):
         config = RedashConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
-    @classmethod
-    def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
-        assert "." in sql_parser_path, "sql_parser-path must contain a ."
-        parser_cls = import_path(sql_parser_path)
-
-        if not issubclass(parser_cls, SQLParser):
-            raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
-        return parser_cls
-
-    @classmethod
-    def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
-        parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
-        try:
-            sql_table_names: List[str] = parser_cls(sql).get_tables()
-        except Exception as e:
-            logger.warning(f"Sql parser failed on {sql} with {e}")
-            return []
-
-        # Remove quotes from table names
-        sql_table_names = [t.replace('"', "") for t in sql_table_names]
-        sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
-        return sql_table_names
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()
@@ -441,14 +410,6 @@ class RedashSource(Source):
 
         return database_name
 
-    def _construct_datalineage_urn(
-        self, platform: str, database_name: str, sql_table_name: str
-    ) -> str:
-        full_dataset_name = get_full_qualified_name(
-            platform, database_name, sql_table_name
-        )
-        return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
     def _get_datasource_urns(
         self, data_source: Dict, sql_query_data: Dict = {}
     ) -> Optional[List[str]]:
@@ -464,34 +425,23 @@ class RedashSource(Source):
         # Getting table lineage from SQL parsing
         if self.parse_table_names_from_sql and data_source_syntax == "sql":
             dataset_urns = list()
-            try:
-                sql_table_names = self._get_sql_table_names(
-                    query, self.sql_parser_path
-                )
-            except Exception as e:
+            sql_parser_in_tables = create_lineage_sql_parsed_result(
+                query=query,
+                platform=platform,
+                env=self.config.env,
+                platform_instance=None,
+                default_db=database_name,
+            )
+            # make sure dataset_urns is not empty list
+            dataset_urns = sql_parser_in_tables.in_tables
+            if sql_parser_in_tables.debug_info.table_error:
                 self.report.queries_problem_parsing.add(str(query_id))
                 self.error(
                     logger,
                     "sql-parsing",
-                    f"exception {e} in parsing query-{query_id}-datasource-{data_source_id}",
+                    f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
                 )
-                sql_table_names = []
-            for sql_table_name in sql_table_names:
-                try:
-                    dataset_urns.append(
-                        self._construct_datalineage_urn(
-                            platform, database_name, sql_table_name
-                        )
-                    )
-                except Exception:
-                    self.report.queries_problem_parsing.add(str(query_id))
-                    self.warn(
-                        logger,
-                        "data-urn-invalid",
-                        f"Problem making URN for {sql_table_name} parsed from query {query_id}",
-                    )
 
-            # make sure dataset_urns is not empty list
             return dataset_urns if len(dataset_urns) > 0 else None
 
         else:

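The Redash source now delegates table extraction to the sqlglot-based parser rather than the removed pluggable sql_parser config. A minimal sketch of the call it now makes, using the keyword arguments and result fields shown in the diff; the query, platform, and database values are placeholders:

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="SELECT id, amount FROM sales.orders",  # placeholder query
        platform="postgres",                          # placeholder platform
        env="PROD",
        platform_instance=None,
        default_db="analytics",                       # placeholder database
    )
    if result.debug_info.table_error:
        print(f"parse failed: {result.debug_info.table_error}")
    else:
        print(result.in_tables)  # upstream dataset URNs derived from the query
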
datahub/ingestion/source/redshift/config.py

@@ -159,6 +159,7 @@ class RedshiftConfig(
         description="Whether to extract column level lineage. This config works with rest-sink only.",
     )
 
+    # TODO - use DatasetPropertiesConfigMixin instead
     patch_custom_properties: bool = Field(
         default=True,
         description="Whether to patch custom properties on existing datasets rather than replace.",

datahub/ingestion/source/redshift/redshift.py

@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     ```
     """
 
+    # TODO: Replace with standardized types in sql_types.py
     REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
         str,
         Type[
@@ -830,6 +831,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             customProperties=custom_properties,
         )
         if self.config.patch_custom_properties:
+            # TODO: use auto_incremental_properties workunit processor instead
+            # Deprecate use of patch_custom_properties
             patch_builder = create_dataset_props_patch_builder(
                 dataset_urn, dataset_properties
             )

datahub/ingestion/source/s3/source.py

@@ -9,6 +9,7 @@ from datetime import datetime
 from itertools import groupby
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
         folders = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
-            prefix_to_process = dir.rstrip("\\").lstrip(
-                self.create_s3_path(bucket_name, "/")
-            )
+            prefix_to_process = urlparse(dir).path.lstrip("/")
 
             folders.extend(
                 self.get_folder_info(

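The prefix computation is the interesting change here: str.lstrip strips a set of characters rather than a literal prefix, so the old create_s3_path-based call could mangle folder names. urlparse keeps only the key portion of the URI. A quick illustration with a placeholder bucket name:

    from urllib.parse import urlparse

    # The path component of an S3 URI, minus the leading slash, is the object key prefix.
    print(urlparse("s3://my-bucket/logs/2024/").path.lstrip("/"))  # -> "logs/2024/"
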
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Set, cast
+from typing import Dict, List, Optional, Set
 
 import pydantic
 from pydantic import Field, SecretStr, root_validator, validator
@@ -16,6 +16,9 @@ from datahub.configuration.source_common import (
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.incremental_properties_helper import (
+    IncrementalPropertiesConfigMixin,
+)
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
@@ -118,9 +121,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         )
 
         # Always exclude reporting metadata for INFORMATION_SCHEMA schema
-        if schema_pattern is not None and schema_pattern:
+        if schema_pattern:
             logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
-            cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$")
+            assert isinstance(schema_pattern, AllowDenyPattern)
+            schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
         return values
 
@@ -187,6 +191,7 @@ class SnowflakeV2Config(
     StatefulUsageConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
+    IncrementalPropertiesConfigMixin,
 ):
     include_usage_stats: bool = Field(
         default=True,

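The effect of the always-applied INFORMATION_SCHEMA deny rule can be sketched with AllowDenyPattern directly; the database and schema names below are placeholders, and this snippet is illustrative rather than part of the source above:

    from datahub.configuration.common import AllowDenyPattern

    # The validator appends this deny regex to schema_pattern so that
    # INFORMATION_SCHEMA is always skipped during ingestion.
    pattern = AllowDenyPattern(deny=[r".*INFORMATION_SCHEMA$"])
    print(pattern.allowed("MY_DB.INFORMATION_SCHEMA"))  # -> False
    print(pattern.allowed("MY_DB.PUBLIC"))              # -> True
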
datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
     "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
     "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+    "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
 _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Connect args to pass to Snowflake SqlAlchemy driver",
         exclude=True,
     )
+    token: Optional[str] = pydantic.Field(
+        default=None,
+        description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
         logger.info(f"using authenticator type '{v}'")
         return v
 
+    @pydantic.validator("token", always=True)
+    def validate_token_oauth_config(cls, v, values):
+        auth_type = values.get("authentication_type")
+        if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not v:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+        elif v is not None:
+            raise ValueError(
+                "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+            )
+        return v
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
                 application=_APPLICATION_NAME,
                 **connect_args,
             )
+        elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            return snowflake.connector.connect(
+                user=self.username,
+                account=self.account_id,
+                authenticator="oauth",
+                token=self.token,  # Token generated externally and provided directly to the recipe
+                warehouse=self.warehouse,
+                role=self.role,
+                application=_APPLICATION_NAME,
+                **connect_args,
+            )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
            return self.get_oauth_connection()
         elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":

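Under the hood, the new OAUTH_AUTHENTICATOR_TOKEN branch simply hands an externally generated token to the Snowflake connector. A rough sketch of the equivalent direct call, with placeholder credentials (not a working configuration):

    import snowflake.connector

    # Equivalent of what the new branch does: authenticator="oauth" plus a
    # pre-generated token; all values below are placeholders.
    conn = snowflake.connector.connect(
        user="INGESTION_USER",
        account="my_account",
        authenticator="oauth",
        token="<oauth-token-from-external-idp>",
        warehouse="COMPUTE_WH",
        role="DATAHUB_ROLE",
    )
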
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -413,9 +413,14 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             return UpstreamLineageEdge.parse_obj(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
+            upstream_tables = db_row.get("UPSTREAM_TABLES")
+            downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-                context=db_row.get("DOWNSTREAM_TABLE_NAME") or None,
+                # Tricky: sometimes the full row data is too large, and so the context
+                # message gets truncated. By pulling out the upstreams and downstream
+                # list, we can at least get the important fields if truncation does occur.
+                context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
                 exc=e,
             )
             return None

datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,10 +129,12 @@ class SnowflakeQuery:
           row_count AS "ROW_COUNT",
           bytes AS "BYTES",
           clustering_key AS "CLUSTERING_KEY",
-          auto_clustering_on AS "AUTO_CLUSTERING_ON"
+          auto_clustering_on AS "AUTO_CLUSTERING_ON",
+          is_dynamic AS "IS_DYNAMIC",
+          is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -149,10 +151,12 @@ class SnowflakeQuery:
           row_count AS "ROW_COUNT",
           bytes AS "BYTES",
           clustering_key AS "CLUSTERING_KEY",
-          auto_clustering_on AS "AUTO_CLUSTERING_ON"
+          auto_clustering_on AS "AUTO_CLUSTERING_ON",
+          is_dynamic AS "IS_DYNAMIC",
+          is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -233,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """
 
+    @staticmethod
+    def get_secure_view_definitions() -> str:
+        # https://docs.snowflake.com/en/sql-reference/account-usage/views
+        return """
+        SELECT
+          TABLE_CATALOG as "TABLE_CATALOG",
+          TABLE_SCHEMA as "TABLE_SCHEMA",
+          TABLE_NAME as "TABLE_NAME",
+          VIEW_DEFINITION as "VIEW_DEFINITION"
+        FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+        WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+        """
+
     @staticmethod
     def columns_for_schema(
         schema_name: str,

datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass
@@ -259,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
             snowflake_schemas.append(snowflake_schema)
         return snowflake_schemas
 
+    @serialized_lru_cache(maxsize=1)
+    def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+        secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+            lambda: defaultdict(lambda: defaultdict())
+        )
+        cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+        for view in cur:
+            db_name = view["TABLE_CATALOG"]
+            schema_name = view["TABLE_SCHEMA"]
+            view_name = view["TABLE_NAME"]
+            secure_view_definitions[db_name][schema_name][view_name] = view[
+                "VIEW_DEFINITION"
+            ]
+
+        return secure_view_definitions
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
@@ -289,6 +312,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -313,6 +338,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
            )
         return tables
@@ -356,6 +383,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     materialized=(
                         view.get("is_materialized", "false").lower() == "true"
                     ),
+                    is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )

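A small standalone sketch of how the new information_schema columns map onto the table flags; the row dict below stands in for a query result, and the HYBRID TABLE check mirrors the new is_hybrid property:

    # Made-up result row from the updated information_schema.tables query.
    row = {"TABLE_TYPE": "HYBRID TABLE", "IS_DYNAMIC": "NO", "IS_ICEBERG": "YES"}

    is_dynamic = row.get("IS_DYNAMIC", "NO").upper() == "YES"   # False
    is_iceberg = row.get("IS_ICEBERG", "NO").upper() == "YES"   # True
    is_hybrid = row.get("TABLE_TYPE") == "HYBRID TABLE"         # True, mirrors SnowflakeTable.is_hybrid
    print(is_dynamic, is_iceberg, is_hybrid)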