acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
  "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
  "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
  "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+ "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
  }

  _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
  description="Connect args to pass to Snowflake SqlAlchemy driver",
  exclude=True,
  )
+ token: Optional[str] = pydantic.Field(
+ default=None,
+ description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+ )

  def get_account(self) -> str:
  assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
  logger.info(f"using authenticator type '{v}'")
  return v

+ @pydantic.validator("token", always=True)
+ def validate_token_oauth_config(cls, v, values):
+ auth_type = values.get("authentication_type")
+ if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+ if not v:
+ raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+ elif v is not None:
+ raise ValueError(
+ "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+ )
+ return v
+
  @staticmethod
  def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
  if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
  application=_APPLICATION_NAME,
  **connect_args,
  )
+ elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+ return snowflake.connector.connect(
+ user=self.username,
+ account=self.account_id,
+ authenticator="oauth",
+ token=self.token, # Token generated externally and provided directly to the recipe
+ warehouse=self.warehouse,
+ role=self.role,
+ application=_APPLICATION_NAME,
+ **connect_args,
+ )
  elif self.authentication_type == "OAUTH_AUTHENTICATOR":
  return self.get_oauth_connection()
  elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":
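
The hunks above add an OAUTH_AUTHENTICATOR_TOKEN option that lets a recipe supply a pre-issued OAuth token instead of having the connector drive the OAuth flow itself. A minimal sketch of how such a config might be built, assuming the field names visible in the diff (account_id, username, authentication_type, token) and hypothetical connection values; other required fields or validators may apply in practice:

from datahub.ingestion.source.snowflake.snowflake_connection import (
    SnowflakeConnectionConfig,
)

# Hypothetical values; per the field description above, the token is minted by
# an external identity provider and cannot be refreshed once it expires.
config = SnowflakeConnectionConfig.parse_obj(
    {
        "account_id": "my_account",
        "username": "ingestion_user",
        "warehouse": "COMPUTE_WH",
        "role": "DATAHUB_ROLE",
        "authentication_type": "OAUTH_AUTHENTICATOR_TOKEN",
        "token": "<externally-generated-oauth-token>",
    }
)

# The new validate_token_oauth_config validator enforces the pairing: omitting
# the token with this authenticator, or supplying one with any other
# authenticator, fails validation.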

datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -413,9 +413,14 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
  return UpstreamLineageEdge.parse_obj(db_row)
  except Exception as e:
  self.report.num_upstream_lineage_edge_parsing_failed += 1
+ upstream_tables = db_row.get("UPSTREAM_TABLES")
+ downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
  self.structured_reporter.warning(
  "Failed to parse lineage edge",
- context=db_row.get("DOWNSTREAM_TABLE_NAME") or None,
+ # Tricky: sometimes the full row data is too large, and so the context
+ # message gets truncated. By pulling out the upstreams and downstream
+ # list, we can at least get the important fields if truncation does occur.
+ context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
  exc=e,
  )
  return None

datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,10 +129,12 @@ class SnowflakeQuery:
  row_count AS "ROW_COUNT",
  bytes AS "BYTES",
  clustering_key AS "CLUSTERING_KEY",
- auto_clustering_on AS "AUTO_CLUSTERING_ON"
+ auto_clustering_on AS "AUTO_CLUSTERING_ON",
+ is_dynamic AS "IS_DYNAMIC",
+ is_iceberg AS "IS_ICEBERG"
  FROM {db_clause}information_schema.tables t
  WHERE table_schema != 'INFORMATION_SCHEMA'
- and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+ and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
  order by table_schema, table_name"""

  @staticmethod
@@ -149,10 +151,12 @@
  row_count AS "ROW_COUNT",
  bytes AS "BYTES",
  clustering_key AS "CLUSTERING_KEY",
- auto_clustering_on AS "AUTO_CLUSTERING_ON"
+ auto_clustering_on AS "AUTO_CLUSTERING_ON",
+ is_dynamic AS "IS_DYNAMIC",
+ is_iceberg AS "IS_ICEBERG"
  FROM {db_clause}information_schema.tables t
  where table_schema='{schema_name}'
- and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+ and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
  order by table_schema, table_name"""

  @staticmethod
@@ -233,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
  LIMIT {limit} {from_clause};
  """

+ @staticmethod
+ def get_secure_view_definitions() -> str:
+ # https://docs.snowflake.com/en/sql-reference/account-usage/views
+ return """
+ SELECT
+ TABLE_CATALOG as "TABLE_CATALOG",
+ TABLE_SCHEMA as "TABLE_SCHEMA",
+ TABLE_NAME as "TABLE_NAME",
+ VIEW_DEFINITION as "VIEW_DEFINITION"
+ FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+ WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+ """
+
  @staticmethod
  def columns_for_schema(
  schema_name: str,

datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
  external_lineage_queries_secs: float = -1
  num_tables_with_known_upstreams: int = 0
  num_upstream_lineage_edge_parsing_failed: int = 0
+ num_secure_views_missing_definition: int = 0

  data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None


datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
  foreign_keys: List[SnowflakeFK] = field(default_factory=list)
  tags: Optional[List[SnowflakeTag]] = None
  column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+ is_dynamic: bool = False
+ is_iceberg: bool = False
+
+ @property
+ def is_hybrid(self) -> bool:
+ return self.type is not None and self.type == "HYBRID TABLE"


  @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
  columns: List[SnowflakeColumn] = field(default_factory=list)
  tags: Optional[List[SnowflakeTag]] = None
  column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+ is_secure: bool = False


  @dataclass
@@ -259,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
  snowflake_schemas.append(snowflake_schema)
  return snowflake_schemas

+ @serialized_lru_cache(maxsize=1)
+ def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+ secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+ lambda: defaultdict(lambda: defaultdict())
+ )
+ cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+ for view in cur:
+ db_name = view["TABLE_CATALOG"]
+ schema_name = view["TABLE_SCHEMA"]
+ view_name = view["TABLE_NAME"]
+ secure_view_definitions[db_name][schema_name][view_name] = view[
+ "VIEW_DEFINITION"
+ ]
+
+ return secure_view_definitions
+
  @serialized_lru_cache(maxsize=1)
  def get_tables_for_database(
  self, db_name: str
@@ -289,6 +312,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
  rows_count=table["ROW_COUNT"],
  comment=table["COMMENT"],
  clustering_key=table["CLUSTERING_KEY"],
+ is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+ is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
  )
  )
  return tables
@@ -313,6 +338,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
  rows_count=table["ROW_COUNT"],
  comment=table["COMMENT"],
  clustering_key=table["CLUSTERING_KEY"],
+ is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+ is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
  )
  )
  return tables
@@ -356,6 +383,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
  materialized=(
  view.get("is_materialized", "false").lower() == "true"
  ),
+ is_secure=(view.get("is_secure", "false").lower() == "true"),
  )
  )


datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecuto
  logger = logging.getLogger(__name__)

  # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+ # TODO: Move to the standardized types in sql_types.py
  SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
  "DATE": DateType,
  "BIGINT": NumberType,
@@ -423,6 +424,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  view_identifier = self.identifiers.get_dataset_identifier(
  view.name, schema_name, db_name
  )
+ if view.is_secure and not view.view_definition:
+ view.view_definition = self.fetch_secure_view_definition(
+ view.name, schema_name, db_name
+ )
  if view.view_definition:
  self.aggregator.add_view_definition(
  view_urn=self.identifiers.gen_dataset_urn(view_identifier),
@@ -430,6 +435,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  default_db=db_name,
  default_schema=schema_name,
  )
+ elif view.is_secure:
+ self.report.num_secure_views_missing_definition += 1

  if self.config.include_technical_schema:
  for view in views:
@@ -446,6 +453,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  context=f"{db_name}.{schema_name}",
  )

+ def fetch_secure_view_definition(
+ self, table_name: str, schema_name: str, db_name: str
+ ) -> Optional[str]:
+ try:
+ view_definitions = self.data_dictionary.get_secure_view_definitions()
+ return view_definitions[db_name][schema_name][table_name]
+ except Exception as e:
+ if isinstance(e, SnowflakePermissionError):
+ error_msg = (
+ "Failed to get secure views definitions. Please check permissions."
+ )
+ else:
+ error_msg = "Failed to get secure views definitions"
+ self.structured_reporter.warning(
+ error_msg,
+ exc=e,
+ )
+ return None
+
  def fetch_views_for_schema(
  self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
  ) -> List[SnowflakeView]:
@@ -748,8 +774,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  ) -> DatasetProperties:
  custom_properties = {}

- if isinstance(table, SnowflakeTable) and table.clustering_key:
- custom_properties["CLUSTERING_KEY"] = table.clustering_key
+ if isinstance(table, SnowflakeTable):
+ if table.clustering_key:
+ custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+ if table.is_hybrid:
+ custom_properties["IS_HYBRID"] = "true"
+
+ if table.is_dynamic:
+ custom_properties["IS_DYNAMIC"] = "true"
+
+ if table.is_iceberg:
+ custom_properties["IS_ICEBERG"] = "true"
+
+ if isinstance(table, SnowflakeView) and table.is_secure:
+ custom_properties["IS_SECURE"] = "true"

  return DatasetProperties(
  name=table.name,

datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -1,6 +1,6 @@
  import abc
  from functools import cached_property
- from typing import ClassVar, Literal, Optional, Tuple
+ from typing import ClassVar, List, Literal, Optional, Tuple

  from datahub.configuration.pattern_utils import is_schema_allowed
  from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -119,7 +119,6 @@ class SnowflakeFilter:
  ) -> bool:
  if not dataset_type or not dataset_name:
  return True
- dataset_params = dataset_name.split(".")
  if dataset_type.lower() not in (
  SnowflakeObjectDomain.TABLE,
  SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ class SnowflakeFilter:
  if _is_sys_table(dataset_name):
  return False

+ dataset_params = _split_qualified_name(dataset_name)
  if len(dataset_params) != 3:
  self.structured_reporter.info(
  title="Unexpected dataset pattern",
@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
  return table_name.lower().startswith("sys$")


+ def _split_qualified_name(qualified_name: str) -> List[str]:
+ """
+ Split a qualified name into its constituent parts.
+
+ >>> _split_qualified_name("db.my_schema.my_table")
+ ['db', 'my_schema', 'my_table']
+ >>> _split_qualified_name('"db"."my_schema"."my_table"')
+ ['db', 'my_schema', 'my_table']
+ >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+ ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+ >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+ ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+ """
+
+ # Fast path - no quotes.
+ if '"' not in qualified_name:
+ return qualified_name.split(".")
+
+ # First pass - split on dots that are not inside quotes.
+ in_quote = False
+ parts: List[List[str]] = [[]]
+ for char in qualified_name:
+ if char == '"':
+ in_quote = not in_quote
+ elif char == "." and not in_quote:
+ parts.append([])
+ else:
+ parts[-1].append(char)
+
+ # Second pass - remove outer pairs of quotes.
+ result = []
+ for part in parts:
+ if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+ part = part[1:-1]
+
+ result.append("".join(part))
+
+ return result
+
+
  # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
  # For example "test-database"."test-schema".test_table
  # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
  def _cleanup_qualified_name(
  qualified_name: str, structured_reporter: SourceReport
  ) -> str:
- name_parts = qualified_name.split(".")
+ name_parts = _split_qualified_name(qualified_name)
  if len(name_parts) != 3:
  if not _is_sys_table(qualified_name):
  structured_reporter.info(
@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
  )
  return qualified_name.replace('"', "")
  return _combine_identifier_parts(
- db_name=name_parts[0].strip('"'),
- schema_name=name_parts[1].strip('"'),
- table_name=name_parts[2].strip('"'),
+ db_name=name_parts[0],
+ schema_name=name_parts[1],
+ table_name=name_parts[2],
  )


datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -17,6 +17,9 @@ from datahub.ingestion.api.decorators import (
  support_status,
  )
  from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
+ from datahub.ingestion.api.incremental_properties_helper import (
+ auto_incremental_properties,
+ )
  from datahub.ingestion.api.source import (
  CapabilityReport,
  MetadataWorkUnitProcessor,
@@ -446,6 +449,9 @@ class SnowflakeV2Source(
  functools.partial(
  auto_incremental_lineage, self.config.incremental_lineage
  ),
+ functools.partial(
+ auto_incremental_properties, self.config.incremental_properties
+ ),
  StaleEntityRemovalHandler.create(
  self, self.config, self.ctx
  ).workunit_processor,
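
These hunks register the new auto_incremental_properties workunit processor (from the new datahub/ingestion/api/incremental_properties_helper.py listed above), gated by a config.incremental_properties setting. A hedged sketch of a recipe that might enable it, assuming the flag is a boolean toggle alongside incremental_lineage; the exact semantics and defaults are not shown in this diff:

# Hypothetical recipe, expressed as a Python dict.
recipe = {
    "source": {
        "type": "snowflake",
        "config": {
            "account_id": "my_account",
            "username": "ingestion_user",
            "incremental_lineage": True,
            # Assumed boolean flag backing self.config.incremental_properties.
            "incremental_properties": True,
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},
    },
}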

datahub/ingestion/source/sql/athena.py

@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
+ from datahub.ingestion.api.source import StructuredLogLevel
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.s3_util import make_s3_urn
  from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
  register_custom_type,
  )
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
  from datahub.ingestion.source.sql.sql_utils import (
  add_table_to_schema_container,
  gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
  get_schema_fields_for_sqlalchemy_column,
  )

+ try:
+ from typing_extensions import override
+ except ImportError:
+ _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+ def override(f: _F, /) -> _F: # noqa: F811
+ return f
+
+
  logger = logging.getLogger(__name__)

  assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
  - Profiling when enabled.
  """

- table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+ config: AthenaConfig
+ report: SQLSourceReport

  def __init__(self, config, ctx):
  super().__init__(config, ctx, "athena")
  self.cursor: Optional[BaseCursor] = None

+ self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
  @classmethod
  def create(cls, config_dict, ctx):
  config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
  )

  # It seems like database/schema filter in the connection string does not work and this to work around that
+ @override
  def get_schema_names(self, inspector: Inspector) -> List[str]:
  athena_config = typing.cast(AthenaConfig, self.config)
  schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
  return [schema for schema in schemas if schema == athena_config.database]
  return schemas

- # Overwrite to get partitions
+ @classmethod
+ def _casted_partition_key(cls, key: str) -> str:
+ # We need to cast the partition keys to a VARCHAR, since otherwise
+ # Athena may throw an error during concatenation / comparison.
+ return f"CAST({key} as VARCHAR)"
+
+ @override
  def get_partitions(
  self, inspector: Inspector, schema: str, table: str
- ) -> List[str]:
- partitions = []
-
- athena_config = typing.cast(AthenaConfig, self.config)
-
- if not athena_config.extract_partitions:
- return []
+ ) -> Optional[List[str]]:
+ if not self.config.extract_partitions:
+ return None

  if not self.cursor:
- return []
+ return None

  metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
  table_name=table, schema_name=schema
  )

- if metadata.partition_keys:
- for key in metadata.partition_keys:
- if key.name:
- partitions.append(key.name)
-
- if not partitions:
- return []
+ partitions = []
+ for key in metadata.partition_keys:
+ if key.name:
+ partitions.append(key.name)
+ if not partitions:
+ return []

- # We create an artiificaial concatenated partition key to be able to query max partition easier
- part_concat = "|| '-' ||".join(partitions)
+ with self.report.report_exc(
+ message="Failed to extract partition details",
+ context=f"{schema}.{table}",
+ level=StructuredLogLevel.WARN,
+ ):
+ # We create an artifical concatenated partition key to be able to query max partition easier
+ part_concat = " || '-' || ".join(
+ self._casted_partition_key(key) for key in partitions
+ )
  max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
  ret = self.cursor.execute(max_partition_query)
  max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
  partitions=partitions,
  max_partition=max_partition,
  )
- return partitions

- return []
+ return partitions

  # Overwrite to modify the creation of schema fields
  def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
  if partition and partition.max_partition:
  max_partition_filters = []
  for key, value in partition.max_partition.items():
- max_partition_filters.append(f"CAST({key} as VARCHAR) = '{value}'")
+ max_partition_filters.append(
+ f"{self._casted_partition_key(key)} = '{value}'"
+ )
  max_partition = str(partition.max_partition)
  return (
  max_partition,
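
The reworked get_partitions builds one concatenated partition key, casting each column to VARCHAR via the new _casted_partition_key helper, and then queries the "<table>$partitions" metadata table for its maximum value. A small, self-contained illustration of the SQL this construction yields, using hypothetical schema/table/partition names:

# Mirrors the query construction shown in the diff above; names are hypothetical.
partitions = ["year", "month"]
schema, table = "web_logs", "requests"


def casted_partition_key(key: str) -> str:
    # Same casting rule as AthenaSource._casted_partition_key.
    return f"CAST({key} as VARCHAR)"


part_concat = " || '-' || ".join(casted_partition_key(key) for key in partitions)
max_partition_query = (
    f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
    f"where {part_concat} = "
    f'(select max({part_concat}) from "{schema}"."{table}$partitions")'
)
print(max_partition_query)
# select year,month from "web_logs"."requests$partitions" where CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR) = (select max(CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)) from "web_logs"."requests$partitions")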

datahub/ingestion/source/sql/mssql/source.py

@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

  import pydantic
  import sqlalchemy.dialects.mssql
-
- # This import verifies that the dependencies are available.
  from pydantic.fields import Field
  from sqlalchemy import create_engine, inspect
  from sqlalchemy.engine.base import Connection
@@ -50,6 +48,7 @@ from datahub.ingestion.source.sql.sql_config import (
  BasicSQLAlchemyConfig,
  make_sqlalchemy_uri,
  )
+ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
  from datahub.metadata.schema_classes import (
  BooleanTypeClass,
  NumberTypeClass,
@@ -78,6 +77,11 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
  include_stored_procedures_code: bool = Field(
  default=True, description="Include information about object code."
  )
+ procedure_pattern: AllowDenyPattern = Field(
+ default=AllowDenyPattern.allow_all(),
+ description="Regex patterns for stored procedures to filter in ingestion."
+ "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+ )
  include_jobs: bool = Field(
  default=True,
  description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.",
@@ -164,6 +168,8 @@ class SQLServerSource(SQLAlchemySource):
  If you do use pyodbc, make sure to change the source type from `mssql` to `mssql-odbc` so that we pull in the right set of dependencies. This will be needed in most cases where encryption is required, such as managed SQL Server services in Azure.
  """

+ report: SQLSourceReport
+
  def __init__(self, config: SQLServerConfig, ctx: PipelineContext):
  super().__init__(config, ctx, "mssql")
  # Cache the table and column descriptions
@@ -416,10 +422,16 @@ class SQLServerSource(SQLAlchemySource):
  data_flow = MSSQLDataFlow(entity=mssql_default_job)
  with inspector.engine.connect() as conn:
  procedures_data_list = self._get_stored_procedures(conn, db_name, schema)
- procedures = [
- StoredProcedure(flow=mssql_default_job, **procedure_data)
- for procedure_data in procedures_data_list
- ]
+ procedures: List[StoredProcedure] = []
+ for procedure_data in procedures_data_list:
+ procedure_full_name = f"{db_name}.{schema}.{procedure_data['name']}"
+ if not self.config.procedure_pattern.allowed(procedure_full_name):
+ self.report.report_dropped(procedure_full_name)
+ continue
+ procedures.append(
+ StoredProcedure(flow=mssql_default_job, **procedure_data)
+ )
+
  if procedures:
  yield from self.construct_flow_workunits(data_flow=data_flow)
  for procedure in procedures:
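
The new procedure_pattern filter matches stored procedures against their fully qualified database.schema.procedure_name using the existing AllowDenyPattern config type. A short sketch of how the matching behaves, using the example regex from the field description and hypothetical procedure names:

from datahub.configuration.common import AllowDenyPattern

# Keep only procedures starting with "customer" in the Customer database's
# public schema (the example given in the field description above).
pattern = AllowDenyPattern(allow=["Customer.public.customer.*"])

print(pattern.allowed("Customer.public.customer_refresh"))  # True -> ingested
print(pattern.allowed("Customer.public.orders_cleanup"))    # False -> reported as dropped
print(pattern.allowed("Sales.public.customer_refresh"))     # False -> reported as dropped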