acryl-datahub 0.15.0rc16__py3-none-any.whl → 0.15.0rc18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (34)
  1. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/METADATA +2319 -2319
  2. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/RECORD +34 -32
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/structuredproperties/structuredproperties.py +7 -5
  5. datahub/cli/delete_cli.py +66 -20
  6. datahub/configuration/common.py +3 -3
  7. datahub/ingestion/api/source.py +5 -1
  8. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  9. datahub/ingestion/run/pipeline.py +1 -1
  10. datahub/ingestion/run/pipeline_config.py +6 -0
  11. datahub/ingestion/source/kafka/kafka.py +18 -11
  12. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  13. datahub/ingestion/source/looker/view_upstream.py +65 -30
  14. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  15. datahub/ingestion/source/snowflake/snowflake_query.py +6 -2
  16. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  17. datahub/ingestion/source/snowflake/snowflake_schema.py +12 -0
  18. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +17 -2
  19. datahub/ingestion/source/snowflake/snowflake_utils.py +45 -5
  20. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  21. datahub/ingestion/source/tableau/tableau.py +35 -16
  22. datahub/ingestion/source/tableau/tableau_common.py +0 -1
  23. datahub/metadata/_schema_classes.py +122 -2
  24. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  25. datahub/metadata/schema.avsc +73 -1
  26. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  27. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  28. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  29. datahub/sql_parsing/schema_resolver.py +23 -0
  30. datahub/sql_parsing/sqlglot_lineage.py +48 -13
  31. datahub/testing/doctest.py +12 -0
  32. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/WHEEL +0 -0
  33. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/entry_points.txt +0 -0
  34. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/view_upstream.py

@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )
 

@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """
 
     view_context: LookerViewContext

@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case
 
+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+
 
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """

@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs = [
-                ColumnRef(
-                    table=self._get_upstream_dataset_urn()[
-                        0
-                    ],  # 0th index has table of from clause
-                    column=column,
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )
 
         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(

@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs
 
         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []
 
         for column in field_context.column_name_in_sql_attribute():
             if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-                upstream_column_refs.append(
-                    ColumnRef(
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )
 
-        return upstream_column_refs
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()

@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-        upstream_column_ref: List[ColumnRef] = []
-
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]

@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
        if not self._get_upstream_dataset_urn():
             return upstream_column_ref
 
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
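
The net effect of these Looker changes is that column-level lineage no longer trusts the casing used in LookML: the new create_upstream_column_refs helper resolves the upstream dataset's schema from GMS (via create_and_cache_schema_resolver and match_columns_to_schema) and only falls back to lowercasing when no schema is registered. The sketch below illustrates the matching idea only; match_case_insensitive is a hypothetical helper, not the actual match_columns_to_schema implementation.

    from typing import Dict, List

    def match_case_insensitive(schema_columns: Dict[str, str], declared: List[str]) -> List[str]:
        # Map lowercased column names to the exact casing registered in GMS.
        by_lower = {name.lower(): name for name in schema_columns}
        # Keep the registered casing when a match exists; otherwise fall back to
        # lowercase, mirroring the fallback branch in create_upstream_column_refs.
        return [by_lower.get(col.lower(), col.lower()) for col in declared]

    # A LookML view declares "customer_id", but Snowflake registered it as "CUSTOMER_ID".
    print(match_case_insensitive({"CUSTOMER_ID": "NUMBER"}, ["customer_id"]))  # ['CUSTOMER_ID']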
datahub/ingestion/source/metadata/business_glossary.py

@@ -45,6 +45,9 @@ class Owners(ConfigModel):
     groups: Optional[List[str]] = None
 
 
+OwnersMultipleTypes = Union[List[Owners], Owners]
+
+
 class KnowledgeCard(ConfigModel):
     url: Optional[str] = None
     label: Optional[str] = None

@@ -57,7 +60,7 @@ class GlossaryTermConfig(ConfigModel):
     term_source: Optional[str] = None
     source_ref: Optional[str] = None
     source_url: Optional[str] = None
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     inherits: Optional[List[str]] = None
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None

@@ -74,7 +77,7 @@ class GlossaryNodeConfig(ConfigModel):
     id: Optional[str] = None
     name: str
     description: str
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None

@@ -88,7 +91,7 @@ class DefaultConfig(ConfigModel):
     """Holds defaults for populating fields in glossary terms"""
 
     source: Optional[str] = None
-    owners: Owners
+    owners: OwnersMultipleTypes
     url: Optional[str] = None
     source_type: str = "INTERNAL"
 

@@ -153,30 +156,44 @@ def make_glossary_term_urn(
     return "urn:li:glossaryTerm:" + create_id(path, default_id, enable_auto_id)
 
 
-def get_owners(owners: Owners) -> models.OwnershipClass:
-    ownership_type, ownership_type_urn = validate_ownership_type(owners.type)
+def get_owners_multiple_types(owners: OwnersMultipleTypes) -> models.OwnershipClass:
+    """Allows owner types to be a list and maintains backward compatibility"""
+    if isinstance(owners, Owners):
+        return models.OwnershipClass(owners=list(get_owners(owners)))
+
+    owners_meta: List[models.OwnerClass] = []
+    for owner in owners:
+        owners_meta.extend(get_owners(owner))
+
+    return models.OwnershipClass(owners=owners_meta)
+
+
+def get_owners(owners: Owners) -> Iterable[models.OwnerClass]:
+    actual_type = owners.type or models.OwnershipTypeClass.DEVELOPER
+
+    if actual_type.startswith("urn:li:ownershipType:"):
+        ownership_type: str = "CUSTOM"
+        ownership_type_urn: Optional[str] = actual_type
+    else:
+        ownership_type, ownership_type_urn = validate_ownership_type(actual_type)
+
     if owners.typeUrn is not None:
         ownership_type_urn = owners.typeUrn
-    owners_meta: List[models.OwnerClass] = []
+
     if owners.users is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.users:
+            yield models.OwnerClass(
                 owner=make_user_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.users
-        ]
     if owners.groups is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.groups:
+            yield models.OwnerClass(
                 owner=make_group_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.groups
-        ]
-    return models.OwnershipClass(owners=owners_meta)
 
 
 def get_mces(

@@ -185,7 +202,7 @@ def get_mces(
     ingestion_config: BusinessGlossarySourceConfig,
     ctx: PipelineContext,
 ) -> Iterable[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]:
-    root_owners = get_owners(glossary.owners)
+    root_owners = get_owners_multiple_types(glossary.owners)
 
     if glossary.nodes:
         for node in glossary.nodes:

@@ -270,7 +287,7 @@ def get_mces_from_node(
     node_owners = parentOwners
     if glossaryNode.owners is not None:
         assert glossaryNode.owners is not None
-        node_owners = get_owners(glossaryNode.owners)
+        node_owners = get_owners_multiple_types(glossaryNode.owners)
 
     node_snapshot = models.GlossaryNodeSnapshotClass(
         urn=node_urn,

@@ -426,7 +443,7 @@ def get_mces_from_term(
     ownership: models.OwnershipClass = parentOwnership
     if glossaryTerm.owners is not None:
         assert glossaryTerm.owners is not None
-        ownership = get_owners(glossaryTerm.owners)
+        ownership = get_owners_multiple_types(glossaryTerm.owners)
     aspects.append(ownership)
 
     if glossaryTerm.domain is not None:
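
With OwnersMultipleTypes, a glossary node or term can now carry several owner blocks, each with its own ownership type, while a single Owners block keeps working. A rough sketch of how the new helper folds both shapes into one OwnershipClass; the user, group, and custom ownership-type URN below are illustrative values, not defaults from the library.

    from datahub.ingestion.source.metadata.business_glossary import (
        Owners,
        get_owners_multiple_types,
    )

    # A single Owners block still works (backward compatible) ...
    single = Owners(users=["alice"], type="TECHNICAL_OWNER")

    # ... and a list lets one term mix ownership types, including custom ownership-type URNs.
    mixed = [
        Owners(users=["alice"], type="TECHNICAL_OWNER"),
        Owners(groups=["data-platform"], type="urn:li:ownershipType:steward"),
    ]

    ownership = get_owners_multiple_types(mixed)
    print(len(ownership.owners))  # 2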
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,7 +129,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
         and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')

@@ -149,7 +151,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass

@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass

@@ -289,6 +296,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                    rows_count=table["ROW_COUNT"],
                    comment=table["COMMENT"],
                    clustering_key=table["CLUSTERING_KEY"],
+                   is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                   is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables

@@ -313,6 +322,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                    rows_count=table["ROW_COUNT"],
                    comment=table["COMMENT"],
                    clustering_key=table["CLUSTERING_KEY"],
+                   is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                   is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables

@@ -356,6 +367,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                    materialized=(
                        view.get("is_materialized", "false").lower() == "true"
                    ),
+                   is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )
 
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -431,6 +431,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                    default_db=db_name,
                    default_schema=schema_name,
                )
+            elif view.is_secure:
+                self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:

@@ -749,8 +751,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable) and table.clustering_key:
-            custom_properties["CLUSTERING_KEY"] = table.clustering_key
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -1,6 +1,6 @@
 import abc
 from functools import cached_property
-from typing import ClassVar, Literal, Optional, Tuple
+from typing import ClassVar, List, Literal, Optional, Tuple
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
+def _split_qualified_name(qualified_name: str) -> List[str]:
+    """
+    Split a qualified name into its constituent parts.
+
+    >>> _split_qualified_name("db.my_schema.my_table")
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+    """
+
+    # Fast path - no quotes.
+    if '"' not in qualified_name:
+        return qualified_name.split(".")
+
+    # First pass - split on dots that are not inside quotes.
+    in_quote = False
+    parts: List[List[str]] = [[]]
+    for char in qualified_name:
+        if char == '"':
+            in_quote = not in_quote
+        elif char == "." and not in_quote:
+            parts.append([])
+        else:
+            parts[-1].append(char)
+
+    # Second pass - remove outer pairs of quotes.
+    result = []
+    for part in parts:
+        if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+            part = part[1:-1]
+
+        result.append("".join(part))
+
+    return result
+
+
 # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility

@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = qualified_name.split(".")
+    name_parts = _split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(

@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
            )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
-        db_name=name_parts[0].strip('"'),
-        schema_name=name_parts[1].strip('"'),
-        table_name=name_parts[2].strip('"'),
+        db_name=name_parts[0],
+        schema_name=name_parts[1],
+        table_name=name_parts[2],
     )
 
 
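The practical effect of _split_qualified_name shows up when a quoted identifier from the Snowflake audit log itself contains dots: a plain split(".") over-splits the name so it can no longer be mapped to database/schema/table, while the quote-aware split keeps it intact. Illustrative only; the helper is private to snowflake_utils in this release.

    from datahub.ingestion.source.snowflake.snowflake_utils import _split_qualified_name

    name = '"test-database"."test-schema"."TABLE.WITH.DOTS"'
    print(name.split("."))              # ['"test-database"', '"test-schema"', '"TABLE', 'WITH', 'DOTS"']
    print(_split_qualified_name(name))  # ['test-database', 'test-schema', 'TABLE.WITH.DOTS']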
datahub/ingestion/source/state/redundant_run_skip_handler.py

@@ -69,7 +69,7 @@ class RedundantRunSkipHandler(
         platform: Optional[str] = None
         source_class = type(self.source)
         if hasattr(source_class, "get_platform_name"):
-            platform = source_class.get_platform_name()  # type: ignore
+            platform = source_class.get_platform_name()
 
         # Default name for everything else
         job_name_suffix = self.get_job_name_suffix()
datahub/ingestion/source/tableau/tableau.py

@@ -353,7 +353,7 @@ class TableauConfig(
 
     project_path_separator: str = Field(
         default="/",
-        description="The separator used for the project_pattern field between project names. By default, we use a slash. "
+        description="The separator used for the project_path_pattern field between project names. By default, we use a slash. "
         "You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.",
     )
 

@@ -959,19 +959,36 @@ class TableauSiteSource:
         return is_allowed
 
     def _is_denied_project(self, project: TableauProject) -> bool:
-        # Either project name or project path should exist in deny
-        for deny_pattern in self.config.project_pattern.deny:
-            # Either name or project path is denied
-            if re.match(
-                deny_pattern, project.name, self.config.project_pattern.regex_flags
-            ) or re.match(
-                deny_pattern,
-                self._get_project_path(project),
-                self.config.project_pattern.regex_flags,
-            ):
-                return True
-        logger.info(f"project({project.name}) is not denied as per project_pattern")
-        return False
+        """
+        Why use an explicit denial check instead of the `AllowDenyPattern.allowed` method?
+
+        Consider a scenario where a Tableau site contains four projects: A, B, C, and D, with the following hierarchical relationship:
+
+        - **A**
+        - **B** (Child of A)
+        - **C** (Child of A)
+        - **D**
+
+        In this setup:
+
+        - `project_pattern` is configured with `allow: ["A"]` and `deny: ["B"]`.
+        - `extract_project_hierarchy` is set to `True`.
+
+        The goal is to extract assets from project A and its children while explicitly denying the child project B.
+
+        If we rely solely on the `project_pattern.allowed()` method, project C's assets will not be ingested.
+        This happens because project C is not explicitly included in the `allow` list, nor is it part of the `deny` list.
+        However, since `extract_project_hierarchy` is enabled, project C should ideally be included in the ingestion process unless explicitly denied.
+
+        To address this, the function explicitly checks the deny regex to ensure that project C's assets are ingested if it is not specifically denied in the deny list. This approach ensures that the hierarchy is respected while adhering to the configured allow/deny rules.
+        """
+
+        # Either project_pattern or project_path_pattern is set in a recipe
+        # TableauConfig.projects_backward_compatibility ensures that at least one of these properties is configured.
+
+        return self.config.project_pattern.denied(
+            project.name
+        ) or self.config.project_path_pattern.denied(self._get_project_path(project))
 
     def _init_tableau_project_registry(self, all_project_map: dict) -> None:
         list_of_skip_projects: List[TableauProject] = []

@@ -999,9 +1016,11 @@ class TableauSiteSource:
         for project in list_of_skip_projects:
             if (
                 project.parent_id in projects_to_ingest
-                and self._is_denied_project(project) is False
+                and not self._is_denied_project(project)
             ):
-                logger.debug(f"Project {project.name} is added in project registry")
+                logger.debug(
+                    f"Project {project.name} is added in project registry as it's a child project and not explicitly denied in `deny` list"
+                )
                 projects_to_ingest[project.id] = project
 
         # We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
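
The docstring above is the heart of the Tableau change: when extract_project_hierarchy is enabled, a child project should be dropped only if a deny pattern matches it, not merely because it is absent from the allow list. A self-contained sketch of that distinction, using plain re rather than DataHub's AllowDenyPattern and the A/B/C/D example from the docstring:

    import re

    # Simplified stand-in for the allow/deny semantics described above.
    allow_patterns = ["A"]
    deny_patterns = ["B"]

    def is_denied(name: str) -> bool:
        return any(re.match(p, name) for p in deny_patterns)

    def is_allowed(name: str) -> bool:
        return any(re.match(p, name) for p in allow_patterns) and not is_denied(name)

    # C is a child of the allowed project A but matches neither list.
    print(is_allowed("C"))      # False -> an allowed()-style check would skip C
    print(not is_denied("C"))   # True  -> the explicit deny check keeps C in the hierarchy
    print(not is_denied("B"))   # False -> B stays excluded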
datahub/ingestion/source/tableau/tableau_common.py

@@ -979,7 +979,6 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
         len(query_filter.keys()) == 1
         and query_filter.get(c.ID_WITH_IN)
         and isinstance(query_filter[c.ID_WITH_IN], list)
-        and len(query_filter[c.ID_WITH_IN]) > 100 * page_size
     ):
         ids = query_filter[c.ID_WITH_IN]
         filter_pages = [