acryl-datahub 0.15.0rc19__py3-none-any.whl → 0.15.0rc21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/METADATA +2334 -2334
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/RECORD +20 -20
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +56 -68
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/sink/datahub_rest.py +12 -1
- datahub/ingestion/source/dremio/dremio_api.py +193 -86
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +23 -0
- datahub/metadata/_schema_classes.py +400 -400
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/schema.avsc +17221 -17574
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py

@@ -1,6 +1,7 @@
 import concurrent.futures
 import json
 import logging
+import re
 import warnings
 from collections import defaultdict
 from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:

         return self.execute_query(query=jobs_query)

-    def get_source_by_id(self, source_id: str) -> Optional[Dict]:
-        """
-        Fetch source details by ID.
-        """
-        response = self.get(
-            url=f"/source/{source_id}",
-        )
-        return response if response else None
-
-    def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
-        """
-        Get source information for a dataset given its schema and name.
-        """
-        dataset_id = self.get_dataset_id(schema, dataset)
-        if not dataset_id:
-            return None
-
-        catalog_entry = self.get(
-            url=f"/catalog/{dataset_id}",
-        )
-        if not catalog_entry or "path" not in catalog_entry:
-            return None
-
-        source_id = catalog_entry["path"][0]
-        return self.get_source_by_id(source_id)
-
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
         Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
         )
         return None

-    def
-        self,
-
-
+    def _check_pattern_match(
+        self,
+        pattern: str,
+        paths: List[str],
+        allow_prefix: bool = True,
+    ) -> bool:
+        """
+        Helper method to check if a pattern matches any of the paths.
+        Handles hierarchical matching where each level is matched independently.
+        Also handles prefix matching for partial paths.
+        """
+        if pattern == ".*":
+            return True

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Convert the pattern to regex with proper anchoring
+        regex_pattern = pattern
+        if pattern.startswith("^"):
+            # Already has start anchor
+            regex_pattern = pattern.replace(".", r"\.")  # Escape dots
+            regex_pattern = regex_pattern.replace(
+                r"\.*", ".*"
+            )  # Convert .* to wildcard
+        else:
+            # Add start anchor and handle dots
+            regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
+
+        # Handle end matching
+        if not pattern.endswith(".*"):
+            if pattern.endswith("$"):
+                # Keep explicit end anchor
+                pass
+            elif not allow_prefix:
+                # Add end anchor for exact matching
+                regex_pattern = regex_pattern + "$"
+
+        for path in paths:
+            if re.match(regex_pattern, path, re.IGNORECASE):
+                return True

-
-                    if (
-                        container.get("type")
-                        == DremioEntityContainerType.CONTAINER.value
-                    ):
-                        traverse_path(container.get("id"), container.get("path"))
+        return False

-
-
-
-
-
-
-
-                    exc=exc,
-                )
+    def should_include_container(self, path: List[str], name: str) -> bool:
+        """
+        Helper method to check if a container should be included based on schema patterns.
+        Used by both get_all_containers and get_containers_for_location.
+        """
+        path_components = path + [name] if path else [name]
+        full_path = ".".join(path_components)

-
+        # Default allow everything case
+        if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
+            self.report.report_container_scanned(full_path)
+            return True

-
+        # Check deny patterns first
+        if self.deny_schema_pattern:
+            for pattern in self.deny_schema_pattern:
+                if self._check_pattern_match(
+                    pattern=pattern,
+                    paths=[full_path],
+                    allow_prefix=False,
+                ):
+                    self.report.report_container_filtered(full_path)
+                    return False
+
+        # Check allow patterns
+        for pattern in self.allow_schema_pattern:
+            # For patterns with wildcards, check if this path is a parent of the pattern
+            if "*" in pattern:
+                pattern_parts = pattern.split(".")
+                path_parts = path_components
+
+                # If pattern has exact same number of parts, check each component
+                if len(pattern_parts) == len(path_parts):
+                    matches = True
+                    for p_part, c_part in zip(pattern_parts, path_parts):
+                        if p_part != "*" and p_part.lower() != c_part.lower():
+                            matches = False
+                            break
+                    if matches:
+                        self.report.report_container_scanned(full_path)
+                        return True
+                # Otherwise check if current path is prefix match
+                else:
+                    # Remove the trailing wildcard if present
+                    if pattern_parts[-1] == "*":
+                        pattern_parts = pattern_parts[:-1]
+
+                    for i in range(len(path_parts)):
+                        current_path = ".".join(path_parts[: i + 1])
+                        pattern_prefix = ".".join(pattern_parts[: i + 1])
+
+                        if pattern_prefix.startswith(current_path):
+                            self.report.report_container_scanned(full_path)
+                            return True
+
+            # Direct pattern matching
+            if self._check_pattern_match(
+                pattern=pattern,
+                paths=[full_path],
+                allow_prefix=True,
+            ):
+                self.report.report_container_scanned(full_path)
+                return True
+
+        self.report.report_container_filtered(full_path)
+        return False

     def get_all_containers(self):
         """
-        Query the Dremio sources API and return source information.
+        Query the Dremio sources API and return filtered source information.
         """
         containers = []
-
         response = self.get(url="/catalog")

         def process_source(source):
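
The two methods added above drive schema filtering for Dremio containers: allow/deny patterns are converted into anchored, case-insensitive regexes and matched against dotted container paths, with deny patterns requiring a full match and allow patterns also accepting prefixes. A minimal standalone sketch of that conversion (the helper name and sample paths are illustrative, not part of the source):

import re

def schema_pattern_to_regex(pattern: str, allow_prefix: bool = True) -> str:
    # Escape literal dots, restore ".*" wildcards, anchor the start, and only
    # anchor the end when exact (non-prefix) matching is requested.
    regex = pattern.replace(".", r"\.").replace(r"\.*", ".*")
    if not pattern.startswith("^"):
        regex = "^" + regex
    if not pattern.endswith(".*") and not pattern.endswith("$") and not allow_prefix:
        regex += "$"
    return regex

# Allow pattern: anything under mysource.sales matches, case-insensitively.
assert re.match(
    schema_pattern_to_regex("mysource.sales.*"), "MySource.Sales.Orders", re.IGNORECASE
)
# Deny pattern (allow_prefix=False): the full dotted path must match exactly.
assert not re.match(
    schema_pattern_to_regex("mysource.sales", allow_prefix=False),
    "mysource.sales.orders",
    re.IGNORECASE,
)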
@@ -731,34 +770,41 @@ class DremioAPIOperations:
                 )

                 source_config = source_resp.get("config", {})
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                db = source_config.get(
+                    "database", source_config.get("databaseName", "")
+                )
+
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SOURCE,
+                        "source_type": source_resp.get("type"),
+                        "root_path": source_config.get("rootPath"),
+                        "database_name": db,
+                    }
             else:
-
-
-
-
-
-
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SPACE,
+                    }
+            return None

         def process_source_and_containers(source):
             container = process_source(source)
+            if not container:
+                return []
+
+            # Get sub-containers
             sub_containers = self.get_containers_for_location(
                 resource_id=container.get("id"),
                 path=[container.get("name")],
             )
+
             return [container] + sub_containers

         # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,7 +817,16 @@ class DremioAPIOperations:
             }

             for future in concurrent.futures.as_completed(future_to_source):
-
+                source = future_to_source[future]
+                try:
+                    containers.extend(future.result())
+                except Exception as exc:
+                    logger.error(f"Error processing source: {exc}")
+                    self.report.warning(
+                        message="Failed to process source",
+                        context=f"{source}",
+                        exc=exc,
+                    )

         return containers

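
The loop above now resolves each future individually, so one failing Dremio source is logged and reported without aborting the remaining sources. The same stdlib pattern in isolation (the fetch function and source names are made up for illustration):

import concurrent.futures
import logging

logger = logging.getLogger(__name__)

def fetch(source: str) -> list:
    # Stand-in for process_source_and_containers; fails for one input.
    if source == "broken":
        raise RuntimeError("boom")
    return [f"{source}-container"]

containers: list = []
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_source = {executor.submit(fetch, s): s for s in ["s1", "broken", "s2"]}
    for future in concurrent.futures.as_completed(future_to_source):
        source = future_to_source[future]
        try:
            containers.extend(future.result())
        except Exception as exc:
            # The failure is reported per source instead of propagating.
            logger.error(f"Error processing source {source}: {exc}")

print(sorted(containers))  # ['s1-container', 's2-container']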
@@ -785,3 +840,55 @@ class DremioAPIOperations:
             )
         else:
             return ""
+
+    def get_containers_for_location(
+        self, resource_id: str, path: List[str]
+    ) -> List[Dict[str, str]]:
+        containers = []
+
+        def traverse_path(location_id: str, entity_path: List[str]) -> List:
+            nonlocal containers
+            try:
+                response = self.get(url=f"/catalog/{location_id}")
+
+                # Check if current folder should be included
+                if (
+                    response.get("entityType")
+                    == DremioEntityContainerType.FOLDER.value.lower()
+                ):
+                    folder_name = entity_path[-1]
+                    folder_path = entity_path[:-1]
+
+                    if self.should_include_container(folder_path, folder_name):
+                        containers.append(
+                            {
+                                "id": location_id,
+                                "name": folder_name,
+                                "path": folder_path,
+                                "container_type": DremioEntityContainerType.FOLDER,
+                            }
+                        )
+
+                # Recursively process child containers
+                for container in response.get("children", []):
+                    if (
+                        container.get("type")
+                        == DremioEntityContainerType.CONTAINER.value
+                    ):
+                        traverse_path(container.get("id"), container.get("path"))
+
+            except Exception as exc:
+                logging.info(
+                    "Location {} contains no tables or views. Skipping...".format(
+                        location_id
+                    )
+                )
+                self.report.warning(
+                    message="Failed to get tables or views",
+                    context=f"{location_id}",
+                    exc=exc,
+                )
+
+            return containers
+
+        return traverse_path(location_id=resource_id, entity_path=path)
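
get_containers_for_location builds its result through a nested traverse_path closure that recurses into child containers and appends to a shared list via nonlocal. A self-contained sketch of that closure pattern over a toy in-memory catalog (the data and names are hypothetical, not Dremio API responses):

from typing import Dict, List

# Toy catalog keyed by id; "children" reference other ids (hypothetical data).
CATALOG: Dict[str, Dict] = {
    "root": {"type": "CONTAINER", "path": ["root"], "children": ["a", "b"]},
    "a": {"type": "CONTAINER", "path": ["root", "a"], "children": []},
    "b": {"type": "DATASET", "path": ["root", "b"], "children": []},
}

def collect_folders(start_id: str) -> List[List[str]]:
    folders: List[List[str]] = []

    def traverse(entity_id: str) -> List[List[str]]:
        nonlocal folders
        entry = CATALOG[entity_id]
        if entry["type"] == "CONTAINER":
            folders.append(entry["path"])
            for child_id in entry["children"]:
                traverse(child_id)
        return folders

    return traverse(start_id)

print(collect_folders("root"))  # [['root'], ['root', 'a']]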
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE": "snowflake",
         "SYNAPSE": "mssql",
         "TERADATA": "teradata",
+        "VERTICA": "vertica",
     }

     DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE",
         "SYNAPSE",
         "TERADATA",
+        "VERTICA",
     }

     FILE_OBJECT_STORAGE_TYPES = {
datahub/ingestion/source/dremio/dremio_reporting.py

@@ -14,12 +14,27 @@ class DremioSourceReport(
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
+    containers_scanned: int = 0
+    containers_filtered: int = 0

     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
         pass

+    def report_container_scanned(self, name: str) -> None:
+        """
+        Record that a container was successfully scanned
+        """
+        self.containers_scanned += 1
+
+    def report_container_filtered(self, container_name: str) -> None:
+        """
+        Record that a container was filtered out
+        """
+        self.containers_filtered += 1
+        self.report_dropped(container_name)
+
     def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
         """
         Entity could be a view or a table
datahub/ingestion/source/kafka/kafka_connect.py

@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
         query: str
         transforms: list

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                     source_table = f"{table_name_tuple[-2]}.{source_table}"
                 else:
                     include_source_dataset = False
-                    self.
-
-                    f"
+                    self.report.warning(
+                        "Could not find schema for table"
+                        f"{self.connector_manifest.name} : {source_table}",
                     )
             dataset_name: str = get_dataset_name(database_name, source_table)
             lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                     target_platform=KAFKA,
                 )
                 lineages.append(lineage)
-            self.
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
                 self.connector_manifest.name,
-                "could not find input dataset, the connector has query configuration set",
             )
             self.connector_manifest.lineages = lineages
             return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                             include_source_dataset=False,
                         )
                     )
-                self.
-
-                f"
+                self.report.warning(
+                    "Could not find input dataset for connector topics",
+                    f"{self.connector_manifest.name} : {topic_names}",
                 )
                 self.connector_manifest.lineages = lineages
                 return
             else:
                 include_source_dataset = True
                 if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.
-
-                    f"
+                    self.report.warning(
+                        "Could not find input dataset, connector has unknown transform",
+                        f"{self.connector_manifest.name} : {transforms[0]['type']}",
                     )
                     include_source_dataset = False
                 if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.
+                    self.report.warning(
+                        "Could not find input dataset, connector has one or more unknown transforms",
                         self.connector_manifest.name,
-                        "could not find input dataset, connector has one or more unknown transforms",
                     )
                     include_source_dataset = False
                 lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
                 lineages.append(lineage)
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.
-
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )

         return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
         defaultDataset: Optional[str] = None
         version: str = "v1"

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
             transformed_topic = self.apply_transformations(topic, transforms)
             dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
             if dataset_table is None:
-                self.
-
-                f"
+                self.report.warning(
+                    "Could not find target dataset for topic, please check your connector configuration"
+                    f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
             target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
         schema_name: str
         topics_to_tables: Dict[str, str]

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
         This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
                 )
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.
-
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )

         return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             self.session.auth = (self.config.username, self.config.password)

-        test_response = self.session.get(f"{self.config.connect_uri}")
+        test_response = self.session.get(f"{self.config.connect_uri}/connectors")
         test_response.raise_for_status()
         logger.info(f"Connection to {self.config.connect_uri} is ok")
         if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         payload = connector_response.json()

-        for
-            connector_url = f"{self.config.connect_uri}/connectors/{
-
-
-
-            if
-
+        for connector_name in payload:
+            connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+            connector_manifest = self._get_connector_manifest(
+                connector_name, connector_url
+            )
+            if (
+                connector_manifest is None
+                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            ):
+                self.report.report_dropped(connector_name)
                 continue

             if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.lineages = list()
             connector_manifest.url = connector_url

-
-                f"{self.config.connect_uri}/connectors/{c}/topics",
-            ).json()
-
-            connector_manifest.topic_names = topics[c]["topics"]
+            connector_manifest.topic_names = self._get_connector_topics(connector_name)

             # Populate Source Connector metadata
             if connector_manifest.type == SOURCE:
-                tasks = self.
-                    f"{self.config.connect_uri}/connectors/{c}/tasks",
-                ).json()
-
-                connector_manifest.tasks = tasks
+                connector_manifest.tasks = self._get_connector_tasks(connector_name)

             # JDBC source connector lineages
             if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                 )
                 continue

-            for topic in
+            for topic in connector_manifest.topic_names:
                 lineage = KafkaConnectLineage(
                     source_dataset=target_connector.source_dataset,
                     source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         return connectors_manifest

+    def _get_connector_manifest(
+        self, connector_name: str, connector_url: str
+    ) -> Optional[ConnectorManifest]:
+        try:
+            connector_response = self.session.get(connector_url)
+            connector_response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Failed to get connector details", connector_name, exc=e
+            )
+            return None
+        manifest = connector_response.json()
+        connector_manifest = ConnectorManifest(**manifest)
+        return connector_manifest
+
+    def _get_connector_tasks(self, connector_name: str) -> dict:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector tasks", context=connector_name, exc=e
+            )
+            return {}
+
+        return response.json()
+
+    def _get_connector_topics(self, connector_name: str) -> List[str]:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector topics", context=connector_name, exc=e
+            )
+            return []
+
+        return response.json()[connector_name]["topics"]
+
     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
         connector_type = connector.type
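
The new private helpers wrap the Kafka Connect REST endpoints and turn request failures into per-connector warnings instead of failing the whole ingestion run. A rough standalone sketch of the same endpoints using requests directly (the base URL and helper names here are placeholders, not the source implementation):

from typing import List

import requests

CONNECT_URI = "http://localhost:8083"  # placeholder

session = requests.Session()

def list_connectors() -> List[str]:
    # GET /connectors returns the connector names; the connectivity test above
    # now hits this endpoint instead of the bare base URL.
    resp = session.get(f"{CONNECT_URI}/connectors")
    resp.raise_for_status()
    return resp.json()

def get_connector_topics(connector_name: str) -> List[str]:
    # GET /connectors/{name}/topics returns {"<name>": {"topics": [...]}}.
    try:
        resp = session.get(f"{CONNECT_URI}/connectors/{connector_name}/topics")
        resp.raise_for_status()
    except Exception:
        # Mirror the new behaviour: skip this connector rather than abort the run.
        return []
    return resp.json()[connector_name]["topics"]

for name in list_connectors():
    print(name, get_connector_topics(name))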
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -413,9 +413,10 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             return UpstreamLineageEdge.parse_obj(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
+            upstream_tables = db_row.get("UPSTREAM_TABLES")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-                context=db_row.get(
+                context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}",
                 exc=e,
             )
             return None
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -237,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """

+    @staticmethod
+    def get_secure_view_definitions() -> str:
+        # https://docs.snowflake.com/en/sql-reference/account-usage/views
+        return """
+        SELECT
+            TABLE_CATALOG as "TABLE_CATALOG",
+            TABLE_SCHEMA as "TABLE_SCHEMA",
+            TABLE_NAME as "TABLE_NAME",
+            VIEW_DEFINITION as "VIEW_DEFINITION"
+        FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+        WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+        """
+
     @staticmethod
     def columns_for_schema(
         schema_name: str,
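
The new query pulls secure-view definitions from SNOWFLAKE.ACCOUNT_USAGE.VIEWS (ACCOUNT_USAGE views require appropriate privileges and are subject to latency). A rough sketch of consuming it over a snowflake-connector-python cursor; the connection handling and helper name are illustrative, not part of the source:

from typing import Dict, Tuple

def fetch_secure_view_definitions(conn, secure_view_sql: str) -> Dict[Tuple[str, str, str], str]:
    # secure_view_sql would be the query string returned by get_secure_view_definitions() above.
    definitions: Dict[Tuple[str, str, str], str] = {}
    with conn.cursor() as cursor:
        cursor.execute(secure_view_sql)
        # Columns come back in the order selected above:
        # TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, VIEW_DEFINITION.
        for catalog, schema, name, definition in cursor:
            definitions[(catalog, schema, name)] = definition
    return definitions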