acryl-datahub 0.15.0.5rc9-py3-none-any.whl → 0.15.0.6rc1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (46)
  1. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2431 -2431
  2. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +46 -45
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/graph/client.py +2 -1
  5. datahub/ingestion/graph/entity_versioning.py +201 -0
  6. datahub/ingestion/source/abs/report.py +2 -2
  7. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  8. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/delta_lake/report.py +2 -2
  11. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  12. datahub/ingestion/source/elastic_search.py +2 -1
  13. datahub/ingestion/source/ge_profiling_config.py +11 -7
  14. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  15. datahub/ingestion/source/identity/okta.py +2 -1
  16. datahub/ingestion/source/kafka/kafka.py +2 -1
  17. datahub/ingestion/source/kafka_connect/common.py +2 -1
  18. datahub/ingestion/source/ldap.py +2 -1
  19. datahub/ingestion/source/looker/lookml_config.py +9 -5
  20. datahub/ingestion/source/mongodb.py +2 -1
  21. datahub/ingestion/source/nifi.py +2 -1
  22. datahub/ingestion/source/powerbi/config.py +3 -2
  23. datahub/ingestion/source/powerbi/powerbi.py +28 -3
  24. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
  25. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
  26. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
  27. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  28. datahub/ingestion/source/redash.py +5 -5
  29. datahub/ingestion/source/salesforce.py +4 -1
  30. datahub/ingestion/source/snowflake/constants.py +1 -0
  31. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
  33. datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
  34. datahub/ingestion/source/snowflake/snowflake_report.py +8 -1
  35. datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
  36. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
  37. datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
  38. datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
  39. datahub/ingestion/source/tableau/tableau.py +2 -1
  40. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  41. datahub/ingestion/source/unity/report.py +1 -0
  42. datahub/ingestion/source_report/pulsar.py +5 -4
  43. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
  44. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -124,19 +124,20 @@ class SnowflakeFilter:
             SnowflakeObjectDomain.VIEW,
             SnowflakeObjectDomain.MATERIALIZED_VIEW,
             SnowflakeObjectDomain.ICEBERG_TABLE,
+            SnowflakeObjectDomain.STREAM,
         ):
             return False
         if _is_sys_table(dataset_name):
             return False

-        dataset_params = _split_qualified_name(dataset_name)
+        dataset_params = split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
                 message=f"Found a {dataset_type} with an unexpected number of parts. Database and schema filtering will not work as expected, but table filtering will still work.",
                 context=dataset_name,
             )
-            # We fall-through here so table/view filtering still works.
+            # We fall-through here so table/view/stream filtering still works.

         if (
             len(dataset_params) >= 1
@@ -169,6 +170,14 @@ class SnowflakeFilter:
         ):
             return False

+        if (
+            dataset_type.lower() == SnowflakeObjectDomain.STREAM
+            and not self.filter_config.stream_pattern.allowed(
+                _cleanup_qualified_name(dataset_name, self.structured_reporter)
+            )
+        ):
+            return False
+
         return True

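The new stream check reuses the same pattern machinery as the existing table/view filters: the stream's qualified name is cleaned of quotes and then matched against the source's stream_pattern filter config (added in snowflake_config.py in this release). A minimal sketch of that matching with DataHub's AllowDenyPattern, using a hypothetical allow rule:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical filter: only ingest streams under ANALYTICS.PUBLIC.
stream_pattern = AllowDenyPattern(allow=[r"analytics\.public\..*"])

# Names are matched after quote cleanup, e.g. '"DB"."SCHEMA"."STREAM"' -> 'DB.SCHEMA.STREAM'.
for name in ["analytics.public.orders_stream", "raw.staging.events_stream"]:
    print(name, stream_pattern.allowed(name))
# analytics.public.orders_stream True
# raw.staging.events_stream False
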
@@ -183,17 +192,17 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")


-def _split_qualified_name(qualified_name: str) -> List[str]:
+def split_qualified_name(qualified_name: str) -> List[str]:
     """
     Split a qualified name into its constituent parts.

-    >>> _split_qualified_name("db.my_schema.my_table")
+    >>> split_qualified_name("db.my_schema.my_table")
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    >>> split_qualified_name('"db"."my_schema"."my_table"')
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    >>> split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
     ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
-    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    >>> split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
     ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
     """

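Only the renamed helper's signature and docstring appear in this hunk; its body is unchanged and not shown. As a rough, standalone illustration of the behaviour the doctests describe (double-quoted segments may contain dots and are returned unquoted), here is a sketch rather than the package's actual implementation:

from typing import List


def split_qualified_name_sketch(qualified_name: str) -> List[str]:
    """Split db.schema.object on dots, treating double-quoted segments as atomic."""
    parts: List[str] = []
    current = ""
    in_quotes = False
    for ch in qualified_name:
        if ch == '"':
            in_quotes = not in_quotes  # quotes delimit a segment and are dropped
        elif ch == "." and not in_quotes:
            parts.append(current)
            current = ""
        else:
            current += ch
    parts.append(current)
    return parts


assert split_qualified_name_sketch('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE') == [
    "TEST_DB",
    "SCHEMA.WITH.DOTS",
    "MY_TABLE",
]
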
@@ -231,7 +240,7 @@ def _split_qualified_name(qualified_name: str) -> List[str]:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = _split_qualified_name(qualified_name)
+    name_parts = split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -539,15 +539,27 @@ class SnowflakeV2Source(
             for schema in db.schemas
             for table_name in schema.views
         ]
+        discovered_streams: List[str] = [
+            self.identifiers.get_dataset_identifier(stream_name, schema.name, db.name)
+            for db in databases
+            for schema in db.schemas
+            for stream_name in schema.streams
+        ]

-        if len(discovered_tables) == 0 and len(discovered_views) == 0:
+        if (
+            len(discovered_tables) == 0
+            and len(discovered_views) == 0
+            and len(discovered_streams) == 0
+        ):
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
-                "No tables/views found. Please check permissions.",
+                "No tables/views/streams found. Please check permissions.",
             )
             return

-        self.discovered_datasets = discovered_tables + discovered_views
+        self.discovered_datasets = (
+            discovered_tables + discovered_views + discovered_streams
+        )

         if self.config.use_queries_v2:
             with self.report.new_stage(f"*: {VIEW_PARSING}"):

datahub/ingestion/source/tableau/tableau.py
@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)

     last_authenticated_at: Optional[datetime] = None

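TableauSourceReport.logged_in_user (and the PulsarSourceReport fields further down) switches from a plain list to LossyList, so report fields keep a bounded sample instead of growing without limit. A minimal stand-in sketch of the idea, assuming a simple cutoff rather than the library's actual sampling behaviour:

from typing import Generic, List, TypeVar

T = TypeVar("T")


class BoundedSampleList(Generic[T]):
    """Illustrative stand-in for LossyList: keep at most N items, count the rest."""

    def __init__(self, max_elements: int = 10) -> None:
        self.max_elements = max_elements
        self.items: List[T] = []
        self.dropped = 0

    def append(self, item: T) -> None:
        if len(self.items) < self.max_elements:
            self.items.append(item)
        else:
            # The real LossyList keeps a sample of later items, not just the first N.
            self.dropped += 1

    def __repr__(self) -> str:
        suffix = f" ... ({self.dropped} more not shown)" if self.dropped else ""
        return f"{self.items}{suffix}"


users: BoundedSampleList[str] = BoundedSampleList(max_elements=2)
for name in ["alice", "bob", "carol", "dave"]:
    users.append(name)
print(users)  # ['alice', 'bob'] ... (2 more not shown)
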

datahub/ingestion/source/unity/ge_profiler.py
@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional

+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection

@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA


 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only

         dataset_name = table.ref.qualified_table_name
-        try:
-            table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
-        except Exception as e:
-            logger.warning(f"Failed to get table size for {dataset_name}: {e}")
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )

         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None

+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
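Together with _get_dataset_size_in_bytes above, the profiler now has both lookups it needs for a table-level-only profile of a Delta table: DESCRIBE DETAIL for sizeInBytes and count(*) for the row count. A rough sketch of the combined lookups against an existing SQLAlchemy Connection (profile_delta_table_summary is an illustrative name, not a function in this package):

from typing import Dict, Optional

from sqlalchemy.engine import Connection


def profile_delta_table_summary(conn: Connection, name: str) -> Dict[str, Optional[int]]:
    # `name` is assumed to be an already-quoted catalog.schema.table string.
    size_in_bytes: Optional[int] = None
    rows_count: Optional[int] = None

    # DESCRIBE DETAIL is Delta-specific and returns a single metadata row
    # with columns such as sizeInBytes and numFiles.
    row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    if row is not None:
        size_in_bytes = int(row._asdict()["sizeInBytes"])

    # count(*) is cheap enough on Delta tables to supply the table-level row count.
    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
    if row is not None:
        rows_count = int(row._asdict()["numRows"])

    return {"size_in_bytes": size_in_bytes, "rows_count": rows_count}
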

datahub/ingestion/source/unity/report.py
@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0


datahub/ingestion/source_report/pulsar.py
@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional

 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList


 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)

     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version