acryl-datahub 0.15.0.5rc8__py3-none-any.whl → 0.15.0.5rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (46)
  1. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/METADATA +2525 -2523
  2. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/RECORD +46 -45
  3. datahub/_version.py +1 -1
  4. datahub/entrypoints.py +9 -0
  5. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  6. datahub/ingestion/glossary/classification_mixin.py +6 -0
  7. datahub/ingestion/glossary/classifier.py +3 -2
  8. datahub/ingestion/graph/client.py +2 -1
  9. datahub/ingestion/graph/entity_versioning.py +201 -0
  10. datahub/ingestion/source/abs/report.py +2 -2
  11. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  12. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  13. datahub/ingestion/source/delta_lake/report.py +2 -2
  14. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  15. datahub/ingestion/source/elastic_search.py +2 -1
  16. datahub/ingestion/source/ge_profiling_config.py +11 -7
  17. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  18. datahub/ingestion/source/identity/azure_ad.py +6 -14
  19. datahub/ingestion/source/identity/okta.py +2 -1
  20. datahub/ingestion/source/kafka/kafka.py +2 -1
  21. datahub/ingestion/source/kafka_connect/common.py +2 -1
  22. datahub/ingestion/source/ldap.py +2 -1
  23. datahub/ingestion/source/looker/lookml_config.py +9 -5
  24. datahub/ingestion/source/mode.py +2 -4
  25. datahub/ingestion/source/mongodb.py +2 -1
  26. datahub/ingestion/source/nifi.py +2 -1
  27. datahub/ingestion/source/powerbi/config.py +2 -2
  28. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  29. datahub/ingestion/source/redash.py +5 -5
  30. datahub/ingestion/source/salesforce.py +4 -1
  31. datahub/ingestion/source/snowflake/snowflake_config.py +7 -0
  32. datahub/ingestion/source/snowflake/snowflake_report.py +2 -1
  33. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  34. datahub/ingestion/source/tableau/tableau.py +2 -1
  35. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  36. datahub/ingestion/source/unity/report.py +1 -0
  37. datahub/ingestion/source_report/pulsar.py +5 -4
  38. datahub/metadata/schema.avsc +5 -5
  39. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  40. datahub/metadata/schemas/MetadataChangeEvent.avsc +5 -5
  41. datahub/specific/dashboard.py +43 -1
  42. datahub/upgrade/upgrade.py +13 -5
  43. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/LICENSE +0 -0
  44. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -141,7 +141,7 @@ class BigQueryV2Report(
     profiling_skipped_invalid_partition_type: Dict[str, str] = field(
         default_factory=TopKDict
     )
-    profiling_skipped_partition_profiling_disabled: List[str] = field(
+    profiling_skipped_partition_profiling_disabled: LossyList[str] = field(
         default_factory=LossyList
     )
     allow_pattern: Optional[str] = None
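
Note: this is the first instance of a pattern repeated across most source reports in this release: plain List/Set/Dict report fields are retyped to the LossyList/LossySet/LossyDict collections from datahub.utilities.lossy_collections so that reports stay bounded in size. A minimal sketch of the pattern, assuming the Lossy* collections behave like their builtin counterparts but retain only a bounded sample of elements (ExampleSourceReport is hypothetical, for illustration only):

    from dataclasses import dataclass, field

    from datahub.ingestion.api.source import SourceReport
    from datahub.utilities.lossy_collections import LossyList


    @dataclass
    class ExampleSourceReport(SourceReport):
        # Hypothetical report; real reports are defined per source, as in the hunks below.
        filtered: LossyList[str] = field(default_factory=LossyList)

        def report_dropped(self, name: str) -> None:
            # append() works as on a normal list; the collection is assumed to keep
            # only a capped sample so a report with millions of drops stays small.
            self.filtered.append(name)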

datahub/ingestion/source/delta_lake/report.py
@@ -1,14 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List
 
 from datahub.ingestion.api.source import SourceReport
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclasses.dataclass
 class DeltaLakeSourceReport(SourceReport):
     files_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_file_scanned(self) -> None:
         self.files_scanned += 1

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
 
 MAX_ITEMS_TO_RETRIEVE = 100
@@ -120,7 +121,7 @@ class DynamoDBConfig(
 
 @dataclass
 class DynamoDBSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/elastic_search.py
@@ -62,6 +62,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.config_clean import remove_protocol
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
 logger = logging.getLogger(__name__)
@@ -189,7 +190,7 @@ class ElasticToSchemaFieldConverter:
 @dataclass
 class ElasticsearchSourceReport(SourceReport):
     index_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_index_scanned(self, index: str) -> None:
         self.index_scanned += 1

datahub/ingestion/source/ge_profiling_config.py
@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.",
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )
 
     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days. If set to `null`, no constraint of last modified time for tables to profile. Supported only in `snowflake` and `BigQuery`.",
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )
 
     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on calculated size from gathered stats.",
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )
 
     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count. If set to `null`, "
-        "no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on gathered stats.",
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )
 
     profile_table_row_count_estimate_only: bool = Field(
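
The reworded descriptions above cover the profiling size and row limits; for orientation, a hedged sketch of how these knobs are typically set (values are placeholders, and in a recipe they sit under the source's `profiling` section):

    # Placeholder values for illustration; field names come from GEProfilingConfig above.
    profiling_section = {
        "enabled": True,
        "max_number_of_fields_to_profile": 50,
        "profile_if_updated_since_days": 7,
        "profile_table_size_limit": 5,  # GB; None/null disables the size cap
        "profile_table_row_limit": 5_000_000,  # rows; None/null disables the row cap
    }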

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
 
 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)

datahub/ingestion/source/identity/azure_ad.py
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )
 
-    # If enabled, report will contain names of filtered users and groups.
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")
 
     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(
@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
-        self.filtered_count += 1
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)
 
 
 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
        self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
            total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
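
The retired filtered_tracking option is handled with pydantic_removed_field, the same helper mode.py uses below for default_schema. A minimal sketch of the declaration pattern, assuming the helper simply drops (and warns about) the old key so existing recipes keep loading; ExampleConfig is illustrative only:

    from pydantic import Field

    from datahub.configuration.common import ConfigModel
    from datahub.configuration.validate_field_removal import pydantic_removed_field


    class ExampleConfig(ConfigModel):
        # Hypothetical config for illustration. The removed-field helper is assumed to
        # register a validator that discards `filtered_tracking` if a recipe still sets it.
        mask_group_id: bool = Field(default=True, description="Example surviving option.")

        _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")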

datahub/ingestion/source/identity/okta.py
@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
 
 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/kafka/kafka.py
@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1

datahub/ingestion/source/kafka_connect/common.py
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1

datahub/ingestion/source/ldap.py
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns: List[str] = dataclasses.field(default_factory=list)
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)
 
     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)

datahub/ingestion/source/looker/lookml_config.py
@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, Literal, Optional, Union
 
 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped: List[str] = dataclass_field(default_factory=LossyList)
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped: List[str] = dataclass_field(default_factory=LossyList)
-    views_dropped_unreachable: List[str] = dataclass_field(default_factory=LossyList)
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views: List[str] = dataclass_field(default_factory=LossyList)
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None
 
     def report_models_scanned(self) -> None:

datahub/ingestion/source/mode.py
@@ -24,6 +24,7 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponenti
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
@@ -155,10 +156,7 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
     workspace: str = Field(
         description="The Mode workspace name. Find it in Settings > Workspace > Details."
     )
-    default_schema: str = Field(
-        default="public",
-        description="Default schema to use when schema is not provided in an SQL query",
-    )
+    _default_schema = pydantic_removed_field("default_schema")
 
     space_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(

datahub/ingestion/source/mongodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -143,7 +144,7 @@ class MongoDBConfig(
 
 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/nifi.py
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)

datahub/ingestion/source/powerbi/config.py
@@ -195,8 +195,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards: List[str] = dataclass_field(default_factory=list)
-    filtered_charts: List[str] = dataclass_field(default_factory=list)
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0

datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList
 
 LOGGER = logging.getLogger(__name__)
 
@@ -476,7 +477,7 @@ class Mapper:
 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports: List[str] = dataclass_field(default_factory=list)
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count

datahub/ingestion/source/redash.py
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing: Set[str] = field(default_factory=set)
-    queries_no_dataset: Set[str] = field(default_factory=set)
-    charts_no_input: Set[str] = field(default_factory=set)
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )

datahub/ingestion/source/salesforce.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)
 
 
+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered: List[str] = []
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
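
Beyond the LossyList switch, this hunk fixes a latent bug: SalesforceSourceReport previously declared filtered: List[str] = [] as a class-level default, so every report instance shared one mutable list. Making the report a @dataclass with a default_factory gives each instance its own collection. A small self-contained illustration of the difference (the Report classes here are hypothetical):

    from dataclasses import dataclass, field
    from typing import List


    class SharedReport:
        # Anti-pattern: the list is created once at class definition time,
        # so all instances append into the same object.
        filtered: List[str] = []


    @dataclass
    class PerInstanceReport:
        # Fixed pattern: default_factory builds a fresh list per instance.
        filtered: List[str] = field(default_factory=list)


    a, b = SharedReport(), SharedReport()
    a.filtered.append("account")
    assert b.filtered == ["account"]  # state leaks across instances

    c, d = PerInstanceReport(), PerInstanceReport()
    c.filtered.append("account")
    assert d.filtered == []  # each report tracks its own drops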

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -308,6 +308,13 @@ class SnowflakeV2Config(
         " assertions CLI in snowflake",
     )
 
+    pushdown_deny_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
+        "Only applicable if `use_queries_v2` is enabled.",
+    )
+
     @validator("convert_urns_to_lowercase")
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
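
For context, a sketch of how the new pushdown_deny_usernames option might be used in a programmatic Snowflake ingestion pipeline; account details and service-account names are placeholders, and per the description above the option only takes effect when use_queries_v2 is enabled:

    from datahub.ingestion.run.pipeline import Pipeline

    # Placeholder recipe for illustration; credentials and usernames are not real.
    pipeline = Pipeline.create(
        {
            "source": {
                "type": "snowflake",
                "config": {
                    "account_id": "my_account",
                    "username": "ingestion_user",
                    "password": "...",
                    "use_queries_v2": True,
                    # Skip lineage/usage/query extraction for high-volume service accounts.
                    "pushdown_deny_usernames": ["LOOKER_SVC", "DBT_CLOUD_SVC"],
                },
            },
            "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
        }
    )
    pipeline.run()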

datahub/ingestion/source/snowflake/snowflake_report.py
@@ -12,6 +12,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.lossy_collections import LossyDict
 from datahub.utilities.perf_timer import PerfTimer
 
 if TYPE_CHECKING:
@@ -66,7 +67,7 @@ class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport):
     num_external_table_edges_scanned: int = 0
     ignore_start_time_lineage: Optional[bool] = None
     upstream_lineage_in_report: Optional[bool] = None
-    upstream_lineage: Dict[str, List[str]] = field(default_factory=dict)
+    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)
 
     lineage_start_time: Optional[datetime] = None
     lineage_end_time: Optional[datetime] = None

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -567,6 +567,7 @@ class SnowflakeV2Source(
                 include_queries=self.config.include_queries,
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
+                pushdown_deny_usernames=self.config.pushdown_deny_usernames,
             ),
             structured_report=self.report,
             filters=self.filters,

datahub/ingestion/source/tableau/tableau.py
@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)
 
     last_authenticated_at: Optional[datetime] = None
 

datahub/ingestion/source/unity/ge_profiler.py
@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional
 
+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection
 
@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA
 
 
 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only
 
         dataset_name = table.ref.qualified_table_name
-        try:
-            table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
-        except Exception as e:
-            logger.warning(f"Failed to get table size for {dataset_name}: {e}")
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )
 
         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None
 
+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
    row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    if row is None:
        return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
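
The profiler now gates the DESCRIBE DETAIL and count(*) queries on table.is_delta_table, since DESCRIBE DETAIL is only defined for Delta tables. A hedged sketch of checking the underlying format directly with the Databricks SDK (assumes databricks-sdk is installed and authenticated; the table name is a placeholder):

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.catalog import DataSourceFormat

    w = WorkspaceClient()  # relies on standard Databricks auth (env vars or config profile)
    tbl = w.tables.get("main.default.my_table")  # placeholder three-part table name
    if tbl.data_source_format == DataSourceFormat.DELTA:
        print("Delta table: profiler will collect size and row count")
    else:
        print("Non-Delta table: profiler skips the size/row-count queries")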

datahub/ingestion/source/unity/report.py
@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0
 

datahub/ingestion/source_report/pulsar.py
@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version

@@ -4730,16 +4730,16 @@
 {
   "Relationship": {
     "/*/destinationUrn": {
-      "createdActor": "datasetEdges/*/created/actor",
-      "createdOn": "datasetEdges/*/created/time",
+      "createdActor": "dashboards/*/created/actor",
+      "createdOn": "dashboards/*/created/time",
       "entityTypes": [
         "dashboard"
      ],
      "isLineage": true,
      "name": "DashboardContainsDashboard",
-      "properties": "datasetEdges/*/properties",
-      "updatedActor": "datasetEdges/*/lastModified/actor",
-      "updatedOn": "datasetEdges/*/lastModified/time"
+      "properties": "dashboards/*/properties",
+      "updatedActor": "dashboards/*/lastModified/actor",
+      "updatedOn": "dashboards/*/lastModified/time"
     }
   },
   "type": {