acryl-datahub 0.15.0.4rc2__py3-none-any.whl → 0.15.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (96)
  1. acryl_datahub-0.15.0.5.dist-info/LICENSE +202 -0
  2. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2444 -2404
  3. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +96 -86
  4. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
  5. datahub/__init__.py +1 -25
  6. datahub/_version.py +13 -0
  7. datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
  8. datahub/cli/check_cli.py +1 -1
  9. datahub/cli/cli_utils.py +3 -3
  10. datahub/cli/container_cli.py +1 -64
  11. datahub/cli/iceberg_cli.py +707 -0
  12. datahub/cli/ingest_cli.py +2 -2
  13. datahub/emitter/composite_emitter.py +36 -0
  14. datahub/emitter/rest_emitter.py +1 -1
  15. datahub/entrypoints.py +26 -5
  16. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  17. datahub/ingestion/api/registry.py +4 -2
  18. datahub/ingestion/glossary/classification_mixin.py +6 -0
  19. datahub/ingestion/glossary/classifier.py +3 -2
  20. datahub/ingestion/graph/client.py +2 -1
  21. datahub/ingestion/graph/entity_versioning.py +201 -0
  22. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  23. datahub/ingestion/run/connection.py +1 -1
  24. datahub/ingestion/run/pipeline.py +3 -3
  25. datahub/ingestion/source/abs/report.py +2 -2
  26. datahub/ingestion/source/apply/__init__.py +0 -0
  27. datahub/ingestion/source/apply/datahub_apply.py +223 -0
  28. datahub/ingestion/source/aws/glue.py +15 -6
  29. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  30. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  31. datahub/ingestion/source/dbt/dbt_core.py +1 -1
  32. datahub/ingestion/source/delta_lake/report.py +2 -2
  33. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  34. datahub/ingestion/source/elastic_search.py +2 -1
  35. datahub/ingestion/source/ge_profiling_config.py +11 -7
  36. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  37. datahub/ingestion/source/identity/azure_ad.py +6 -14
  38. datahub/ingestion/source/identity/okta.py +2 -1
  39. datahub/ingestion/source/kafka/kafka.py +2 -1
  40. datahub/ingestion/source/kafka_connect/common.py +2 -1
  41. datahub/ingestion/source/ldap.py +2 -1
  42. datahub/ingestion/source/looker/looker_config.py +3 -1
  43. datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
  44. datahub/ingestion/source/looker/looker_file_loader.py +14 -3
  45. datahub/ingestion/source/looker/looker_template_language.py +104 -14
  46. datahub/ingestion/source/looker/lookml_config.py +29 -8
  47. datahub/ingestion/source/looker/lookml_source.py +110 -22
  48. datahub/ingestion/source/mode.py +2 -4
  49. datahub/ingestion/source/mongodb.py +2 -1
  50. datahub/ingestion/source/nifi.py +2 -1
  51. datahub/ingestion/source/powerbi/config.py +2 -2
  52. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  53. datahub/ingestion/source/redash.py +5 -5
  54. datahub/ingestion/source/salesforce.py +4 -1
  55. datahub/ingestion/source/slack/slack.py +6 -0
  56. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  57. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  58. datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
  59. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
  61. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  62. datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
  63. datahub/ingestion/source/sql/clickhouse.py +5 -43
  64. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  65. datahub/ingestion/source/sql/mssql/source.py +17 -0
  66. datahub/ingestion/source/sql/sql_config.py +0 -10
  67. datahub/ingestion/source/tableau/tableau.py +16 -13
  68. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  69. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  70. datahub/ingestion/source/unity/proxy.py +2 -2
  71. datahub/ingestion/source/unity/report.py +1 -0
  72. datahub/ingestion/source_config/operation_config.py +9 -0
  73. datahub/ingestion/source_report/pulsar.py +5 -4
  74. datahub/metadata/_schema_classes.py +304 -6
  75. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  78. datahub/metadata/schema.avsc +211 -12
  79. datahub/metadata/schemas/AssertionInfo.avsc +2 -2
  80. datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
  81. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  82. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  83. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  84. datahub/metadata/schemas/Deprecation.avsc +12 -0
  85. datahub/metadata/schemas/DisplayProperties.avsc +62 -0
  86. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  87. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
  89. datahub/metadata/schemas/PostInfo.avsc +28 -2
  90. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  91. datahub/specific/dashboard.py +43 -1
  92. datahub/telemetry/telemetry.py +4 -4
  93. datahub/testing/check_imports.py +28 -0
  94. datahub/upgrade/upgrade.py +17 -9
  95. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
  96. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_config.py

@@ -2,8 +2,6 @@ import logging
 from abc import abstractmethod
 from typing import Any, Dict, Optional

-import cachetools
-import cachetools.keys
 import pydantic
 from pydantic import Field
 from sqlalchemy.engine import URL
@@ -29,7 +27,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
 from datahub.ingestion.source_config.operation_config import is_profiling_enabled
-from datahub.utilities.cachetools_keys import self_methodkey

 logger: logging.Logger = logging.getLogger(__name__)

@@ -118,13 +115,6 @@ class SQLCommonConfig(
     # Custom Stateful Ingestion settings
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None

-    # TRICKY: The operation_config is time-dependent. Because we don't want to change
-    # whether or not we're running profiling mid-ingestion, we cache the result of this method.
-    # TODO: This decorator should be moved to the is_profiling_enabled(operation_config) method.
-    @cachetools.cached(
-        cache=cachetools.LRUCache(maxsize=1),
-        key=self_methodkey,
-    )
     def is_profiling_enabled(self) -> bool:
         return self.profiling.enabled and is_profiling_enabled(
             self.profiling.operation_config
datahub/ingestion/source/tableau/tableau.py

@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)

     last_authenticated_at: Optional[datetime] = None

@@ -2428,10 +2429,12 @@ class TableauSiteSource:
             ]
         ],
     ) -> Optional["SqlParsingResult"]:
-        database_info = datasource.get(c.DATABASE) or {
-            c.NAME: c.UNKNOWN.lower(),
-            c.CONNECTION_TYPE: datasource.get(c.CONNECTION_TYPE),
-        }
+        database_field = datasource.get(c.DATABASE) or {}
+        database_id: Optional[str] = database_field.get(c.ID)
+        database_name: Optional[str] = database_field.get(c.NAME) or c.UNKNOWN.lower()
+        database_connection_type: Optional[str] = database_field.get(
+            c.CONNECTION_TYPE
+        ) or datasource.get(c.CONNECTION_TYPE)

         if (
             datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False)
@@ -2440,10 +2443,7 @@ class TableauSiteSource:
             logger.debug(f"datasource {datasource_urn} is not created from custom sql")
             return None

-        if (
-            database_info.get(c.NAME) is None
-            or database_info.get(c.CONNECTION_TYPE) is None
-        ):
+        if database_connection_type is None:
             logger.debug(
                 f"database information is missing from datasource {datasource_urn}"
             )
@@ -2459,14 +2459,14 @@ class TableauSiteSource:

         logger.debug(f"Parsing sql={query}")

-        upstream_db = database_info.get(c.NAME)
+        upstream_db = database_name

         if func_overridden_info is not None:
             # Override the information as per configuration
             upstream_db, platform_instance, platform, _ = func_overridden_info(
-                database_info[c.CONNECTION_TYPE],
-                database_info.get(c.NAME),
-                database_info.get(c.ID),
+                database_connection_type,
+                database_name,
+                database_id,
                 self.config.platform_instance_map,
                 self.config.lineage_overrides,
                 self.config.database_hostname_to_platform_instance_map,
@@ -2534,6 +2534,9 @@ class TableauSiteSource:
             platform_instance=self.config.platform_instance,
             func_overridden_info=get_overridden_info,
         )
+        logger.debug(
+            f"_create_lineage_from_unsupported_csql parsed_result = {parsed_result}"
+        )

         if parsed_result is None:
             return
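The custom-SQL lineage refactor above replaces the ad-hoc database_info dict with three explicit values, each with its own fallback: the name falls back to the lowercased unknown constant and the connection type falls back to the datasource-level value, so parsing is now only skipped when no connection type can be found at all. A minimal sketch of that fallback pattern with hypothetical payloads (plain string keys stand in for the c.* constants used in tableau.py):

from typing import Optional, Tuple


def split_database_fields(datasource: dict) -> Tuple[Optional[str], str, Optional[str]]:
    # Illustrative only: "database", "id", "name", "connectionType" mirror the
    # c.DATABASE / c.ID / c.NAME / c.CONNECTION_TYPE constants.
    database_field = datasource.get("database") or {}
    database_id = database_field.get("id")
    database_name = database_field.get("name") or "unknown"
    database_connection_type = database_field.get("connectionType") or datasource.get(
        "connectionType"
    )
    return database_id, database_name, database_connection_type


# A datasource with an embedded database object vs. one that only carries a
# datasource-level connection type (both payload shapes are illustrative).
print(split_database_fields(
    {"database": {"id": "db-1", "name": "analytics", "connectionType": "snowflake"}}
))  # ('db-1', 'analytics', 'snowflake')
print(split_database_fields({"connectionType": "snowflake"}))  # (None, 'unknown', 'snowflake')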
datahub/ingestion/source/tableau/tableau_common.py

@@ -761,7 +761,7 @@ class TableauUpstreamReference:


 def get_overridden_info(
-    connection_type: Optional[str],
+    connection_type: str,
     upstream_db: Optional[str],
     upstream_db_id: Optional[str],
     platform_instance_map: Optional[Dict[str, str]],
datahub/ingestion/source/unity/ge_profiler.py

@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional

+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection

@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA


 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only

         dataset_name = table.ref.qualified_table_name
-        try:
-            table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
-        except Exception as e:
-            logger.warning(f"Failed to get table size for {dataset_name}: {e}")
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )

         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@
             self.report.report_dropped(dataset_name)
             return None

+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
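Both helpers above lean on Delta-specific behaviour: DESCRIBE DETAIL returns a single row whose sizeInBytes column gives the table size, and count(*) on a Delta table can typically be answered cheaply from transaction-log statistics. A rough standalone sketch combining the two lookups, assuming a SQLAlchemy connection to a Databricks SQL warehouse and a hypothetical fully qualified table name:

from typing import Optional, Tuple

from sqlalchemy import create_engine
from sqlalchemy.engine import Connection


def delta_table_stats(conn: Connection, name: str) -> Tuple[Optional[int], Optional[int]]:
    # DESCRIBE DETAIL is Delta-only; its result row includes a sizeInBytes column.
    detail = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    size = int(detail._asdict()["sizeInBytes"]) if detail is not None else None

    # count(*) is inexpensive on Delta tables, which is why the profiler only
    # issues it when profile_table_level_only is set and the table is Delta.
    counted = conn.execute(f"select count(*) as numRows from {name}").fetchone()
    rows = int(counted._asdict()["numRows"]) if counted is not None else None
    return size, rows


# Hypothetical usage (the URL shape depends on the Databricks SQLAlchemy dialect):
# engine = create_engine("databricks://token:<pat>@<host>?http_path=<warehouse-http-path>")
# with engine.connect() as conn:
#     print(delta_table_stats(conn, "main.default.my_table"))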
datahub/ingestion/source/unity/proxy.py

@@ -26,7 +26,7 @@ from databricks.sdk.service.sql import (
 )
 from databricks.sdk.service.workspace import ObjectType

-import datahub
+from datahub._version import nice_version_name
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
@@ -103,7 +103,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             host=workspace_url,
             token=personal_access_token,
             product="datahub",
-            product_version=datahub.nice_version_name(),
+            product_version=nice_version_name(),
         )
         self.warehouse_id = warehouse_id or ""
         self.report = report
datahub/ingestion/source/unity/report.py

@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0

datahub/ingestion/source_config/operation_config.py

@@ -2,10 +2,12 @@ import datetime
 import logging
 from typing import Any, Dict, Optional

+import cachetools
 import pydantic
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
+from datahub.utilities.cachetools_keys import self_methodkey

 logger = logging.getLogger(__name__)

@@ -62,6 +64,13 @@ class OperationConfig(ConfigModel):
         return profile_date_of_month


+# TRICKY: The operation_config is time-dependent. Because we don't want to change
+# whether or not we're running profiling mid-ingestion, we cache the result of this method.
+# An additional benefit is that we only print the log lines on the first call.
+@cachetools.cached(
+    cache=cachetools.LRUCache(maxsize=1),
+    key=self_methodkey,
+)
 def is_profiling_enabled(operation_config: OperationConfig) -> bool:
     if operation_config.lower_freq_profile_enabled is False:
         return True
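The decorator added here is the same LRU-cache pattern that sql_config.py drops above: the time-dependent profiling decision is computed once per config object and reused for the rest of the ingestion run. A minimal sketch of how cachetools.cached behaves with a key function keyed on the first argument (first_arg_key is a hypothetical stand-in for datahub's self_methodkey helper):

import cachetools
import cachetools.keys


def first_arg_key(first, *args, **kwargs):
    # Hypothetical stand-in for datahub.utilities.cachetools_keys.self_methodkey:
    # build the cache key purely from the identity of the first positional argument.
    return cachetools.keys.hashkey(id(first))


@cachetools.cached(cache=cachetools.LRUCache(maxsize=1), key=first_arg_key)
def is_profiling_enabled(operation_config) -> bool:
    print("evaluating profiling schedule")  # printed only on the first call
    return True  # stands in for the real date-based check


class FakeConfig:
    pass


config = FakeConfig()
is_profiling_enabled(config)  # computes and caches the decision
is_profiling_enabled(config)  # served from the LRU cache; no second log line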
datahub/ingestion/source_report/pulsar.py

@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional

 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList


 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)

     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version
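The Tableau report (logged_in_user) and the Pulsar report above make the same swap from plain lists to LossyList. A rough usage sketch, assuming LossyList acts as a drop-in list replacement that keeps only a bounded sample of appended items so large ingestion reports stay serialisable (the exact truncation behaviour is defined in datahub.utilities.lossy_collections):

from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossyList


@dataclass
class ExampleSourceReport:
    # Hypothetical report: with LossyList, filtering millions of topics no longer
    # bloats the report object the way a plain list would.
    topics_filtered: LossyList[str] = field(default_factory=LossyList)


report = ExampleSourceReport()
for i in range(1_000_000):
    report.topics_filtered.append(f"persistent://tenant/ns/topic-{i}")

# Rendering the field is expected to yield a truncated sample plus an indication
# of how many items were dropped, rather than the full million entries.
print(report.topics_filtered)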