acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; see the package registry advisory for more details.

Files changed (78)
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/api/report.py +183 -35
  26. datahub/ingestion/autogenerated/capability_summary.json +3431 -0
  27. datahub/ingestion/autogenerated/lineage.json +401 -0
  28. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  29. datahub/ingestion/extractor/schema_util.py +13 -4
  30. datahub/ingestion/graph/client.py +2 -2
  31. datahub/ingestion/run/pipeline.py +47 -1
  32. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  33. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  34. datahub/ingestion/source/common/subtypes.py +1 -1
  35. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  37. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  38. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  39. datahub/ingestion/source/ge_data_profiler.py +28 -20
  40. datahub/ingestion/source/identity/okta.py +0 -13
  41. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  42. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  43. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  44. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  45. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  46. datahub/ingestion/source/redshift/usage.py +4 -3
  47. datahub/ingestion/source/s3/source.py +19 -3
  48. datahub/ingestion/source/sigma/sigma.py +6 -1
  49. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  50. datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
  51. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  52. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  53. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  54. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  55. datahub/ingestion/source/sql/sql_common.py +4 -0
  56. datahub/ingestion/source/sql/vertica.py +0 -4
  57. datahub/ingestion/source/sql_queries.py +2 -2
  58. datahub/ingestion/source/superset.py +56 -1
  59. datahub/ingestion/source/tableau/tableau.py +40 -34
  60. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  61. datahub/ingestion/source/unity/proxy.py +4 -3
  62. datahub/ingestion/source/unity/source.py +19 -9
  63. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  64. datahub/metadata/_internal_schema_classes.py +85 -4
  65. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  66. datahub/metadata/schema.avsc +54 -1
  67. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  68. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  69. datahub/sdk/lineage_client.py +2 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
  71. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  72. datahub/upgrade/upgrade.py +46 -13
  73. datahub/utilities/server_config_util.py +8 -0
  74. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  75. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
  76. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
  77. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
  78. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
@@ -6,19 +6,25 @@ from collections import defaultdict
6
6
  from dataclasses import dataclass, field
7
7
  from datetime import datetime, timedelta
8
8
  from enum import Enum
9
- from typing import Any, Dict, Optional, Set, cast, runtime_checkable
9
+ from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable
10
10
 
11
11
  import humanfriendly
12
12
  import pydantic
13
13
  from pydantic import BaseModel
14
14
  from typing_extensions import Literal, Protocol
15
15
 
16
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
16
17
  from datahub.emitter.mcp_builder import mcps_from_mce
17
18
  from datahub.ingestion.api.closeable import Closeable
18
19
  from datahub.ingestion.api.report_helpers import format_datetime_relative
19
20
  from datahub.ingestion.api.workunit import MetadataWorkUnit
21
+ from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
20
22
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
21
- from datahub.metadata.schema_classes import SubTypesClass, UpstreamLineageClass
23
+ from datahub.metadata.schema_classes import (
24
+ MetadataChangeProposalClass,
25
+ SubTypesClass,
26
+ UpstreamLineageClass,
27
+ )
22
28
  from datahub.utilities.file_backed_collections import FileBackedDict
23
29
  from datahub.utilities.lossy_collections import LossyList
24
30
 
@@ -125,8 +131,6 @@ class ReportAttribute(BaseModel):
125
131
 
126
132
  @dataclass
127
133
  class ExamplesReport(Report, Closeable):
128
- _urns_seen: Set[str] = field(default_factory=set)
129
- entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
130
134
  aspects: Dict[str, Dict[str, int]] = field(
131
135
  default_factory=lambda: defaultdict(lambda: defaultdict(int))
132
136
  )
@@ -135,11 +139,16 @@ class ExamplesReport(Report, Closeable):
135
139
  lambda: defaultdict(lambda: defaultdict(int))
136
140
  )
137
141
  )
138
- aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
139
- default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
142
+ samples: Dict[str, Dict[str, List[str]]] = field(
143
+ default_factory=lambda: defaultdict(lambda: defaultdict(list))
140
144
  )
141
145
  _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
142
146
 
147
+ # We are adding this to make querying easier for fine-grained lineage
148
+ _fine_grained_lineage_special_case_name = "fineGrainedLineages"
149
+ _samples_to_add: int = 20
150
+ _lineage_aspects_seen: Set[str] = field(default_factory=set)
151
+
143
152
  def __post_init__(self) -> None:
144
153
  self._file_based_dict = FileBackedDict(
145
154
  tablename="urn_aspects",
@@ -157,6 +166,151 @@ class ExamplesReport(Report, Closeable):
157
166
  self._file_based_dict.close()
158
167
  self._file_based_dict = None
159
168
 
169
+ def _build_aspects_where_clause(self, aspects: List[str]) -> str:
170
+ """Build WHERE clause for matching any of the given aspects."""
171
+ if not aspects:
172
+ return ""
173
+
174
+ conditions = []
175
+ for aspect in aspects:
176
+ conditions.append(f"aspects LIKE '%{aspect}%'")
177
+
178
+ return " OR ".join(conditions)
179
+
180
+ def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
181
+ """Helper method to collect samples organized by subtype for a given where clause."""
182
+
183
+ subtype_query = f"""
184
+ SELECT DISTINCT subTypes
185
+ FROM urn_aspects
186
+ WHERE {where_clause}
187
+ """
188
+ assert self._file_based_dict is not None
189
+ subtypes = set()
190
+ for row in self._file_based_dict.sql_query(subtype_query):
191
+ sub_type = row["subTypes"] or "unknown"
192
+ subtypes.add(sub_type)
193
+
194
+ for sub_type in subtypes:
195
+ query = f"""
196
+ SELECT urn
197
+ FROM urn_aspects
198
+ WHERE {where_clause} AND subTypes = ?
199
+ limit {self._samples_to_add}
200
+ """
201
+
202
+ for row in self._file_based_dict.sql_query(query, (sub_type,)):
203
+ self.samples[sample_key][sub_type].append(row["urn"])
204
+
205
+ def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
206
+ """Helper method to collect samples for entities that have any of the given aspects."""
207
+ if not aspects:
208
+ return
209
+
210
+ where_clause = self._build_aspects_where_clause(aspects)
211
+ self._collect_samples_by_subtype(where_clause, sample_key)
212
+
213
+ def _collect_samples_by_lineage_aspects(
214
+ self, aspects: List[str], sample_key: str
215
+ ) -> None:
216
+ """Helper method to collect samples for entities that have any of the given lineage aspects.
217
+
218
+ Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
219
+ """
220
+ if not aspects:
221
+ return
222
+
223
+ lineage_conditions = []
224
+ for aspect in aspects:
225
+ lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
226
+
227
+ where_clause = " OR ".join(lineage_conditions)
228
+ self._collect_samples_by_subtype(where_clause, sample_key)
229
+
230
+ def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
231
+ """
232
+ Collect samples for entities that have lineage, profiling, and usage aspects.
233
+ These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
234
+ """
235
+ if not self._lineage_aspects_seen:
236
+ return
237
+ assert self._file_based_dict is not None
238
+
239
+ # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
240
+ lineage_conditions = []
241
+ for aspect in self._lineage_aspects_seen:
242
+ lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
243
+ lineage_where_clause = " OR ".join(lineage_conditions)
244
+
245
+ # Build profiling conditions using the same logic as _collect_samples_by_aspects
246
+ profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
247
+
248
+ # Build usage conditions using the same logic as _collect_samples_by_aspects
249
+ usage_where_clause = self._build_aspects_where_clause(
250
+ [
251
+ "datasetUsageStatistics",
252
+ "chartUsageStatistics",
253
+ "dashboardUsageStatistics",
254
+ ]
255
+ )
256
+
257
+ query = f"""
258
+ SELECT urn, subTypes
259
+ FROM urn_aspects
260
+ WHERE ({lineage_where_clause})
261
+ AND ({profiling_where_clause})
262
+ AND ({usage_where_clause})
263
+ limit {self._samples_to_add}
264
+ """
265
+
266
+ for row in self._file_based_dict.sql_query(query):
267
+ sub_type = row["subTypes"] or "unknown"
268
+ self.samples[sample_key][sub_type].append(row["urn"])
269
+
270
+ def _has_fine_grained_lineage(
271
+ self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
272
+ ) -> bool:
273
+ if isinstance(mcp.aspect, UpstreamLineageClass):
274
+ upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
275
+ if upstream_lineage.fineGrainedLineages:
276
+ return True
277
+ return False
278
+
279
    def _update_file_based_dict(
        self,
        urn: str,
        entityType: str,
        aspectName: str,
        mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
    ) -> None:
        """Record one (urn, aspect) observation in the file-backed store.

        Tracks which lineage aspects were seen (for later sample queries),
        flags fine-grained lineage via the special-case aspect name, and
        updates or creates the per-URN SourceReportSubtypes entry.
        """
        if is_lineage_aspect(entityType, aspectName):
            # Remember the aspect name so compute_stats() can build lineage
            # sample queries later.
            self._lineage_aspects_seen.add(aspectName)
        has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)

        # Only SubTypes aspects carry a subtype; everything else stays "unknown".
        sub_type = "unknown"
        if isinstance(mcp.aspect, SubTypesClass):
            sub_type = mcp.aspect.typeNames[0]

        assert self._file_based_dict is not None
        if urn in self._file_based_dict:
            # Never downgrade a known subtype back to "unknown".
            if sub_type != "unknown":
                self._file_based_dict[urn].subType = sub_type
            self._file_based_dict[urn].aspects.add(aspectName)
            if has_fine_grained_lineage:
                # Stored as a pseudo-aspect to make fine-grained lineage
                # queryable alongside real aspect names.
                self._file_based_dict[urn].aspects.add(
                    self._fine_grained_lineage_special_case_name
                )
            # NOTE(review): in-place mutation of the stored value presumably
            # isn't auto-detected by FileBackedDict, so the entry must be
            # explicitly marked dirty to be persisted — confirm against
            # FileBackedDict's contract.
            self._file_based_dict.mark_dirty(urn)
        else:
            self._file_based_dict[urn] = SourceReportSubtypes(
                urn=urn,
                entity_type=entityType,
                subType=sub_type,
                aspects={aspectName}
                if not has_fine_grained_lineage
                else {aspectName, self._fine_grained_lineage_special_case_name},
            )
313
+
160
314
  def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
161
315
  urn = wu.get_urn()
162
316
 
@@ -169,41 +323,15 @@ class ExamplesReport(Report, Closeable):
169
323
  entityType = mcp.entityType
170
324
  aspectName = mcp.aspectName
171
325
 
172
- if urn not in self._urns_seen:
173
- self._urns_seen.add(urn)
174
- self.entities[entityType].append(urn)
175
-
176
326
  if aspectName is None:
177
327
  continue
178
- self.aspects[entityType][aspectName] += 1
179
- self.aspect_urn_samples[entityType][aspectName].append(urn)
180
- sub_type = "unknown"
181
- if isinstance(mcp.aspect, UpstreamLineageClass):
182
- upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
183
- if upstream_lineage.fineGrainedLineages:
184
- self.aspect_urn_samples[entityType]["fineGrainedLineages"].append(
185
- urn
186
- )
187
- self.aspects[entityType]["fineGrainedLineages"] += 1
188
- elif isinstance(mcp.aspect, SubTypesClass):
189
- sub_type = mcp.aspect.typeNames[0]
190
- assert self._file_based_dict is not None
191
- if urn in self._file_based_dict:
192
- if sub_type != "unknown":
193
- self._file_based_dict[urn].subType = sub_type
194
- self._file_based_dict[urn].aspects.add(aspectName)
195
- self._file_based_dict.mark_dirty(urn)
196
- else:
197
- self._file_based_dict[urn] = SourceReportSubtypes(
198
- urn=urn,
199
- entity_type=entityType,
200
- subType=sub_type,
201
- aspects={aspectName},
202
- )
328
+
329
+ self._update_file_based_dict(urn, entityType, aspectName, mcp)
203
330
 
204
331
  def compute_stats(self) -> None:
205
332
  if self._file_based_dict is None:
206
333
  return
334
+
207
335
  query = """
208
336
  SELECT entityType, subTypes, aspects, count(*) as count
209
337
  FROM urn_aspects
@@ -223,11 +351,31 @@ class ExamplesReport(Report, Closeable):
223
351
  for aspect in aspects:
224
352
  entity_subtype_aspect_counts[entity_type][sub_type][aspect] += count
225
353
 
354
+ self.aspects.clear()
226
355
  self.aspects_by_subtypes.clear()
356
+ _aspects_seen: Set[str] = set()
227
357
  for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
228
358
  for sub_type, aspect_counts in subtype_counts.items():
359
+ for aspect, count in aspect_counts.items():
360
+ self.aspects[entity_type][aspect] += count
361
+ _aspects_seen.add(aspect)
229
362
  self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
230
363
 
364
+ self.samples.clear()
365
+ self._collect_samples_by_aspects(["datasetProfile"], "profiling")
366
+ self._collect_samples_by_aspects(
367
+ [
368
+ "datasetUsageStatistics",
369
+ "chartUsageStatistics",
370
+ "dashboardUsageStatistics",
371
+ ],
372
+ "usage",
373
+ )
374
+ self._collect_samples_by_lineage_aspects(
375
+ list(self._lineage_aspects_seen), "lineage"
376
+ )
377
+ self._collect_samples_with_all_conditions("all_3")
378
+
231
379
 
232
380
  class EntityFilterReport(ReportAttribute):
233
381
  type: str