acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (221)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/emitter/rest_emitter.py +70 -12
  36. datahub/entrypoints.py +4 -3
  37. datahub/ingestion/api/decorators.py +15 -3
  38. datahub/ingestion/api/report.py +332 -3
  39. datahub/ingestion/api/sink.py +3 -0
  40. datahub/ingestion/api/source.py +48 -44
  41. datahub/ingestion/autogenerated/__init__.py +0 -0
  42. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  43. datahub/ingestion/autogenerated/lineage.json +401 -0
  44. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  45. datahub/ingestion/extractor/schema_util.py +13 -4
  46. datahub/ingestion/glossary/classification_mixin.py +5 -0
  47. datahub/ingestion/graph/client.py +100 -15
  48. datahub/ingestion/graph/config.py +1 -0
  49. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  50. datahub/ingestion/run/pipeline.py +54 -2
  51. datahub/ingestion/sink/datahub_rest.py +13 -0
  52. datahub/ingestion/source/abs/source.py +1 -1
  53. datahub/ingestion/source/aws/aws_common.py +4 -0
  54. datahub/ingestion/source/aws/glue.py +489 -244
  55. datahub/ingestion/source/aws/tag_entities.py +292 -0
  56. datahub/ingestion/source/azure/azure_common.py +2 -2
  57. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  58. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  59. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  60. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  61. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  62. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  63. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  64. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  65. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  66. datahub/ingestion/source/common/subtypes.py +45 -0
  67. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  68. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  69. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  70. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  71. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  72. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  73. datahub/ingestion/source/debug/__init__.py +0 -0
  74. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  75. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  76. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  77. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  78. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  79. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  80. datahub/ingestion/source/file.py +3 -0
  81. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  82. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  83. datahub/ingestion/source/ge_data_profiler.py +76 -28
  84. datahub/ingestion/source/ge_profiling_config.py +11 -0
  85. datahub/ingestion/source/hex/api.py +26 -1
  86. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +1 -1
  88. datahub/ingestion/source/identity/okta.py +1 -14
  89. datahub/ingestion/source/kafka/kafka.py +16 -0
  90. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  91. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  92. datahub/ingestion/source/looker/looker_source.py +1 -0
  93. datahub/ingestion/source/mlflow.py +11 -1
  94. datahub/ingestion/source/mock_data/__init__.py +0 -0
  95. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  97. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  98. datahub/ingestion/source/nifi.py +1 -1
  99. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  100. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  101. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  102. datahub/ingestion/source/preset.py +2 -2
  103. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  104. datahub/ingestion/source/redshift/redshift.py +21 -1
  105. datahub/ingestion/source/redshift/usage.py +4 -3
  106. datahub/ingestion/source/s3/report.py +4 -2
  107. datahub/ingestion/source/s3/source.py +367 -115
  108. datahub/ingestion/source/sac/sac.py +3 -1
  109. datahub/ingestion/source/salesforce.py +6 -3
  110. datahub/ingestion/source/sigma/sigma.py +7 -1
  111. datahub/ingestion/source/slack/slack.py +2 -1
  112. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  113. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  114. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  115. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  116. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  117. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  118. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  119. datahub/ingestion/source/sql/athena.py +119 -11
  120. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  121. datahub/ingestion/source/sql/clickhouse.py +3 -1
  122. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  123. datahub/ingestion/source/sql/hana.py +3 -1
  124. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  125. datahub/ingestion/source/sql/mariadb.py +0 -1
  126. datahub/ingestion/source/sql/mssql/source.py +239 -34
  127. datahub/ingestion/source/sql/mysql.py +0 -1
  128. datahub/ingestion/source/sql/oracle.py +1 -1
  129. datahub/ingestion/source/sql/postgres.py +0 -1
  130. datahub/ingestion/source/sql/sql_common.py +121 -34
  131. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  132. datahub/ingestion/source/sql/teradata.py +997 -235
  133. datahub/ingestion/source/sql/vertica.py +10 -6
  134. datahub/ingestion/source/sql_queries.py +2 -2
  135. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  136. datahub/ingestion/source/superset.py +58 -3
  137. datahub/ingestion/source/tableau/tableau.py +58 -37
  138. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  139. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  140. datahub/ingestion/source/unity/config.py +5 -0
  141. datahub/ingestion/source/unity/proxy.py +118 -0
  142. datahub/ingestion/source/unity/source.py +195 -17
  143. datahub/ingestion/source/unity/tag_entities.py +295 -0
  144. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  145. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  146. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  147. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  148. datahub/metadata/_internal_schema_classes.py +1433 -546
  149. datahub/metadata/_urns/urn_defs.py +1826 -1658
  150. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  151. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  157. datahub/metadata/schema.avsc +17736 -17112
  158. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  159. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  160. datahub/metadata/schemas/Applications.avsc +38 -0
  161. datahub/metadata/schemas/ChartKey.avsc +1 -0
  162. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  164. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  165. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  166. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  167. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  168. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  169. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  170. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  171. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  172. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  173. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  176. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  177. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  178. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  179. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  180. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  181. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  182. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  183. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  184. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  185. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  186. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  187. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  188. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  189. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  190. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  191. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  192. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  193. datahub/metadata/schemas/__init__.py +3 -3
  194. datahub/sdk/__init__.py +2 -0
  195. datahub/sdk/_all_entities.py +7 -0
  196. datahub/sdk/_shared.py +116 -0
  197. datahub/sdk/chart.py +315 -0
  198. datahub/sdk/container.py +7 -0
  199. datahub/sdk/dashboard.py +432 -0
  200. datahub/sdk/dataflow.py +7 -0
  201. datahub/sdk/datajob.py +45 -13
  202. datahub/sdk/dataset.py +8 -2
  203. datahub/sdk/entity_client.py +82 -2
  204. datahub/sdk/lineage_client.py +683 -82
  205. datahub/sdk/main_client.py +46 -16
  206. datahub/sdk/mlmodel.py +101 -38
  207. datahub/sdk/mlmodelgroup.py +7 -0
  208. datahub/sdk/search_client.py +4 -3
  209. datahub/specific/chart.py +1 -1
  210. datahub/specific/dataproduct.py +4 -0
  211. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  212. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  213. datahub/telemetry/telemetry.py +17 -11
  214. datahub/testing/sdk_v2_helpers.py +7 -1
  215. datahub/upgrade/upgrade.py +46 -13
  216. datahub/utilities/server_config_util.py +8 -0
  217. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  218. datahub/utilities/stats_collections.py +4 -0
  219. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
  220. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/entrypoints.py CHANGED
@@ -10,6 +10,7 @@ import click
 import datahub._version as datahub_version
 from datahub.cli.check_cli import check
 from datahub.cli.cli_utils import (
+    enable_auto_decorators,
     fixup_gms_url,
     generate_access_token,
     make_shim_command,
@@ -38,7 +39,6 @@ from datahub.cli.timeline_cli import timeline
 from datahub.configuration.common import should_show_stack_trace
 from datahub.ingestion.graph.client import get_default_graph
 from datahub.ingestion.graph.config import ClientMode
-from datahub.telemetry import telemetry
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.logging_manager import configure_logging
 from datahub.utilities.server_config_util import get_gms_config
@@ -111,7 +111,6 @@ def datahub(
     default=False,
     help="If passed will show server config. Assumes datahub init has happened.",
 )
-@telemetry.with_telemetry()
 def version(include_server: bool = False) -> None:
     """Print version number and exit."""
 
@@ -131,7 +130,6 @@ def version(include_server: bool = False) -> None:
     default=False,
     help="If passed then uses password to initialise token.",
 )
-@telemetry.with_telemetry()
 def init(use_password: bool = False) -> None:
     """Configure which datahub instance to connect to"""
 
@@ -218,6 +216,9 @@ except ImportError as e:
         make_shim_command("actions", "run `pip install acryl-datahub-actions`")
     )
 
+# Adding telemetry and upgrade decorators to all commands
+enable_auto_decorators(datahub)
+
 
 def main(**kwargs):
     # We use threads in a variety of places within our CLI. The multiprocessing
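
The removed per-command `@telemetry.with_telemetry()` decorators are superseded by the single `enable_auto_decorators(datahub)` call above, which applies the telemetry and upgrade decorators to every registered command at once. A minimal sketch of how such a pass over a click group can work (illustrative only; the actual implementation lives in datahub/cli/cli_utils.py):

    import click

    def apply_to_all_commands(group: click.Group, decorator) -> None:
        # Recursively wrap the callback of every command in a click group.
        for command in group.commands.values():
            if isinstance(command, click.Group):
                apply_to_all_commands(command, decorator)
            if command.callback is not None:
                command.callback = decorator(command.callback)
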
datahub/ingestion/api/decorators.py CHANGED
@@ -1,12 +1,16 @@
+# So that SourceCapabilityModifier can be resolved at runtime
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, Optional, Type
+from typing import Callable, Dict, List, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 
 
 def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
     capability: SourceCapability
     description: str
     supported: bool
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None
 
 
 def capability(
-    capability_name: SourceCapability, description: str, supported: bool = True
+    capability_name: SourceCapability,
+    description: str,
+    supported: bool = True,
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
 ) -> Callable[[Type], Type]:
     """
     A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
             for base in cls.__bases__
         ):
             cls.__capabilities = {}
+
             cls.get_capabilities = lambda: cls.__capabilities.values()
 
         # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
             cls.__capabilities.update(base_caps)
 
         cls.__capabilities[capability_name] = CapabilitySetting(
-            capability=capability_name, description=description, supported=supported
+            capability=capability_name,
+            description=description,
+            supported=supported,
+            subtype_modifier=subtype_modifier,
         )
         return cls
 
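
With the new parameter, a connector can declare which container subtypes a capability applies to. A usage sketch under assumptions (the `SCHEMA` member of `SourceCapabilityModifier` is assumed here for illustration; the real members are defined in datahub/ingestion/source/common/subtypes.py, which this release extends by 45 lines):

    from datahub.ingestion.api.decorators import capability
    from datahub.ingestion.api.source import SourceCapability
    from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier

    @capability(
        SourceCapability.CONTAINERS,
        "Enabled by default",
        # Assumed enum member, for illustration only.
        subtype_modifier=[SourceCapabilityModifier.SCHEMA],
    )
    class MyDemoSource:  # a real connector would subclass Source
        pass
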
datahub/ingestion/api/report.py CHANGED
@@ -2,17 +2,31 @@ import dataclasses
 import json
 import logging
 import pprint
-from dataclasses import dataclass
+from collections import defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Optional, runtime_checkable
+from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable
 
 import humanfriendly
 import pydantic
 from pydantic import BaseModel
+from tabulate import tabulate
 from typing_extensions import Literal, Protocol
 
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import mcps_from_mce
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report_helpers import format_datetime_relative
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.schema_classes import (
+    MetadataChangeProposalClass,
+    SubTypesClass,
+    UpstreamLineageClass,
+)
+from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
@@ -82,7 +96,58 @@ class Report(SupportsAsObj):
         }
 
     def as_string(self) -> str:
-        return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)
+        self_obj = self.as_obj()
+        _aspects_by_subtypes = self_obj.pop("aspects_by_subtypes", None)
+
+        # Format the main report data
+        result = pprint.pformat(self_obj, width=150, sort_dicts=False)
+
+        # Add aspects_by_subtypes table if it exists
+        if _aspects_by_subtypes:
+            result += "\n\nAspects by Subtypes:\n"
+            result += self._format_aspects_by_subtypes_table(_aspects_by_subtypes)
+
+        return result
+
+    def _format_aspects_by_subtypes_table(
+        self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
+    ) -> str:
+        """Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
+        if not aspects_by_subtypes:
+            return "No aspects by subtypes data available."
+
+        all_aspects: set[str] = {
+            aspect
+            for subtypes in aspects_by_subtypes.values()
+            for aspects in subtypes.values()
+            for aspect in aspects
+        }
+
+        aspect_rows = sorted(all_aspects)
+
+        entity_subtype_columns = []
+        for entity_type, subtypes in aspects_by_subtypes.items():
+            for subtype in subtypes:
+                entity_subtype_columns.append(f"{entity_type} ({subtype})")
+
+        entity_subtype_columns.sort()
+
+        headers = ["Aspect"] + entity_subtype_columns
+
+        table_data = [
+            [aspect]
+            + [
+                aspects.get(aspect, 0)
+                for subtypes in aspects_by_subtypes.values()
+                for aspects in subtypes.values()
+            ]
+            for aspect in aspect_rows
+        ]
+
+        if table_data:
+            return tabulate(table_data, headers=headers, tablefmt="grid")
+        else:
+            return "No aspects by subtypes data available."
 
     def as_json(self) -> str:
         return json.dumps(self.as_obj())
@@ -90,6 +155,14 @@ class Report(SupportsAsObj):
     # TODO add helper method for warning / failure status + counts?
 
 
+@dataclass
+class SourceReportSubtypes:
+    urn: str
+    entity_type: str
+    subType: str = field(default="unknown")
+    aspects: Dict[str, int] = field(default_factory=dict)
+
+
 class ReportAttribute(BaseModel):
     severity: LogLevel = "DEBUG"
     help: Optional[str] = None
@@ -108,6 +181,262 @@ class ReportAttribute(BaseModel):
         logger.log(level=self.logger_sev, msg=msg, stacklevel=3)
 
 
+@dataclass
+class ExamplesReport(Report, Closeable):
+    aspects: Dict[str, Dict[str, int]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(int))
+    )
+    aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
+        default_factory=lambda: defaultdict(
+            lambda: defaultdict(lambda: defaultdict(int))
+        )
+    )
+    samples: Dict[str, Dict[str, List[str]]] = field(
+        default_factory=lambda: defaultdict(lambda: defaultdict(list))
+    )
+    _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
+
+    # We are adding this to make querying easier for fine-grained lineage
+    _fine_grained_lineage_special_case_name = "fineGrainedLineages"
+    _samples_to_add: int = 20
+    _lineage_aspects_seen: Set[str] = field(default_factory=set)
+
+    def __post_init__(self) -> None:
+        self._file_based_dict = FileBackedDict(
+            tablename="urn_aspects",
+            extra_columns={
+                "urn": lambda val: val.urn,
+                "entityType": lambda val: val.entity_type,
+                "subTypes": lambda val: val.subType,
+                "aspects": lambda val: json.dumps(val.aspects),
+            },
+        )
+
+    def close(self) -> None:
+        self.compute_stats()
+        if self._file_based_dict is not None:
+            self._file_based_dict.close()
+            self._file_based_dict = None
+
+    def _build_aspects_where_clause(self, aspects: List[str]) -> str:
+        """Build WHERE clause for matching any of the given aspects."""
+        if not aspects:
+            return ""
+
+        conditions = []
+        for aspect in aspects:
+            conditions.append(f"aspects LIKE '%{aspect}%'")
+
+        return " OR ".join(conditions)
+
+    def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
+        """Helper method to collect samples organized by subtype for a given where clause."""
+
+        subtype_query = f"""
+            SELECT DISTINCT subTypes
+            FROM urn_aspects
+            WHERE {where_clause}
+        """
+        assert self._file_based_dict is not None
+        subtypes = set()
+        for row in self._file_based_dict.sql_query(subtype_query):
+            sub_type = row["subTypes"] or "unknown"
+            subtypes.add(sub_type)
+
+        for sub_type in subtypes:
+            query = f"""
+                SELECT urn
+                FROM urn_aspects
+                WHERE {where_clause} AND subTypes = ?
+                limit {self._samples_to_add}
+            """
+
+            for row in self._file_based_dict.sql_query(query, (sub_type,)):
+                self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
+        """Helper method to collect samples for entities that have any of the given aspects."""
+        if not aspects:
+            return
+
+        where_clause = self._build_aspects_where_clause(aspects)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_by_lineage_aspects(
+        self, aspects: List[str], sample_key: str
+    ) -> None:
+        """Helper method to collect samples for entities that have any of the given lineage aspects.
+
+        Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
+        """
+        if not aspects:
+            return
+
+        lineage_conditions = []
+        for aspect in aspects:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+
+        where_clause = " OR ".join(lineage_conditions)
+        self._collect_samples_by_subtype(where_clause, sample_key)
+
+    def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
+        """
+        Collect samples for entities that have lineage, profiling, and usage aspects.
+        These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
+        """
+        if not self._lineage_aspects_seen:
+            return
+        assert self._file_based_dict is not None
+
+        # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
+        lineage_conditions = []
+        for aspect in self._lineage_aspects_seen:
+            lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+        lineage_where_clause = " OR ".join(lineage_conditions)
+
+        # Build profiling conditions using the same logic as _collect_samples_by_aspects
+        profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
+
+        # Build usage conditions using the same logic as _collect_samples_by_aspects
+        usage_where_clause = self._build_aspects_where_clause(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ]
+        )
+
+        query = f"""
+            SELECT urn, subTypes
+            FROM urn_aspects
+            WHERE ({lineage_where_clause})
+            AND ({profiling_where_clause})
+            AND ({usage_where_clause})
+            limit {self._samples_to_add}
+        """
+
+        for row in self._file_based_dict.sql_query(query):
+            sub_type = row["subTypes"] or "unknown"
+            self.samples[sample_key][sub_type].append(row["urn"])
+
+    def _has_fine_grained_lineage(
+        self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
+    ) -> bool:
+        if isinstance(mcp.aspect, UpstreamLineageClass):
+            upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
+            if upstream_lineage.fineGrainedLineages:
+                return True
+        return False
+
+    def _update_file_based_dict(
+        self,
+        urn: str,
+        entityType: str,
+        aspectName: str,
+        mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
+    ) -> None:
+        if is_lineage_aspect(entityType, aspectName):
+            self._lineage_aspects_seen.add(aspectName)
+        has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)
+
+        sub_type = "unknown"
+        if isinstance(mcp.aspect, SubTypesClass):
+            sub_type = mcp.aspect.typeNames[0]
+
+        assert self._file_based_dict is not None
+        if urn in self._file_based_dict:
+            if sub_type != "unknown":
+                self._file_based_dict[urn].subType = sub_type
+            aspects_dict = self._file_based_dict[urn].aspects
+            if aspectName in aspects_dict:
+                aspects_dict[aspectName] += 1
+            else:
+                aspects_dict[aspectName] = 1
+            if has_fine_grained_lineage:
+                if self._fine_grained_lineage_special_case_name in aspects_dict:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] += 1
+                else:
+                    aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict.mark_dirty(urn)
+        else:
+            aspects_dict = {aspectName: 1}
+            if has_fine_grained_lineage:
+                aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+            self._file_based_dict[urn] = SourceReportSubtypes(
+                urn=urn,
+                entity_type=entityType,
+                subType=sub_type,
+                aspects=aspects_dict,
+            )
+
+    def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
+        urn = wu.get_urn()
+
+        if not isinstance(wu.metadata, MetadataChangeEvent):
+            mcps = [wu.metadata]
+        else:
+            mcps = list(mcps_from_mce(wu.metadata))
+
+        for mcp in mcps:
+            entityType = mcp.entityType
+            aspectName = mcp.aspectName
+
+            if aspectName is None:
+                continue
+
+            self._update_file_based_dict(urn, entityType, aspectName, mcp)
+
+    def compute_stats(self) -> None:
+        if self._file_based_dict is None:
+            return
+
+        query = """
+            SELECT entityType, subTypes, aspects, count(*) as count
+            FROM urn_aspects
+            group by entityType, subTypes, aspects
+        """
+
+        entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        )
+        for row in self._file_based_dict.sql_query(query):
+            entity_type = row["entityType"]
+            sub_type = row["subTypes"]
+            count = row["count"]
+            aspects_raw = row["aspects"] or "[]"
+
+            aspects = json.loads(aspects_raw)
+            for aspect, aspect_count in aspects.items():
+                entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
+                    aspect_count * count
+                )
+
+        self.aspects.clear()
+        self.aspects_by_subtypes.clear()
+        _aspects_seen: Set[str] = set()
+        for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
+            for sub_type, aspect_counts in subtype_counts.items():
+                for aspect, count in aspect_counts.items():
+                    self.aspects[entity_type][aspect] += count
+                    _aspects_seen.add(aspect)
+                self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
+
+        self.samples.clear()
+        self._collect_samples_by_aspects(["datasetProfile"], "profiling")
+        self._collect_samples_by_aspects(
+            [
+                "datasetUsageStatistics",
+                "chartUsageStatistics",
+                "dashboardUsageStatistics",
+            ],
+            "usage",
+        )
+        self._collect_samples_by_lineage_aspects(
+            list(self._lineage_aspects_seen), "lineage"
+        )
+        self._collect_samples_with_all_conditions("all_3")
+
+
 class EntityFilterReport(ReportAttribute):
     type: str
 
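
To get a feel for the new `as_string()` output, here is a standalone sketch of the same `tabulate` call used by `_format_aspects_by_subtypes_table`, fed with invented counts shaped like `aspects_by_subtypes` ({entity_type: {subtype: {aspect: count}}}):

    from tabulate import tabulate

    headers = ["Aspect", "container (Schema)", "dataset (Table)"]
    rows = [
        ["schemaMetadata", 0, 42],
        ["status", 5, 42],
    ]
    # Renders an ASCII grid with aspects as rows and entity (subtype) columns.
    print(tabulate(rows, headers=headers, tablefmt="grid"))
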
datahub/ingestion/api/sink.py CHANGED
@@ -147,6 +147,9 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
     def close(self) -> None:
         pass
 
+    def flush(self) -> None:
+        pass
+
     def configured(self) -> str:
         """Override this method to output a human-readable and scrubbed version of the configured sink"""
         return ""
datahub/ingestion/api/source.py CHANGED
@@ -2,7 +2,6 @@ import contextlib
 import datetime
 import logging
 from abc import ABCMeta, abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
 from functools import partial
@@ -15,7 +14,6 @@ from typing import (
     List,
     Optional,
     Sequence,
-    Set,
     Type,
     TypeVar,
     Union,
@@ -28,7 +26,6 @@ from typing_extensions import LiteralString, Self
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
@@ -37,7 +34,7 @@ from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
 )
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
-from datahub.ingestion.api.report import Report
+from datahub.ingestion.api.report import ExamplesReport, Report
 from datahub.ingestion.api.source_helpers import (
     AutoSystemMetadata,
     auto_browse_path_v2,
@@ -50,9 +47,8 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
-from datahub.metadata.schema_classes import UpstreamLineageClass
 from datahub.sdk.entity import Entity
+from datahub.telemetry import stats
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation
 
@@ -76,6 +72,7 @@ class SourceCapability(Enum):
     SCHEMA_METADATA = "Schema Metadata"
     CONTAINERS = "Asset Containers"
     CLASSIFICATION = "Classification"
+    TEST_CONNECTION = "Test Connection"
 
 
 class StructuredLogLevel(Enum):
@@ -190,20 +187,11 @@ class StructuredLogs(Report):
 
 
 @dataclass
-class SourceReport(Report):
+class SourceReport(ExamplesReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
 
-    _urns_seen: Set[str] = field(default_factory=set)
-    entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
-    aspects: Dict[str, Dict[str, int]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(int))
-    )
-    aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
-    )
-
     _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)
 
     @property
@@ -220,33 +208,10 @@ class SourceReport(Report):
 
     def report_workunit(self, wu: WorkUnit) -> None:
         self.events_produced += 1
+        if not isinstance(wu, MetadataWorkUnit):
+            return
 
-        if isinstance(wu, MetadataWorkUnit):
-            urn = wu.get_urn()
-
-            # Specialized entity reporting.
-            if not isinstance(wu.metadata, MetadataChangeEvent):
-                mcps = [wu.metadata]
-            else:
-                mcps = list(mcps_from_mce(wu.metadata))
-
-            for mcp in mcps:
-                entityType = mcp.entityType
-                aspectName = mcp.aspectName
-
-                if urn not in self._urns_seen:
-                    self._urns_seen.add(urn)
-                    self.entities[entityType].append(urn)
-
-                if aspectName is not None:  # usually true
-                    self.aspects[entityType][aspectName] += 1
-                    self.aspect_urn_samples[entityType][aspectName].append(urn)
-                    if isinstance(mcp.aspect, UpstreamLineageClass):
-                        upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
-                        if upstream_lineage.fineGrainedLineages:
-                            self.aspect_urn_samples[entityType][
-                                "fineGrainedLineages"
-                            ].append(urn)
+        super()._store_workunit_data(wu)
 
     def report_warning(
         self,
@@ -265,9 +230,10 @@ class SourceReport(Report):
         context: Optional[str] = None,
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
+        log: bool = True,
     ) -> None:
         self._structured_logs.report_log(
-            StructuredLogLevel.WARN, message, title, context, exc, log=True
+            StructuredLogLevel.WARN, message, title, context, exc, log=log
        )
 
     def report_failure(
@@ -325,6 +291,7 @@ class SourceReport(Report):
         )
 
     def __post_init__(self) -> None:
+        super().__post_init__()
         self.start_time = datetime.datetime.now()
         self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)
 
@@ -337,6 +304,43 @@ class SourceReport(Report):
             "infos": Report.to_pure_python_obj(self.infos),
         }
 
+    @staticmethod
+    def _discretize_dict_values(
+        nested_dict: Dict[str, Dict[str, int]],
+    ) -> Dict[str, Dict[str, int]]:
+        """Helper method to discretize values in a nested dictionary structure."""
+        result = {}
+        for outer_key, inner_dict in nested_dict.items():
+            discretized_dict: Dict[str, int] = {}
+            for inner_key, count in inner_dict.items():
+                discretized_dict[inner_key] = stats.discretize(count)
+            result[outer_key] = discretized_dict
+        return result
+
+    def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
+        """Convert the nested defaultdict aspects to a regular dict for serialization."""
+        return self._discretize_dict_values(self.aspects)
+
+    def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Get aspect counts grouped by entity type and subtype."""
+        return self._discretize_dict_values_nested(self.aspects_by_subtypes)
+
+    @staticmethod
+    def _discretize_dict_values_nested(
+        nested_dict: Dict[str, Dict[str, Dict[str, int]]],
+    ) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Helper method to discretize values in a nested dictionary structure with three levels."""
+        result = {}
+        for outer_key, middle_dict in nested_dict.items():
+            discretized_middle_dict: Dict[str, Dict[str, int]] = {}
+            for middle_key, inner_dict in middle_dict.items():
+                discretized_inner_dict: Dict[str, int] = {}
+                for inner_key, count in inner_dict.items():
+                    discretized_inner_dict[inner_key] = stats.discretize(count)
+                discretized_middle_dict[middle_key] = discretized_inner_dict
+            result[outer_key] = discretized_middle_dict
+        return result
+
     def compute_stats(self) -> None:
         super().compute_stats()
 
@@ -503,7 +507,7 @@ class Source(Closeable, metaclass=ABCMeta):
         pass
 
     def close(self) -> None:
-        pass
+        self.get_report().close()
 
     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
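
Two behavioral notes on this file: `report_warning()` now forwards a `log` flag, so a source can record a structured warning without also emitting a log line, and the default `Source.close()` now closes the report, which flushes and releases the `FileBackedDict` behind the new subtype statistics. A small sketch of the quiet-warning pattern (the message and context values are invented):

    from datahub.ingestion.api.source import SourceReport

    def warn_quietly(report: SourceReport, table_name: str) -> None:
        # Recorded in the structured report, but no log line is emitted.
        report.report_warning(
            message="Table skipped due to unsupported format",
            context=table_name,
            log=False,
        )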