acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py CHANGED
@@ -4,6 +4,7 @@ import functools
  import json
  import logging
  import os
+ import re
  import time
  from collections import defaultdict
  from dataclasses import dataclass
@@ -60,6 +61,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
      MetadataChangeProposal,
  )
  from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
+ from datahub.metadata.schema_classes import (
+     KEY_ASPECT_NAMES,
+     ChangeTypeClass,
+ )
  from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature

  if TYPE_CHECKING:
@@ -104,6 +109,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
  )


+ def preserve_unicode_escapes(obj: Any) -> Any:
+     """Recursively convert unicode characters back to escape sequences"""
+     if isinstance(obj, dict):
+         return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+     elif isinstance(obj, list):
+         return [preserve_unicode_escapes(item) for item in obj]
+     elif isinstance(obj, str):
+         # Convert non-ASCII characters back to \u escapes
+         def escape_unicode(match: Any) -> Any:
+             return f"\\u{ord(match.group(0)):04x}"
+
+         return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+     else:
+         return obj
+
+
  class EmitMode(ConfigEnum):
      # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
      # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
@@ -314,6 +335,7 @@ class DataHubRestEmitter(Closeable, Emitter):
          openapi_ingestion: Optional[bool] = None,
          client_mode: Optional[ClientMode] = None,
          datahub_component: Optional[str] = None,
+         server_config_refresh_interval: Optional[int] = None,
      ):
          if not gms_server:
              raise ConfigurationError("gms server is required")
@@ -329,6 +351,8 @@ class DataHubRestEmitter(Closeable, Emitter):
          self._openapi_ingestion = (
              openapi_ingestion  # Re-evaluated after test connection
          )
+         self._server_config_refresh_interval = server_config_refresh_interval
+         self._config_fetch_time: Optional[float] = None

          headers = {
              "X-RestLi-Protocol-Version": "2.0.0",
@@ -398,7 +422,17 @@ class DataHubRestEmitter(Closeable, Emitter):
          Raises:
              ConfigurationError: If there's an error fetching or validating the configuration
          """
-         if not hasattr(self, "_server_config") or not self._server_config:
+
+         if (
+             not hasattr(self, "_server_config")
+             or self._server_config is None
+             or (
+                 self._server_config_refresh_interval is not None
+                 and self._config_fetch_time is not None
+                 and (time.time() - self._config_fetch_time)
+                 > self._server_config_refresh_interval
+             )
+         ):
              if self._session is None or self._gms_server is None:
                  raise ConfigurationError(
                      "Session and URL are required to load configuration"
@@ -419,6 +453,7 @@ class DataHubRestEmitter(Closeable, Emitter):
              )

              self._server_config = RestServiceConfig(raw_config=raw_config)
+             self._config_fetch_time = time.time()
              self._post_fetch_server_config()

          else:
@@ -453,6 +488,8 @@ class DataHubRestEmitter(Closeable, Emitter):
              DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
          )

+     def test_connection(self) -> None:
+         self.fetch_server_config()
          logger.debug(
              f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
          )
@@ -460,12 +497,21 @@ class DataHubRestEmitter(Closeable, Emitter):
              f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
          )

-     def test_connection(self) -> None:
-         self.fetch_server_config()
-
      def get_server_config(self) -> dict:
          return self.server_config.raw_config

+     def invalidate_config_cache(self) -> None:
+         """Manually invalidate the configuration cache."""
+         if (
+             hasattr(self, "_server_config")
+             and self._server_config is not None
+             and self._server_config_refresh_interval is not None
+         ):
+             # Set fetch time to beyond TTL in the past to force refresh on next access
+             self._config_fetch_time = (
+                 time.time() - self._server_config_refresh_interval - 1
+             )
+
      def to_graph(self) -> "DataHubGraph":
          from datahub.ingestion.graph.client import DataHubGraph

@@ -584,15 +630,27 @@ class DataHubRestEmitter(Closeable, Emitter):
              trace_data = extract_trace_data(response) if response else None

          else:
-             url = f"{self._gms_server}/aspects?action=ingestProposal"
+             if mcp.changeType == ChangeTypeClass.DELETE:
+                 if mcp.aspectName not in KEY_ASPECT_NAMES:
+                     raise OperationalError(
+                         f"Delete not supported for non key aspect: {mcp.aspectName} for urn: "
+                         f"{mcp.entityUrn}"
+                     )

-             mcp_obj = pre_json_transform(mcp.to_obj())
-             payload_dict = {
-                 "proposal": mcp_obj,
-                 "async": "true"
-                 if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
-                 else "false",
-             }
+                 url = f"{self._gms_server}/entities?action=delete"
+                 payload_dict = {
+                     "urn": mcp.entityUrn,
+                 }
+             else:
+                 url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+                 mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
+                 payload_dict = {
+                     "proposal": mcp_obj,
+                     "async": "true"
+                     if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
+                     else "false",
+                 }

              payload = json.dumps(payload_dict)

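The emitter changes above add an optional server_config_refresh_interval (a TTL in seconds after which the cached /config response is re-fetched) and a preserve_unicode_escapes helper that rewrites non-ASCII characters as literal \uXXXX escape sequences before the proposal is serialized. A condensed, standalone sketch of that escaping idea (same logic as the diffed function, written with a lambda instead of the nested helper):

    import re
    from typing import Any

    def preserve_unicode_escapes(obj: Any) -> Any:
        """Recursively convert non-ASCII characters back to \\uXXXX escape sequences."""
        if isinstance(obj, dict):
            return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [preserve_unicode_escapes(item) for item in obj]
        elif isinstance(obj, str):
            # Replace every non-ASCII character with its escape sequence.
            return re.sub(r"[^\x00-\x7F]", lambda m: f"\\u{ord(m.group(0)):04x}", obj)
        return obj

    # Prints: {'name': 'caf\\u00e9', 'tags': ['r\\u00e9sum\\u00e9']}
    print(preserve_unicode_escapes({"name": "café", "tags": ["résumé"]}))
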
datahub/entrypoints.py CHANGED
@@ -10,6 +10,7 @@ import click
  import datahub._version as datahub_version
  from datahub.cli.check_cli import check
  from datahub.cli.cli_utils import (
+     enable_auto_decorators,
      fixup_gms_url,
      generate_access_token,
      make_shim_command,
@@ -38,7 +39,6 @@ from datahub.cli.timeline_cli import timeline
  from datahub.configuration.common import should_show_stack_trace
  from datahub.ingestion.graph.client import get_default_graph
  from datahub.ingestion.graph.config import ClientMode
- from datahub.telemetry import telemetry
  from datahub.utilities._custom_package_loader import model_version_name
  from datahub.utilities.logging_manager import configure_logging
  from datahub.utilities.server_config_util import get_gms_config
@@ -111,7 +111,6 @@ def datahub(
      default=False,
      help="If passed will show server config. Assumes datahub init has happened.",
  )
- @telemetry.with_telemetry()
  def version(include_server: bool = False) -> None:
      """Print version number and exit."""

@@ -131,7 +130,6 @@ def version(include_server: bool = False) -> None:
      default=False,
      help="If passed then uses password to initialise token.",
  )
- @telemetry.with_telemetry()
  def init(use_password: bool = False) -> None:
      """Configure which datahub instance to connect to"""

@@ -218,6 +216,9 @@ except ImportError as e:
          make_shim_command("actions", "run `pip install acryl-datahub-actions`")
      )

+ # Adding telemetry and upgrade decorators to all commands
+ enable_auto_decorators(datahub)
+

  def main(**kwargs):
      # We use threads in a variety of places within our CLI. The multiprocessing
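The per-command @telemetry.with_telemetry() decorators are dropped in favor of a single enable_auto_decorators(datahub) call made once all commands are registered. Its implementation lives in datahub/cli/cli_utils.py and is not shown in this diff; purely as an illustration of the pattern, a hypothetical helper that walks a click group and wraps every registered callback could look like this:

    from typing import Callable

    import click

    def wrap_all_commands(group: click.Group, decorator: Callable) -> None:
        """Illustrative only: recursively wrap the callback of every command in a click group."""
        for cmd in group.commands.values():
            if isinstance(cmd, click.Group):
                wrap_all_commands(cmd, decorator)
            elif cmd.callback is not None:
                cmd.callback = decorator(cmd.callback)
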
datahub/ingestion/api/decorators.py CHANGED
@@ -1,12 +1,16 @@
+ # So that SourceCapabilityModifier can be resolved at runtime
+ from __future__ import annotations
+
  from dataclasses import dataclass
  from enum import Enum, auto
- from typing import Callable, Dict, Optional, Type
+ from typing import Callable, Dict, List, Optional, Type

  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.source import (
      Source,
      SourceCapability as SourceCapability,
  )
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


  def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
      capability: SourceCapability
      description: str
      supported: bool
+     subtype_modifier: Optional[List[SourceCapabilityModifier]] = None


  def capability(
-     capability_name: SourceCapability, description: str, supported: bool = True
+     capability_name: SourceCapability,
+     description: str,
+     supported: bool = True,
+     subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
  ) -> Callable[[Type], Type]:
      """
      A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
              for base in cls.__bases__
          ):
              cls.__capabilities = {}
+
          cls.get_capabilities = lambda: cls.__capabilities.values()

          # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
              cls.__capabilities.update(base_caps)

          cls.__capabilities[capability_name] = CapabilitySetting(
-             capability=capability_name, description=description, supported=supported
+             capability=capability_name,
+             description=description,
+             supported=supported,
+             subtype_modifier=subtype_modifier,
          )
          return cls

datahub/ingestion/api/report.py CHANGED
@@ -2,17 +2,31 @@ import dataclasses
  import json
  import logging
  import pprint
- from dataclasses import dataclass
+ from collections import defaultdict
+ from dataclasses import dataclass, field
  from datetime import datetime, timedelta
  from enum import Enum
- from typing import Any, Optional, runtime_checkable
+ from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable

  import humanfriendly
  import pydantic
  from pydantic import BaseModel
+ from tabulate import tabulate
  from typing_extensions import Literal, Protocol

+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.emitter.mcp_builder import mcps_from_mce
+ from datahub.ingestion.api.closeable import Closeable
  from datahub.ingestion.api.report_helpers import format_datetime_relative
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
+ from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+ from datahub.metadata.schema_classes import (
+     MetadataChangeProposalClass,
+     SubTypesClass,
+     UpstreamLineageClass,
+ )
+ from datahub.utilities.file_backed_collections import FileBackedDict
  from datahub.utilities.lossy_collections import LossyList

  logger = logging.getLogger(__name__)
@@ -82,7 +96,58 @@ class Report(SupportsAsObj):
          }

      def as_string(self) -> str:
-         return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)
+         self_obj = self.as_obj()
+         _aspects_by_subtypes = self_obj.pop("aspects_by_subtypes", None)
+
+         # Format the main report data
+         result = pprint.pformat(self_obj, width=150, sort_dicts=False)
+
+         # Add aspects_by_subtypes table if it exists
+         if _aspects_by_subtypes:
+             result += "\n\nAspects by Subtypes:\n"
+             result += self._format_aspects_by_subtypes_table(_aspects_by_subtypes)
+
+         return result
+
+     def _format_aspects_by_subtypes_table(
+         self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
+     ) -> str:
+         """Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
+         if not aspects_by_subtypes:
+             return "No aspects by subtypes data available."
+
+         all_aspects: set[str] = {
+             aspect
+             for subtypes in aspects_by_subtypes.values()
+             for aspects in subtypes.values()
+             for aspect in aspects
+         }
+
+         aspect_rows = sorted(all_aspects)
+
+         entity_subtype_columns = []
+         for entity_type, subtypes in aspects_by_subtypes.items():
+             for subtype in subtypes:
+                 entity_subtype_columns.append(f"{entity_type} ({subtype})")
+
+         entity_subtype_columns.sort()
+
+         headers = ["Aspect"] + entity_subtype_columns
+
+         table_data = [
+             [aspect]
+             + [
+                 aspects.get(aspect, 0)
+                 for subtypes in aspects_by_subtypes.values()
+                 for aspects in subtypes.values()
+             ]
+             for aspect in aspect_rows
+         ]
+
+         if table_data:
+             return tabulate(table_data, headers=headers, tablefmt="grid")
+         else:
+             return "No aspects by subtypes data available."

      def as_json(self) -> str:
          return json.dumps(self.as_obj())
@@ -90,6 +155,14 @@
      # TODO add helper method for warning / failure status + counts?


+ @dataclass
+ class SourceReportSubtypes:
+     urn: str
+     entity_type: str
+     subType: str = field(default="unknown")
+     aspects: Dict[str, int] = field(default_factory=dict)
+
+
  class ReportAttribute(BaseModel):
      severity: LogLevel = "DEBUG"
      help: Optional[str] = None
@@ -108,6 +181,262 @@ class ReportAttribute(BaseModel):
          logger.log(level=self.logger_sev, msg=msg, stacklevel=3)


+ @dataclass
+ class ExamplesReport(Report, Closeable):
+     aspects: Dict[str, Dict[str, int]] = field(
+         default_factory=lambda: defaultdict(lambda: defaultdict(int))
+     )
+     aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
+         default_factory=lambda: defaultdict(
+             lambda: defaultdict(lambda: defaultdict(int))
+         )
+     )
+     samples: Dict[str, Dict[str, List[str]]] = field(
+         default_factory=lambda: defaultdict(lambda: defaultdict(list))
+     )
+     _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
+
+     # We are adding this to make querying easier for fine-grained lineage
+     _fine_grained_lineage_special_case_name = "fineGrainedLineages"
+     _samples_to_add: int = 20
+     _lineage_aspects_seen: Set[str] = field(default_factory=set)
+
+     def __post_init__(self) -> None:
+         self._file_based_dict = FileBackedDict(
+             tablename="urn_aspects",
+             extra_columns={
+                 "urn": lambda val: val.urn,
+                 "entityType": lambda val: val.entity_type,
+                 "subTypes": lambda val: val.subType,
+                 "aspects": lambda val: json.dumps(val.aspects),
+             },
+         )
+
+     def close(self) -> None:
+         self.compute_stats()
+         if self._file_based_dict is not None:
+             self._file_based_dict.close()
+             self._file_based_dict = None
+
+     def _build_aspects_where_clause(self, aspects: List[str]) -> str:
+         """Build WHERE clause for matching any of the given aspects."""
+         if not aspects:
+             return ""
+
+         conditions = []
+         for aspect in aspects:
+             conditions.append(f"aspects LIKE '%{aspect}%'")
+
+         return " OR ".join(conditions)
+
+     def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
+         """Helper method to collect samples organized by subtype for a given where clause."""
+
+         subtype_query = f"""
+             SELECT DISTINCT subTypes
+             FROM urn_aspects
+             WHERE {where_clause}
+         """
+         assert self._file_based_dict is not None
+         subtypes = set()
+         for row in self._file_based_dict.sql_query(subtype_query):
+             sub_type = row["subTypes"] or "unknown"
+             subtypes.add(sub_type)
+
+         for sub_type in subtypes:
+             query = f"""
+                 SELECT urn
+                 FROM urn_aspects
+                 WHERE {where_clause} AND subTypes = ?
+                 limit {self._samples_to_add}
+             """
+
+             for row in self._file_based_dict.sql_query(query, (sub_type,)):
+                 self.samples[sample_key][sub_type].append(row["urn"])
+
+     def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
+         """Helper method to collect samples for entities that have any of the given aspects."""
+         if not aspects:
+             return
+
+         where_clause = self._build_aspects_where_clause(aspects)
+         self._collect_samples_by_subtype(where_clause, sample_key)
+
+     def _collect_samples_by_lineage_aspects(
+         self, aspects: List[str], sample_key: str
+     ) -> None:
+         """Helper method to collect samples for entities that have any of the given lineage aspects.
+
+         Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
+         """
+         if not aspects:
+             return
+
+         lineage_conditions = []
+         for aspect in aspects:
+             lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+
+         where_clause = " OR ".join(lineage_conditions)
+         self._collect_samples_by_subtype(where_clause, sample_key)
+
+     def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
+         """
+         Collect samples for entities that have lineage, profiling, and usage aspects.
+         These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
+         """
+         if not self._lineage_aspects_seen:
+             return
+         assert self._file_based_dict is not None
+
+         # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
+         lineage_conditions = []
+         for aspect in self._lineage_aspects_seen:
+             lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
+         lineage_where_clause = " OR ".join(lineage_conditions)
+
+         # Build profiling conditions using the same logic as _collect_samples_by_aspects
+         profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
+
+         # Build usage conditions using the same logic as _collect_samples_by_aspects
+         usage_where_clause = self._build_aspects_where_clause(
+             [
+                 "datasetUsageStatistics",
+                 "chartUsageStatistics",
+                 "dashboardUsageStatistics",
+             ]
+         )
+
+         query = f"""
+             SELECT urn, subTypes
+             FROM urn_aspects
+             WHERE ({lineage_where_clause})
+             AND ({profiling_where_clause})
+             AND ({usage_where_clause})
+             limit {self._samples_to_add}
+         """
+
+         for row in self._file_based_dict.sql_query(query):
+             sub_type = row["subTypes"] or "unknown"
+             self.samples[sample_key][sub_type].append(row["urn"])
+
+     def _has_fine_grained_lineage(
+         self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
+     ) -> bool:
+         if isinstance(mcp.aspect, UpstreamLineageClass):
+             upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
+             if upstream_lineage.fineGrainedLineages:
+                 return True
+         return False
+
+     def _update_file_based_dict(
+         self,
+         urn: str,
+         entityType: str,
+         aspectName: str,
+         mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
+     ) -> None:
+         if is_lineage_aspect(entityType, aspectName):
+             self._lineage_aspects_seen.add(aspectName)
+         has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)
+
+         sub_type = "unknown"
+         if isinstance(mcp.aspect, SubTypesClass):
+             sub_type = mcp.aspect.typeNames[0]
+
+         assert self._file_based_dict is not None
+         if urn in self._file_based_dict:
+             if sub_type != "unknown":
+                 self._file_based_dict[urn].subType = sub_type
+             aspects_dict = self._file_based_dict[urn].aspects
+             if aspectName in aspects_dict:
+                 aspects_dict[aspectName] += 1
+             else:
+                 aspects_dict[aspectName] = 1
+             if has_fine_grained_lineage:
+                 if self._fine_grained_lineage_special_case_name in aspects_dict:
+                     aspects_dict[self._fine_grained_lineage_special_case_name] += 1
+                 else:
+                     aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+             self._file_based_dict.mark_dirty(urn)
+         else:
+             aspects_dict = {aspectName: 1}
+             if has_fine_grained_lineage:
+                 aspects_dict[self._fine_grained_lineage_special_case_name] = 1
+             self._file_based_dict[urn] = SourceReportSubtypes(
+                 urn=urn,
+                 entity_type=entityType,
+                 subType=sub_type,
+                 aspects=aspects_dict,
+             )
+
+     def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
+         urn = wu.get_urn()
+
+         if not isinstance(wu.metadata, MetadataChangeEvent):
+             mcps = [wu.metadata]
+         else:
+             mcps = list(mcps_from_mce(wu.metadata))
+
+         for mcp in mcps:
+             entityType = mcp.entityType
+             aspectName = mcp.aspectName
+
+             if aspectName is None:
+                 continue
+
+             self._update_file_based_dict(urn, entityType, aspectName, mcp)
+
+     def compute_stats(self) -> None:
+         if self._file_based_dict is None:
+             return
+
+         query = """
+             SELECT entityType, subTypes, aspects, count(*) as count
+             FROM urn_aspects
+             group by entityType, subTypes, aspects
+         """
+
+         entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
+             defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+         )
+         for row in self._file_based_dict.sql_query(query):
+             entity_type = row["entityType"]
+             sub_type = row["subTypes"]
+             count = row["count"]
+             aspects_raw = row["aspects"] or "[]"
+
+             aspects = json.loads(aspects_raw)
+             for aspect, aspect_count in aspects.items():
+                 entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
+                     aspect_count * count
+                 )
+
+         self.aspects.clear()
+         self.aspects_by_subtypes.clear()
+         _aspects_seen: Set[str] = set()
+         for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
+             for sub_type, aspect_counts in subtype_counts.items():
+                 for aspect, count in aspect_counts.items():
+                     self.aspects[entity_type][aspect] += count
+                     _aspects_seen.add(aspect)
+                 self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
+
+         self.samples.clear()
+         self._collect_samples_by_aspects(["datasetProfile"], "profiling")
+         self._collect_samples_by_aspects(
+             [
+                 "datasetUsageStatistics",
+                 "chartUsageStatistics",
+                 "dashboardUsageStatistics",
+             ],
+             "usage",
+         )
+         self._collect_samples_by_lineage_aspects(
+             list(self._lineage_aspects_seen), "lineage"
+         )
+         self._collect_samples_with_all_conditions("all_3")
+
+
  class EntityFilterReport(ReportAttribute):
      type: str

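The new as_string() output appends an "Aspects by Subtypes" grid built with tabulate, one row per aspect and one column per "entityType (subType)" pair. A quick standalone illustration of that format (the numbers below are made up for the example):

    from tabulate import tabulate

    headers = ["Aspect", "dataset (Table)", "dataset (View)"]
    table_data = [
        ["schemaMetadata", 120, 35],
        ["datasetProfile", 80, 0],
    ]
    # Renders an ASCII grid table, matching tablefmt="grid" used in the report.
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
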
datahub/ingestion/api/sink.py CHANGED
@@ -147,6 +147,9 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
      def close(self) -> None:
          pass

+     def flush(self) -> None:
+         pass
+
      def configured(self) -> str:
          """Override this method to output a human-readable and scrubbed version of the configured sink"""
          return ""