acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub was flagged as a potentially problematic release.

Files changed (120)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0

datahub/api/circuit_breaker/assertion_circuit_breaker.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, List, Optional
 
 from pydantic import Field
@@ -10,6 +10,7 @@ from datahub.api.circuit_breaker.circuit_breaker import (
     CircuitBreakerConfig,
 )
 from datahub.api.graphql import Assertion, Operation
+from datahub.emitter.mce_builder import parse_ts_millis
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -49,7 +50,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
         if not operations:
             return None
         else:
-            return datetime.fromtimestamp(operations[0]["lastUpdatedTimestamp"] / 1000)
+            return parse_ts_millis(operations[0]["lastUpdatedTimestamp"])
 
     def _check_if_assertion_failed(
         self, assertions: List[Dict[str, Any]], last_updated: Optional[datetime] = None
@@ -93,7 +94,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
                 logger.info(f"Found successful assertion: {assertion_urn}")
                 result = False
             if last_updated is not None:
-                last_run = datetime.fromtimestamp(last_assertion.time / 1000)
+                last_run = parse_ts_millis(last_assertion.time)
                 if last_updated > last_run:
                     logger.error(
                         f"Missing assertion run for {assertion_urn}. The dataset was updated on {last_updated} but the last assertion run was at {last_run}"
@@ -117,7 +118,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
             )
 
         if not last_updated:
-            last_updated = datetime.now() - self.config.time_delta
+            last_updated = datetime.now(tz=timezone.utc) - self.config.time_delta
             logger.info(
                 f"Dataset {urn} doesn't have last updated or check_last_assertion_time is false, using calculated min assertion date {last_updated}"
             )

datahub/api/entities/structuredproperties/structuredproperties.py
@@ -14,7 +14,7 @@ from datahub.metadata.schema_classes import (
     PropertyValueClass,
     StructuredPropertyDefinitionClass,
 )
-from datahub.metadata.urns import StructuredPropertyUrn, Urn
+from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn
 from datahub.utilities.urns._urn_base import URN_TYPES
 
 logging.basicConfig(level=logging.INFO)
@@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel):
 
     @validator("type")
     def validate_type(cls, v: str) -> str:
-        # Convert to lowercase if needed
-        if not v.islower():
+        # This logic is somewhat hacky, since we need to deal with
+        # 1. fully qualified urns
+        # 2. raw data types, that need to get the datahub namespace prefix
+        # While keeping the user-facing interface and error messages clean.
+
+        if not v.startswith("urn:li:") and not v.islower():
+            # Convert to lowercase if needed
+            v = v.lower()
             logger.warning(
-                f"Structured property type should be lowercase. Updated to {v.lower()}"
+                f"Structured property type should be lowercase. Updated to {v}"
             )
-            v = v.lower()
+
+        urn = Urn.make_data_type_urn(v)
 
         # Check if type is allowed
-        if not AllowedTypes.check_allowed_type(v):
+        data_type_urn = DataTypeUrn.from_string(urn)
+        unqualified_data_type = data_type_urn.id
+        if unqualified_data_type.startswith("datahub."):
+            unqualified_data_type = unqualified_data_type[len("datahub.") :]
+        if not AllowedTypes.check_allowed_type(unqualified_data_type):
            raise ValueError(
-                f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}"
+                f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}"
            )
-        return v
+
+        return urn
 
     @property
     def fqn(self) -> str:
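
The net effect of the new validator is that `type` is normalized to a data type urn before the allow-list check. A minimal standalone sketch of that normalization, assuming data type urns take the shape `urn:li:dataType:datahub.<type>` (the real shape comes from `Urn.make_data_type_urn`) and using an illustrative subset of the allowed types:

ALLOWED = {"string", "number", "urn", "rich_text", "date"}  # illustrative subset

def normalize_type(v: str) -> str:
    # Lowercase only raw type names, not fully qualified urns.
    if not v.startswith("urn:li:") and not v.islower():
        v = v.lower()
    # Qualify raw names with the datahub namespace (assumed urn shape).
    urn = v if v.startswith("urn:li:") else f"urn:li:dataType:datahub.{v}"
    # Validate against the allow-list using the unqualified name.
    unqualified = urn.rsplit(":", 1)[-1]
    if unqualified.startswith("datahub."):
        unqualified = unqualified[len("datahub."):]
    if unqualified not in ALLOWED:
        raise ValueError(f"Type {unqualified} is not allowed")
    return urn

assert normalize_type("STRING") == "urn:li:dataType:datahub.string"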

datahub/configuration/common.py
@@ -10,7 +10,6 @@ from typing import (
     List,
     Optional,
     Type,
-    TypeVar,
     Union,
     runtime_checkable,
 )
@@ -19,14 +18,12 @@ import pydantic
 from cached_property import cached_property
 from pydantic import BaseModel, Extra, ValidationError
 from pydantic.fields import Field
-from typing_extensions import Protocol
+from typing_extensions import Protocol, Self
 
 from datahub.configuration._config_enum import ConfigEnum as ConfigEnum  # noqa: I250
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.utilities.dedup_list import deduplicate_list
 
-_ConfigSelf = TypeVar("_ConfigSelf", bound="ConfigModel")
-
 REDACT_KEYS = {
     "password",
     "token",
@@ -109,7 +106,7 @@ class ConfigModel(BaseModel):
         schema_extra = _schema_extra
 
     @classmethod
-    def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf:
+    def parse_obj_allow_extras(cls, obj: Any) -> Self:
         if PYDANTIC_VERSION_2:
             try:
                 with unittest.mock.patch.dict(
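
Several files in this release make the same typing change (see also closeable.py, sink.py, and ingestion_job_checkpointing_provider_base.py below): a module-level `TypeVar(..., bound=...)` used to annotate `cls` or `self` is dropped in favor of `typing_extensions.Self`. A minimal before/after sketch of the pattern, with hypothetical classes that are not part of the package:

from typing import Any, Dict, Type, TypeVar

from typing_extensions import Self

_TOld = TypeVar("_TOld", bound="OldStyle")

class OldStyle:
    # Old pattern: an explicit TypeVar bound to the class.
    @classmethod
    def create(cls: Type[_TOld], config: Dict[str, Any]) -> _TOld:
        return cls()

class NewStyle:
    # New pattern: Self tracks the subclass automatically, no TypeVar needed.
    @classmethod
    def create(cls, config: Dict[str, Any]) -> Self:
        return cls()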

datahub/configuration/source_common.py
@@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin):
         default=None,
         description="A holder for platform -> platform_instance mappings to generate correct dataset urns",
     )
+
+
+class PlatformDetail(ConfigModel):
+    platform_instance: Optional[str] = Field(
+        default=None,
+        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
+        "with platform instance name used in ingestion "
+        "recipe of other datahub sources.",
+    )
+    env: str = Field(
+        default=DEFAULT_ENV,
+        description="The environment that all assets produced by DataHub platform ingestion source belong to",
+    )
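
With this release installed, `PlatformDetail` becomes importable from `datahub.configuration.source_common`. A small usage sketch with hypothetical values; `platform_instance` should match the instance name used in the recipe that ingests the upstream platform:

from datahub.configuration.source_common import PlatformDetail

# Hypothetical values for illustration only.
upstream = PlatformDetail(platform_instance="warehouse_prod", env="PROD")
print(upstream.platform_instance, upstream.env)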

datahub/emitter/mce_builder.py
@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from enum import Enum
 from typing import (
     TYPE_CHECKING,
@@ -24,6 +24,7 @@ from typing import (
 
 import typing_inspect
 from avrogen.dict_wrapper import DictWrapper
+from typing_extensions import assert_never
 
 from datahub.emitter.enum_helpers import get_enum_options
 from datahub.metadata.schema_classes import (
@@ -103,6 +104,22 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
     return int(ts.timestamp() * 1000)
 
 
+@overload
+def parse_ts_millis(ts: float) -> datetime:
+    ...
+
+
+@overload
+def parse_ts_millis(ts: None) -> None:
+    ...
+
+
+def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
+    if ts is None:
+        return None
+    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
+
+
 def make_data_platform_urn(platform: str) -> str:
     if platform.startswith("urn:li:dataPlatform:"):
         return platform
@@ -253,9 +270,8 @@ def make_owner_urn(owner: str, owner_type: OwnerType) -> str:
         return make_user_urn(owner)
     elif owner_type == OwnerType.GROUP:
         return make_group_urn(owner)
-    # This should pretty much never happen.
-    # TODO: With Python 3.11, we can use typing.assert_never() here.
-    return f"urn:li:{owner_type.value}:{owner}"
+    else:
+        assert_never(owner_type)
 
 
 def make_ownership_type_urn(type: str) -> str:
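
The new `parse_ts_millis` helper centralizes epoch-millisecond parsing and, unlike the `datetime.fromtimestamp(ts / 1000)` calls it replaces elsewhere in this release, always returns a timezone-aware UTC datetime; `None` passes through thanks to the overloads. A quick sketch of the expected behavior:

from datetime import timezone

from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis

dt = parse_ts_millis(1700000000000)
assert dt.tzinfo == timezone.utc            # aware UTC datetime (2023-11-14 22:13:20+00:00)
assert parse_ts_millis(None) is None        # Optional passes through
assert make_ts_millis(dt) == 1700000000000  # round-trips with make_ts_millis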

datahub/emitter/mcp_builder.py
@@ -4,8 +4,8 @@ from pydantic.fields import Field
 from pydantic.main import BaseModel
 
 from datahub.cli.env_utils import get_boolean_env_variable
-from datahub.emitter.enum_helpers import get_enum_options
 from datahub.emitter.mce_builder import (
+    ALL_ENV_TYPES,
     Aspect,
     datahub_guid,
     make_container_urn,
@@ -25,7 +25,6 @@ from datahub.metadata.schema_classes import (
     ContainerClass,
     DomainsClass,
     EmbedClass,
-    FabricTypeClass,
     GlobalTagsClass,
     MetadataChangeEventClass,
     OwnerClass,
@@ -206,11 +205,7 @@ def gen_containers(
     # Extra validation on the env field.
     # In certain cases (mainly for backwards compatibility), the env field will actually
     # have a platform instance name.
-    env = (
-        container_key.env
-        if container_key.env in get_enum_options(FabricTypeClass)
-        else None
-    )
+    env = container_key.env if container_key.env in ALL_ENV_TYPES else None
 
     container_urn = container_key.as_urn()
 

datahub/emitter/mcp_patch_builder.py
@@ -2,7 +2,19 @@ import json
 import time
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    runtime_checkable,
+)
+
+from typing_extensions import LiteralString
 
 from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.serialization_helper import pre_json_transform
@@ -19,25 +31,36 @@ from datahub.metadata.urns import Urn
 from datahub.utilities.urns.urn import guess_entity_type
 
 
+@runtime_checkable
+class SupportsToObj(Protocol):
+    def to_obj(self) -> Any:
+        ...
+
+
 def _recursive_to_obj(obj: Any) -> Any:
     if isinstance(obj, list):
         return [_recursive_to_obj(v) for v in obj]
-    elif hasattr(obj, "to_obj"):
+    elif isinstance(obj, SupportsToObj):
         return obj.to_obj()
     else:
         return obj
 
 
+PatchPath = Tuple[Union[LiteralString, Urn], ...]
+PatchOp = Literal["add", "remove", "replace"]
+
+
 @dataclass
-class _Patch:
-    op: str  # one of ['add', 'remove', 'replace']; we don't support move, copy or test
-    path: str
+class _Patch(SupportsToObj):
+    op: PatchOp
+    path: PatchPath
     value: Any
 
     def to_obj(self) -> Dict:
+        quoted_path = "/" + "/".join(MetadataPatchProposal.quote(p) for p in self.path)
         return {
             "op": self.op,
-            "path": self.path,
+            "path": quoted_path,
             "value": _recursive_to_obj(self.value),
         }
 
@@ -63,19 +86,20 @@ class MetadataPatchProposal:
 
     # Json Patch quoting based on https://jsonpatch.com/#json-pointer
     @classmethod
-    def quote(cls, value: str) -> str:
-        return value.replace("~", "~0").replace("/", "~1")
+    def quote(cls, value: Union[str, Urn]) -> str:
+        return str(value).replace("~", "~0").replace("/", "~1")
 
     def _add_patch(
-        self, aspect_name: str, op: str, path: Union[str, Sequence[str]], value: Any
+        self,
+        aspect_name: str,
+        op: PatchOp,
+        path: PatchPath,
+        value: Any,
     ) -> None:
-        if not isinstance(path, str):
-            path = "/" + "/".join(self.quote(p) for p in path)
-
         # TODO: Validate that aspectName is a valid aspect for this entityType
         self.patches[aspect_name].append(_Patch(op, path, value))
 
-    def build(self) -> Iterable[MetadataChangeProposalClass]:
+    def build(self) -> List[MetadataChangeProposalClass]:
         return [
             MetadataChangeProposalClass(
                 entityUrn=self.urn,
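
Path handling moves out of `_add_patch` and into `_Patch.to_obj`: paths are now structured tuples (`PatchPath`) whose segments, including urns that contain `/` or `~`, are escaped with the JSON Pointer rules only at serialization time. A standalone sketch of the escaping (the urn below is illustrative):

def quote(value: str) -> str:
    # JSON Pointer escaping, per https://jsonpatch.com/#json-pointer
    return value.replace("~", "~0").replace("/", "~1")

path = ("upstreams", "urn:li:dataset:(urn:li:dataPlatform:hive,db/table,PROD)")
pointer = "/" + "/".join(quote(p) for p in path)
assert pointer == "/upstreams/urn:li:dataset:(urn:li:dataPlatform:hive,db~1table,PROD)"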

datahub/emitter/rest_emitter.py
@@ -3,7 +3,7 @@ import json
 import logging
 import os
 from json.decoder import JSONDecodeError
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
 
 import requests
 from deprecated import deprecated
@@ -13,6 +13,7 @@ from requests.exceptions import HTTPError, RequestException
 from datahub import nice_version_name
 from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
+from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ConfigurationError, OperationalError
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -46,6 +47,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
+_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -288,9 +291,11 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def emit_mcps(
         self,
-        mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        if _DATAHUB_EMITTER_TRACE:
+            logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +308,25 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            if _DATAHUB_EMITTER_TRACE:
+                logger.debug(
+                    f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+                )
 
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
+                if _DATAHUB_EMITTER_TRACE:
+                    logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        if len(mcp_obj_chunks) > 0:
+            logger.debug(
+                f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
+            )
 
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +353,15 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:
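
`emit_mcps` splits the serialized MCPs greedily into chunks bounded by both a byte budget (`INGEST_MAX_PAYLOAD_BYTES`) and a per-request count limit (`BATCH_INGEST_MAX_PAYLOAD_LENGTH`). A standalone sketch of that chunking, with illustrative limits standing in for the real constants:

import json
from typing import Any, Dict, List

MAX_BYTES = 15 * 1024 * 1024  # stands in for INGEST_MAX_PAYLOAD_BYTES
MAX_ITEMS = 200               # stands in for BATCH_INGEST_MAX_PAYLOAD_LENGTH (illustrative value)


def chunk_mcp_objs(mcp_objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = []
    # Start "full" so the first object always opens a fresh chunk; the byte check
    # short-circuits the `or`, so chunks[-1] is never read while chunks is empty.
    current_size = MAX_BYTES
    for obj in mcp_objs:
        size = len(json.dumps(obj))
        if size + current_size > MAX_BYTES or len(chunks[-1]) >= MAX_ITEMS:
            chunks.append([])
            current_size = 0
        chunks[-1].append(obj)
        current_size += size
    return chunks


assert len(chunk_mcp_objs([{"aspectName": "status"}] * 450)) == 3  # 200 + 200 + 50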

datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
@@ -0,0 +1,98 @@
+import json
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+
+        schema.fields = accepted_fields
+
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception
+        on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu
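
As the datahub/ingestion/api/source.py hunk further below shows, this processor is appended to every source's default workunit processors, so the trimming happens automatically during ingestion. Its methods can also be exercised directly; a small sketch with a deliberately tiny byte budget (the urn and values are illustrative):

from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
    EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.source import SourceReport
from datahub.metadata.schema_classes import DatasetFieldProfileClass, DatasetProfileClass

profile = DatasetProfileClass(
    timestampMillis=1700000000000,
    fieldProfiles=[
        DatasetFieldProfileClass(fieldPath="payload", sampleValues=["x" * 1024] * 64),
        DatasetFieldProfileClass(fieldPath="id", sampleValues=["1", "2", "3"]),
    ],
)

# Tiny budget to force truncation; the default is INGEST_MAX_PAYLOAD_BYTES (~15 MB).
processor = EnsureAspectSizeProcessor(SourceReport(), payload_constraint=10_000)
processor.ensure_dataset_profile_size(
    "urn:li:dataset:(urn:li:dataPlatform:hive,db.events,PROD)", profile
)
print([len(f.sampleValues or []) for f in profile.fieldProfiles or []])
# Expected: the oversized field's sample values are dropped, the small field keeps its samples.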

datahub/ingestion/api/closeable.py
@@ -1,9 +1,9 @@
 from abc import abstractmethod
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import Optional, Type, TypeVar
+from typing import Optional, Type
 
-_Self = TypeVar("_Self", bound="Closeable")
+from typing_extensions import Self
 
 
 class Closeable(AbstractContextManager):
@@ -11,7 +11,7 @@ class Closeable(AbstractContextManager):
     def close(self) -> None:
         pass
 
-    def __enter__(self: _Self) -> _Self:
+    def __enter__(self) -> Self:
         # This method is mainly required for type checking.
         return self
 

datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py
@@ -1,6 +1,8 @@
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import Any, Dict, NewType, Optional, Type, TypeVar
+from typing import Any, Dict, NewType, Optional
+
+from typing_extensions import Self
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigModel
@@ -17,9 +19,6 @@ class IngestionCheckpointingProviderConfig(ConfigModel):
     pass
 
 
-_Self = TypeVar("_Self", bound="IngestionCheckpointingProviderBase")
-
-
 @dataclass()
 class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStatesMap]):
     """
@@ -32,9 +31,7 @@ class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStatesMap]):
 
     @classmethod
     @abstractmethod
-    def create(
-        cls: Type[_Self], config_dict: Dict[str, Any], ctx: PipelineContext
-    ) -> "_Self":
+    def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> Self:
         pass
 
     @abstractmethod

datahub/ingestion/api/report.py
@@ -42,7 +42,10 @@ class Report(SupportsAsObj):
             return some_val.as_obj()
         elif isinstance(some_val, pydantic.BaseModel):
             return Report.to_pure_python_obj(some_val.dict())
-        elif dataclasses.is_dataclass(some_val):
+        elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type):
+            # The `is_dataclass` function returns `True` for both instances and classes.
+            # We need an extra check to ensure an instance was passed in.
+            # https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass
             return dataclasses.asdict(some_val)
         elif isinstance(some_val, list):
             return [Report.to_pure_python_obj(v) for v in some_val if v is not None]
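
The extra `isinstance(some_val, type)` guard matters because `dataclasses.is_dataclass` returns `True` for both a dataclass class object and its instances, while `dataclasses.asdict` only accepts instances:

import dataclasses


@dataclasses.dataclass
class Point:
    x: int = 0


assert dataclasses.is_dataclass(Point)          # True for the class object itself
assert dataclasses.is_dataclass(Point())        # True for an instance
assert dataclasses.asdict(Point()) == {"x": 0}
# dataclasses.asdict(Point) raises TypeError: asdict() should be called on dataclass instances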

datahub/ingestion/api/sink.py
@@ -3,6 +3,8 @@ from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass, field
 from typing import Any, Generic, Optional, Type, TypeVar, cast
 
+from typing_extensions import Self
+
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
@@ -79,7 +81,6 @@ class NoopWriteCallback(WriteCallback):
 
 SinkReportType = TypeVar("SinkReportType", bound=SinkReport, covariant=True)
 SinkConfig = TypeVar("SinkConfig", bound=ConfigModel, covariant=True)
-Self = TypeVar("Self", bound="Sink")
 
 
 class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
@@ -90,7 +91,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
     report: SinkReportType
 
     @classmethod
-    def get_config_class(cls: Type[Self]) -> Type[SinkConfig]:
+    def get_config_class(cls) -> Type[SinkConfig]:
         config_class = get_class_from_annotation(cls, Sink, ConfigModel)
         assert config_class, "Sink subclasses must define a config class"
         return cast(Type[SinkConfig], config_class)
@@ -112,7 +113,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
         pass
 
     @classmethod
-    def create(cls: Type[Self], config_dict: dict, ctx: PipelineContext) -> "Self":
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self":
         return cls(ctx, cls.get_config_class().parse_obj(config_dict))
 
     def handle_work_unit_start(self, workunit: WorkUnit) -> None:

datahub/ingestion/api/source.py
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
@@ -450,6 +453,7 @@ class Source(Closeable, metaclass=ABCMeta):
             browse_path_processor,
             partial(auto_workunit_reporter, self.get_report()),
             auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     @staticmethod

datahub/ingestion/api/source_helpers.py
@@ -1,5 +1,4 @@
 import logging
-from datetime import datetime, timezone
 from typing import (
     TYPE_CHECKING,
     Dict,
@@ -14,7 +13,7 @@ from typing import (
 )
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import make_dataplatform_instance_urn
+from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -479,10 +478,7 @@ def auto_empty_dataset_usage_statistics(
     if invalid_timestamps:
         logger.warning(
             f"Usage statistics with unexpected timestamps, bucket_duration={config.bucket_duration}:\n"
-            ", ".join(
-                str(datetime.fromtimestamp(ts / 1000, tz=timezone.utc))
-                for ts in invalid_timestamps
-            )
+            ", ".join(str(parse_ts_millis(ts)) for ts in invalid_timestamps)
         )
 
     for bucket in bucket_timestamps:

datahub/ingestion/glossary/classifier.py
@@ -1,4 +1,3 @@
-import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
     )
 
     max_workers: int = Field(
-        default=(os.cpu_count() or 4),
-        description="Number of worker processes to use for classification. Set to 1 to disable.",
+        default=1,
+        description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
     )
 
     table_pattern: AllowDenyPattern = Field(

datahub/ingestion/graph/client.py
@@ -188,9 +188,12 @@ class DataHubGraph(DatahubRestEmitter):
                 retry_max_times=emitter._retry_max_times,
                 extra_headers=emitter._session.headers,
                 disable_ssl_verification=emitter._session.verify is False,
-                # TODO: Support these headers.
-                # ca_certificate_path=emitter._ca_certificate_path,
-                # client_certificate_path=emitter._client_certificate_path,
+                ca_certificate_path=(
+                    emitter._session.verify
+                    if isinstance(emitter._session.verify, str)
+                    else None
+                ),
+                client_certificate_path=emitter._session.cert,
             )
         )