acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py
CHANGED
@@ -4,6 +4,11 @@ import functools
 import json
 import logging
 import os
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import auto
 from json.decoder import JSONDecodeError
 from typing import (
     TYPE_CHECKING,
@@ -17,6 +22,7 @@ from typing import (
     Union,
 )
 
+import pydantic
 import requests
 from deprecated import deprecated
 from requests.adapters import HTTPAdapter, Retry
@@ -27,13 +33,22 @@ from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
+    ConfigEnum,
     ConfigModel,
     ConfigurationError,
     OperationalError,
+    TraceTimeoutError,
+    TraceValidationError,
 )
+from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.response_helper import (
+    TraceData,
+    extract_trace_data,
+    extract_trace_data_from_mcps,
+)
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
@@ -63,6 +78,11 @@ _DEFAULT_RETRY_MAX_TIMES = int(
 
 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
 
+TRACE_PENDING_STATUS = "PENDING"
+TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
+TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
+TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -77,6 +97,29 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RestTraceMode(ConfigEnum):
+    ENABLED = auto()
+    DISABLED = auto()
+
+
+class RestSinkEndpoint(ConfigEnum):
+    RESTLI = auto()
+    OPENAPI = auto()
+
+
+DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
+    RestSinkEndpoint,
+    os.getenv("DATAHUB_REST_SINK_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
+)
+
+
+# Supported with v1.0
+DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
+    RestTraceMode,
+    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+)
+
+
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
 
@@ -143,10 +186,32 @@ class RequestsSessionConfig(ConfigModel):
         return session
 
 
+@dataclass
+class _Chunk:
+    items: List[str]
+    total_bytes: int = 0
+
+    def add_item(self, item: str) -> bool:
+        item_bytes = len(item.encode())
+        if not self.items:  # Always add at least one item even if over byte limit
+            self.items.append(item)
+            self.total_bytes += item_bytes
+            return True
+        self.items.append(item)
+        self.total_bytes += item_bytes
+        return True
+
+    @staticmethod
+    def join(chunk: "_Chunk") -> str:
+        return "[" + ",".join(chunk.items) + "]"
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
+    _openapi_ingestion: bool
+    _default_trace_mode: bool
 
     def __init__(
         self,
@@ -162,6 +227,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
+        openapi_ingestion: bool = False,
+        default_trace_mode: bool = False,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -174,9 +241,17 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
         self.server_config: Dict[str, Any] = {}
-
+        self._openapi_ingestion = openapi_ingestion
+        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
 
+        logger.debug(
+            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        )
+
+        if self._default_trace_mode:
+            logger.debug("Using API Tracing for ingestion.")
+
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "X-DataHub-Py-Cli-Version": nice_version_name(),
@@ -264,6 +339,43 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         return DataHubGraph.from_emitter(self)
 
+    def _to_openapi_request(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
+        if mcp.aspect and mcp.aspectName:
+            resolved_async_flag = (
+                async_flag if async_flag is not None else async_default
+            )
+            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+
+            if isinstance(mcp, MetadataChangeProposalWrapper):
+                aspect_value = pre_json_transform(
+                    mcp.to_obj(simplified_structure=True)
+                )["aspect"]["json"]
+            else:
+                obj = mcp.aspect.to_obj()
+                if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
+                    obj = json.loads(obj["value"])
+                aspect_value = pre_json_transform(obj)
+            return (
+                url,
+                [
+                    {
+                        "urn": mcp.entityUrn,
+                        mcp.aspectName: {
+                            "value": aspect_value,
+                            "systemMetadata": mcp.systemMetadata.to_obj()
+                            if mcp.systemMetadata
+                            else None,
+                        },
+                    }
+                ],
+            )
+        return None
+
     def emit(
         self,
         item: Union[
@@ -316,31 +428,135 @@ class DataHubRestEmitter(Closeable, Emitter):
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
-        url = f"{self._gms_server}/aspects?action=ingestProposal"
         ensure_has_system_metadata(mcp)
 
-        mcp_obj = pre_json_transform(mcp.to_obj())
-        payload_dict = {"proposal": mcp_obj}
+        trace_data = None
 
-        if async_flag is not None:
-            payload_dict["async"] = "true" if async_flag else "false"
+        if self._openapi_ingestion:
+            request = self._to_openapi_request(mcp, async_flag, async_default=False)
+            if request:
+                response = self._emit_generic(request[0], payload=request[1])
 
-        payload = json.dumps(payload_dict)
+                if self._should_trace(async_flag, trace_flag):
+                    trace_data = extract_trace_data(response) if response else None
 
-        self._emit_generic(url, payload)
+        else:
+            url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+            mcp_obj = pre_json_transform(mcp.to_obj())
+            payload_dict = {"proposal": mcp_obj}
+
+            if async_flag is not None:
+                payload_dict["async"] = "true" if async_flag else "false"
+
+            payload = json.dumps(payload_dict)
+
+            response = self._emit_generic(url, payload)
+
+            if self._should_trace(async_flag, trace_flag):
+                trace_data = (
+                    extract_trace_data_from_mcps(response, [mcp]) if response else None
+                )
+
+        if trace_data:
+            self._await_status(
+                [trace_data],
+                trace_timeout,
+            )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
-        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
 
+        if self._openapi_ingestion:
+            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
+        else:
+            return self._emit_restli_mcps(mcps, async_flag)
+
+    def _emit_openapi_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> int:
+        """
+        1. Grouping MCPs by their entity URL
+        2. Breaking down large batches into smaller chunks based on both:
+           * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
+           * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
+
+        The Chunk class encapsulates both the items and their byte size tracking
+        Serializing the items only once with json.dumps(request[1]) and reusing that
+        The chunking logic handles edge cases (always accepting at least one item per chunk)
+        The joining logic is efficient with a simple string concatenation
+
+        :param mcps: metadata change proposals to transmit
+        :param async_flag: the mode
+        :return: number of requests
+        """
+        # group by entity url
+        batches: Dict[str, List[_Chunk]] = defaultdict(
+            lambda: [_Chunk(items=[])]
+        )  # Initialize with one empty Chunk
+
+        for mcp in mcps:
+            request = self._to_openapi_request(mcp, async_flag, async_default=True)
+            if request:
+                current_chunk = batches[request[0]][-1]  # Get the last chunk
+                # Only serialize once
+                serialized_item = json.dumps(request[1][0])
+                item_bytes = len(serialized_item.encode())
+
+                # If adding this item would exceed max_bytes, create a new chunk
+                # Unless the chunk is empty (always add at least one item)
+                if current_chunk.items and (
+                    current_chunk.total_bytes + item_bytes > INGEST_MAX_PAYLOAD_BYTES
+                    or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+                ):
+                    new_chunk = _Chunk(items=[])
+                    batches[request[0]].append(new_chunk)
+                    current_chunk = new_chunk
+
+                current_chunk.add_item(serialized_item)
+
+        responses = []
+        for url, chunks in batches.items():
+            for chunk in chunks:
+                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                responses.append(response)
+
+        if self._should_trace(async_flag, trace_flag, async_default=True):
+            trace_data = []
+            for response in responses:
+                data = extract_trace_data(response) if response else None
+                if data is not None:
+                    trace_data.append(data)
+
+            if trace_data:
+                self._await_status(trace_data, trace_timeout)
+
+        return len(responses)
+
+    def _emit_restli_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+    ) -> int:
+        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
@@ -392,7 +608,10 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: str) -> None:
+    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+        if not isinstance(payload, str):
+            payload = json.dumps(payload)
+
         curl_command = make_curl_command(self._session, "POST", url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
@@ -408,6 +627,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         try:
             response = self._session.post(url, data=payload)
             response.raise_for_status()
+            return response
         except HTTPError as e:
             try:
                 info: Dict = response.json()
@@ -438,6 +658,99 @@ class DataHubRestEmitter(Closeable, Emitter):
                 "Unable to emit metadata to DataHub GMS", {"message": str(e)}
             ) from e
 
+    def _await_status(
+        self,
+        trace_data: List[TraceData],
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None:
+        """Verify the status of asynchronous write operations.
+        Args:
+            trace_data: List of trace data to verify
+            trace_timeout: Maximum time to wait for verification.
+        Raises:
+            TraceTimeoutError: If verification fails or times out
+            TraceValidationError: Expected write was not completed successfully
+        """
+        if trace_timeout is None:
+            raise ValueError("trace_timeout cannot be None")
+
+        try:
+            if not trace_data:
+                logger.debug("No trace data to verify")
+                return
+
+            start_time = datetime.now()
+
+            for trace in trace_data:
+                current_backoff = TRACE_INITIAL_BACKOFF
+
+                while trace.data:
+                    if datetime.now() - start_time > trace_timeout:
+                        raise TraceTimeoutError(
+                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        )
+
+                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
+                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"
+
+                    response = self._emit_generic(url, payload=trace.data)
+                    json_data = response.json()
+
+                    for urn, aspects in json_data.items():
+                        for aspect_name, aspect_status in aspects.items():
+                            if not aspect_status["success"]:
+                                error_msg = (
+                                    f"Unable to validate async write to DataHub GMS: "
+                                    f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
+                                    f"Status: {aspect_status}"
+                                )
+                                raise TraceValidationError(error_msg, aspect_status)
+
+                            primary_storage = aspect_status["primaryStorage"][
+                                "writeStatus"
+                            ]
+                            search_storage = aspect_status["searchStorage"][
+                                "writeStatus"
+                            ]
+
+                            # Remove resolved statuses
+                            if (
+                                primary_storage != TRACE_PENDING_STATUS
+                                and search_storage != TRACE_PENDING_STATUS
+                            ):
+                                trace.data[urn].remove(aspect_name)
+
+                        # Remove urns with all statuses resolved
+                        if not trace.data[urn]:
+                            trace.data.pop(urn)
+
+                    # Adjust backoff based on response
+                    if trace.data:
+                        # If we still have pending items, increase backoff
+                        current_backoff = min(
+                            current_backoff * TRACE_BACKOFF_FACTOR, TRACE_MAX_BACKOFF
+                        )
+                        logger.debug(
+                            f"Waiting {current_backoff} seconds before next check"
+                        )
+                        time.sleep(current_backoff)
+
+        except Exception as e:
+            logger.error(f"Error during status verification: {str(e)}")
+            raise
+
+    def _should_trace(
+        self,
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> bool:
+        resolved_trace_flag = (
+            trace_flag if trace_flag is not None else self._default_trace_mode
+        )
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+        return resolved_trace_flag and resolved_async_flag
+
     def __repr__(self) -> str:
         token_str = (
             f" with token: {self._token[:4]}**********{self._token[-4:]}"
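For orientation, here is a minimal usage sketch of the new emitter options introduced above (openapi_ingestion, default_trace_mode, and the trace_flag/trace_timeout arguments on emit_mcp). Only the parameter names and defaults come from the diff; the GMS URL, dataset URN, and aspect are illustrative. Note that per _should_trace, tracing only applies to asynchronous writes.

from datetime import timedelta

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DataHubRestEmitter
from datahub.metadata.schema_classes import StatusClass

# Route writes through the OpenAPI endpoint and opt in to API tracing by default.
emitter = DataHubRestEmitter(
    gms_server="http://localhost:8080",
    openapi_ingestion=True,
    default_trace_mode=True,
)

# Illustrative MCP: mark a (hypothetical) dataset as not removed.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
    aspect=StatusClass(removed=False),
)

# With async_flag and trace_flag set, _await_status polls /openapi/v1/trace/write
# with exponential backoff until the aspect reaches both primary and search
# storage, or trace_timeout elapses.
emitter.emit_mcp(
    mcp,
    async_flag=True,
    trace_flag=True,
    trace_timeout=timedelta(minutes=10),
)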
datahub/ingestion/api/decorators.py
CHANGED

@@ -3,7 +3,7 @@ from enum import Enum, auto
 from typing import Callable, Dict, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.api.source import (
+from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
|
|
|
250
250
|
emitted_urns: Set[str] = set()
|
|
251
251
|
containers_used_as_parent: Set[str] = set()
|
|
252
252
|
for urn, batch in _batch_workunits_by_urn(stream):
|
|
253
|
+
# Do not generate browse path v2 for entities that do not support it
|
|
254
|
+
if not entity_supports_aspect(guess_entity_type(urn), BrowsePathsV2Class):
|
|
255
|
+
yield from batch
|
|
256
|
+
continue
|
|
253
257
|
container_path: Optional[List[BrowsePathEntryClass]] = None
|
|
254
258
|
legacy_path: Optional[List[BrowsePathEntryClass]] = None
|
|
255
259
|
browse_path_v2: Optional[List[BrowsePathEntryClass]] = None
|
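The guard added to auto_browse_path_v2 passes through workunits whose entity type cannot carry the browsePathsV2 aspect. A small sketch of the idea follows; the import paths of guess_entity_type and entity_supports_aspect are assumptions based on the wider datahub package, since the hunk above only shows their call sites.

# Import locations are assumed; only the call sites appear in the hunk above.
from datahub.emitter.mce_builder import guess_entity_type
from datahub.metadata.schema_classes import BrowsePathsV2Class, entity_supports_aspect

for urn in [
    "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",  # illustrative
    "urn:li:corpuser:jdoe",  # illustrative
]:
    entity_type = guess_entity_type(urn)  # e.g. "dataset" or "corpuser"
    if not entity_supports_aspect(entity_type, BrowsePathsV2Class):
        # In auto_browse_path_v2, such workunits are yielded unchanged.
        print(f"skip browse path generation for {entity_type}")
    else:
        print(f"{entity_type} can receive a generated browsePathsV2")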
datahub/ingestion/fs/s3_fs.py
CHANGED
@@ -48,12 +48,12 @@ class S3ListIterator(Iterator):
     def __next__(self) -> FileInfo:
         try:
             return next(self._file_statuses)
-        except StopIteration:
+        except StopIteration as e:
             if self._token:
                 self.fetch()
                 return next(self._file_statuses)
             else:
-                raise
+                raise e
 
     def fetch(self):
         params = dict(Bucket=self._bucket, Prefix=self._prefix, MaxKeys=self._max_keys)
datahub/ingestion/glossary/classification_mixin.py
CHANGED

@@ -279,11 +279,7 @@ class ClassificationHandler:
                         "Dataset_Name": dataset_name,
                     }
                 ),
-                values=(
-                    sample_data[schema_field.fieldPath]
-                    if schema_field.fieldPath in sample_data.keys()
-                    else []
-                ),
+                values=sample_data.get(schema_field.fieldPath, []),
             )
         )
 