acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py
CHANGED
@@ -4,6 +4,11 @@ import functools
 import json
 import logging
 import os
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import auto
 from json.decoder import JSONDecodeError
 from typing import (
     TYPE_CHECKING,
@@ -17,6 +22,7 @@ from typing import (
     Union,
 )
 
+import pydantic
 import requests
 from deprecated import deprecated
 from requests.adapters import HTTPAdapter, Retry
@@ -27,13 +33,22 @@ from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
+    ConfigEnum,
     ConfigModel,
     ConfigurationError,
     OperationalError,
+    TraceTimeoutError,
+    TraceValidationError,
 )
+from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.response_helper import (
+    TraceData,
+    extract_trace_data,
+    extract_trace_data_from_mcps,
+)
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
@@ -63,6 +78,11 @@ _DEFAULT_RETRY_MAX_TIMES = int(
 
 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
 
+TRACE_PENDING_STATUS = "PENDING"
+TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
+TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
+TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -77,6 +97,29 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RestTraceMode(ConfigEnum):
+    ENABLED = auto()
+    DISABLED = auto()
+
+
+class RestSinkEndpoint(ConfigEnum):
+    RESTLI = auto()
+    OPENAPI = auto()
+
+
+DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
+    RestSinkEndpoint,
+    os.getenv("DATAHUB_REST_SINK_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
+)
+
+
+# Supported with v1.0
+DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
+    RestTraceMode,
+    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+)
+
+
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
 
@@ -143,10 +186,32 @@ class RequestsSessionConfig(ConfigModel):
         return session
 
 
+@dataclass
+class _Chunk:
+    items: List[str]
+    total_bytes: int = 0
+
+    def add_item(self, item: str) -> bool:
+        item_bytes = len(item.encode())
+        if not self.items:  # Always add at least one item even if over byte limit
+            self.items.append(item)
+            self.total_bytes += item_bytes
+            return True
+        self.items.append(item)
+        self.total_bytes += item_bytes
+        return True
+
+    @staticmethod
+    def join(chunk: "_Chunk") -> str:
+        return "[" + ",".join(chunk.items) + "]"
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
+    _openapi_ingestion: bool
+    _default_trace_mode: bool
 
     def __init__(
         self,
@@ -162,6 +227,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
+        openapi_ingestion: bool = False,
+        default_trace_mode: bool = False,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -174,9 +241,17 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
         self.server_config: Dict[str, Any] = {}
-
+        self._openapi_ingestion = openapi_ingestion
+        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
 
+        logger.debug(
+            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        )
+
+        if self._default_trace_mode:
+            logger.debug("Using API Tracing for ingestion.")
+
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "X-DataHub-Py-Cli-Version": nice_version_name(),
@@ -264,6 +339,43 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         return DataHubGraph.from_emitter(self)
 
+    def _to_openapi_request(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
+        if mcp.aspect and mcp.aspectName:
+            resolved_async_flag = (
+                async_flag if async_flag is not None else async_default
+            )
+            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+
+            if isinstance(mcp, MetadataChangeProposalWrapper):
+                aspect_value = pre_json_transform(
+                    mcp.to_obj(simplified_structure=True)
+                )["aspect"]["json"]
+            else:
+                obj = mcp.aspect.to_obj()
+                if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
+                    obj = json.loads(obj["value"])
+                aspect_value = pre_json_transform(obj)
+            return (
+                url,
+                [
+                    {
+                        "urn": mcp.entityUrn,
+                        mcp.aspectName: {
+                            "value": aspect_value,
+                            "systemMetadata": mcp.systemMetadata.to_obj()
+                            if mcp.systemMetadata
+                            else None,
+                        },
+                    }
+                ],
+            )
+        return None
+
     def emit(
         self,
         item: Union[
@@ -316,31 +428,135 @@ class DataHubRestEmitter(Closeable, Emitter):
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
-        url = f"{self._gms_server}/aspects?action=ingestProposal"
         ensure_has_system_metadata(mcp)
 
-        mcp_obj = pre_json_transform(mcp.to_obj())
-        payload_dict = {"proposal": mcp_obj}
+        trace_data = None
 
-        if async_flag is not None:
-            payload_dict["async"] = "true" if async_flag else "false"
+        if self._openapi_ingestion:
+            request = self._to_openapi_request(mcp, async_flag, async_default=False)
+            if request:
+                response = self._emit_generic(request[0], payload=request[1])
 
-        payload = json.dumps(payload_dict)
+                if self._should_trace(async_flag, trace_flag):
+                    trace_data = extract_trace_data(response) if response else None
 
-        self._emit_generic(url, payload)
+        else:
+            url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+            mcp_obj = pre_json_transform(mcp.to_obj())
+            payload_dict = {"proposal": mcp_obj}
+
+            if async_flag is not None:
+                payload_dict["async"] = "true" if async_flag else "false"
+
+            payload = json.dumps(payload_dict)
+
+            response = self._emit_generic(url, payload)
+
+            if self._should_trace(async_flag, trace_flag):
+                trace_data = (
+                    extract_trace_data_from_mcps(response, [mcp]) if response else None
+                )
+
+        if trace_data:
+            self._await_status(
+                [trace_data],
+                trace_timeout,
+            )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
-        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
 
+        if self._openapi_ingestion:
+            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
+        else:
+            return self._emit_restli_mcps(mcps, async_flag)
+
+    def _emit_openapi_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> int:
+        """
+        1. Grouping MCPs by their entity URL
+        2. Breaking down large batches into smaller chunks based on both:
+           * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
+           * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
+
+        The Chunk class encapsulates both the items and their byte size tracking
+        Serializing the items only once with json.dumps(request[1]) and reusing that
+        The chunking logic handles edge cases (always accepting at least one item per chunk)
+        The joining logic is efficient with a simple string concatenation
+
+        :param mcps: metadata change proposals to transmit
+        :param async_flag: the mode
+        :return: number of requests
+        """
+        # group by entity url
+        batches: Dict[str, List[_Chunk]] = defaultdict(
+            lambda: [_Chunk(items=[])]
+        )  # Initialize with one empty Chunk
+
+        for mcp in mcps:
+            request = self._to_openapi_request(mcp, async_flag, async_default=True)
+            if request:
+                current_chunk = batches[request[0]][-1]  # Get the last chunk
+                # Only serialize once
+                serialized_item = json.dumps(request[1][0])
+                item_bytes = len(serialized_item.encode())
+
+                # If adding this item would exceed max_bytes, create a new chunk
+                # Unless the chunk is empty (always add at least one item)
+                if current_chunk.items and (
+                    current_chunk.total_bytes + item_bytes > INGEST_MAX_PAYLOAD_BYTES
+                    or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+                ):
+                    new_chunk = _Chunk(items=[])
+                    batches[request[0]].append(new_chunk)
+                    current_chunk = new_chunk
+
+                current_chunk.add_item(serialized_item)
+
+        responses = []
+        for url, chunks in batches.items():
+            for chunk in chunks:
+                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                responses.append(response)
+
+        if self._should_trace(async_flag, trace_flag, async_default=True):
+            trace_data = []
+            for response in responses:
+                data = extract_trace_data(response) if response else None
+                if data is not None:
+                    trace_data.append(data)
+
+            if trace_data:
+                self._await_status(trace_data, trace_timeout)
+
+        return len(responses)
+
+    def _emit_restli_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+    ) -> int:
+        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
@@ -392,7 +608,10 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: str) -> None:
+    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+        if not isinstance(payload, str):
+            payload = json.dumps(payload)
+
         curl_command = make_curl_command(self._session, "POST", url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
@@ -408,6 +627,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         try:
             response = self._session.post(url, data=payload)
             response.raise_for_status()
+            return response
         except HTTPError as e:
             try:
                 info: Dict = response.json()
@@ -438,6 +658,99 @@ class DataHubRestEmitter(Closeable, Emitter):
                 "Unable to emit metadata to DataHub GMS", {"message": str(e)}
             ) from e
 
+    def _await_status(
+        self,
+        trace_data: List[TraceData],
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None:
+        """Verify the status of asynchronous write operations.
+        Args:
+            trace_data: List of trace data to verify
+            trace_timeout: Maximum time to wait for verification.
+        Raises:
+            TraceTimeoutError: If verification fails or times out
+            TraceValidationError: Expected write was not completed successfully
+        """
+        if trace_timeout is None:
+            raise ValueError("trace_timeout cannot be None")
+
+        try:
+            if not trace_data:
+                logger.debug("No trace data to verify")
+                return
+
+            start_time = datetime.now()
+
+            for trace in trace_data:
+                current_backoff = TRACE_INITIAL_BACKOFF
+
+                while trace.data:
+                    if datetime.now() - start_time > trace_timeout:
+                        raise TraceTimeoutError(
+                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        )
+
+                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
+                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"
+
+                    response = self._emit_generic(url, payload=trace.data)
+                    json_data = response.json()
+
+                    for urn, aspects in json_data.items():
+                        for aspect_name, aspect_status in aspects.items():
+                            if not aspect_status["success"]:
+                                error_msg = (
+                                    f"Unable to validate async write to DataHub GMS: "
+                                    f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
+                                    f"Status: {aspect_status}"
+                                )
+                                raise TraceValidationError(error_msg, aspect_status)
+
+                            primary_storage = aspect_status["primaryStorage"][
+                                "writeStatus"
+                            ]
+                            search_storage = aspect_status["searchStorage"][
+                                "writeStatus"
+                            ]
+
+                            # Remove resolved statuses
+                            if (
+                                primary_storage != TRACE_PENDING_STATUS
+                                and search_storage != TRACE_PENDING_STATUS
+                            ):
+                                trace.data[urn].remove(aspect_name)
+
+                        # Remove urns with all statuses resolved
+                        if not trace.data[urn]:
+                            trace.data.pop(urn)
+
+                    # Adjust backoff based on response
+                    if trace.data:
+                        # If we still have pending items, increase backoff
+                        current_backoff = min(
+                            current_backoff * TRACE_BACKOFF_FACTOR, TRACE_MAX_BACKOFF
+                        )
+                        logger.debug(
+                            f"Waiting {current_backoff} seconds before next check"
+                        )
+                        time.sleep(current_backoff)
+
+        except Exception as e:
+            logger.error(f"Error during status verification: {str(e)}")
+            raise
+
+    def _should_trace(
+        self,
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> bool:
+        resolved_trace_flag = (
+            trace_flag if trace_flag is not None else self._default_trace_mode
+        )
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+        return resolved_trace_flag and resolved_async_flag
+
     def __repr__(self) -> str:
         token_str = (
             f" with token: {self._token[:4]}**********{self._token[-4:]}"
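For orientation, here is a minimal usage sketch of the new emitter options introduced above (openapi_ingestion, default_trace_mode, and the trace_flag/trace_timeout arguments on emit_mcp). Only the parameter names and defaults come from the diff; the GMS URL, dataset URN, and aspect are illustrative. Note that per _should_trace, tracing only applies to asynchronous writes.

from datetime import timedelta

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DataHubRestEmitter
from datahub.metadata.schema_classes import StatusClass

# Route writes through the OpenAPI endpoint and opt in to API tracing by default.
emitter = DataHubRestEmitter(
    gms_server="http://localhost:8080",
    openapi_ingestion=True,
    default_trace_mode=True,
)

# Illustrative MCP: mark a (hypothetical) dataset as not removed.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
    aspect=StatusClass(removed=False),
)

# With async_flag and trace_flag set, _await_status polls /openapi/v1/trace/write
# with exponential backoff until the aspect reaches both primary and search
# storage, or trace_timeout elapses.
emitter.emit_mcp(
    mcp,
    async_flag=True,
    trace_flag=True,
    trace_timeout=timedelta(minutes=10),
)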
datahub/ingestion/api/decorators.py
CHANGED

@@ -3,7 +3,7 @@ from enum import Enum, auto
 from typing import Callable, Dict, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.api.source import (
+from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
|
|
|
250
250
|
emitted_urns: Set[str] = set()
|
|
251
251
|
containers_used_as_parent: Set[str] = set()
|
|
252
252
|
for urn, batch in _batch_workunits_by_urn(stream):
|
|
253
|
+
# Do not generate browse path v2 for entities that do not support it
|
|
254
|
+
if not entity_supports_aspect(guess_entity_type(urn), BrowsePathsV2Class):
|
|
255
|
+
yield from batch
|
|
256
|
+
continue
|
|
253
257
|
container_path: Optional[List[BrowsePathEntryClass]] = None
|
|
254
258
|
legacy_path: Optional[List[BrowsePathEntryClass]] = None
|
|
255
259
|
browse_path_v2: Optional[List[BrowsePathEntryClass]] = None
|
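The guard added to auto_browse_path_v2 passes through workunits whose entity type cannot carry the browsePathsV2 aspect. A small sketch of the idea follows; the import paths of guess_entity_type and entity_supports_aspect are assumptions based on the wider datahub package, since the hunk above only shows their call sites.

# Import locations are assumed; only the call sites appear in the hunk above.
from datahub.emitter.mce_builder import guess_entity_type
from datahub.metadata.schema_classes import BrowsePathsV2Class, entity_supports_aspect

for urn in [
    "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",  # illustrative
    "urn:li:corpuser:jdoe",  # illustrative
]:
    entity_type = guess_entity_type(urn)  # e.g. "dataset" or "corpuser"
    if not entity_supports_aspect(entity_type, BrowsePathsV2Class):
        # In auto_browse_path_v2, such workunits are yielded unchanged.
        print(f"skip browse path generation for {entity_type}")
    else:
        print(f"{entity_type} can receive a generated browsePathsV2")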
datahub/ingestion/fs/s3_fs.py
CHANGED
@@ -48,12 +48,12 @@ class S3ListIterator(Iterator):
     def __next__(self) -> FileInfo:
         try:
             return next(self._file_statuses)
-        except StopIteration:
+        except StopIteration as e:
             if self._token:
                 self.fetch()
                 return next(self._file_statuses)
             else:
-                raise
+                raise e
 
     def fetch(self):
         params = dict(Bucket=self._bucket, Prefix=self._prefix, MaxKeys=self._max_keys)
datahub/ingestion/glossary/classification_mixin.py
CHANGED

@@ -279,11 +279,7 @@ class ClassificationHandler:
                         "Dataset_Name": dataset_name,
                     }
                 ),
-                values=(
-                    sample_data[schema_field.fieldPath]
-                    if schema_field.fieldPath in sample_data.keys()
-                    else []
-                ),
+                values=sample_data.get(schema_field.fieldPath, []),
             )
         )
 