acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +3 -5
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +3 -3
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +6 -12
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +7 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +251 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +29 -5
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +20 -13
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/emitter/mce_builder.py
CHANGED

@@ -52,7 +52,15 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     _Aspect as AspectAbstract,
 )
-from datahub.metadata.urns import
+from datahub.metadata.urns import (
+    ChartUrn,
+    DashboardUrn,
+    DataFlowUrn,
+    DataJobUrn,
+    DataPlatformUrn,
+    DatasetUrn,
+    TagUrn,
+)
 from datahub.utilities.urn_encoder import UrnEncoder
 
 logger = logging.getLogger(__name__)
@@ -119,7 +127,7 @@ def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
 def make_data_platform_urn(platform: str) -> str:
     if platform.startswith("urn:li:dataPlatform:"):
         return platform
-    return
+    return DataPlatformUrn.create_from_id(platform).urn()
 
 
 def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
@@ -236,7 +244,7 @@ def make_user_urn(username: str) -> str:
     Makes a user urn if the input is not a user or group urn already
     """
     return (
-        f"urn:li:corpuser:{username}"
+        f"urn:li:corpuser:{UrnEncoder.encode_string(username)}"
         if not username.startswith(("urn:li:corpuser:", "urn:li:corpGroup:"))
         else username
     )
@@ -249,7 +257,7 @@ def make_group_urn(groupname: str) -> str:
     if groupname and groupname.startswith(("urn:li:corpGroup:", "urn:li:corpuser:")):
         return groupname
     else:
-        return f"urn:li:corpGroup:{groupname}"
+        return f"urn:li:corpGroup:{UrnEncoder.encode_string(groupname)}"
 
 
 def make_tag_urn(tag: str) -> str:
@@ -301,7 +309,12 @@ def make_data_flow_urn(
 
 
 def make_data_job_urn_with_flow(flow_urn: str, job_id: str) -> str:
-
+    data_flow_urn = DataFlowUrn.from_string(flow_urn)
+    data_job_urn = DataJobUrn.create_from_ids(
+        data_flow_urn=data_flow_urn.urn(),
+        job_id=job_id,
+    )
+    return data_job_urn.urn()
 
 
 def make_data_process_instance_urn(dataProcessInstanceId: str) -> str:
@@ -324,10 +337,11 @@ def make_dashboard_urn(
     platform: str, name: str, platform_instance: Optional[str] = None
 ) -> str:
     # FIXME: dashboards don't currently include data platform urn prefixes.
-
-
-
-
+    return DashboardUrn.create_from_ids(
+        platform=platform,
+        name=name,
+        platform_instance=platform_instance,
+    ).urn()
 
 
 def dashboard_urn_to_key(dashboard_urn: str) -> Optional[DashboardKeyClass]:
@@ -342,10 +356,11 @@ def make_chart_urn(
     platform: str, name: str, platform_instance: Optional[str] = None
 ) -> str:
     # FIXME: charts don't currently include data platform urn prefixes.
-
-
-
-
+    return ChartUrn.create_from_ids(
+        platform=platform,
+        name=name,
+        platform_instance=platform_instance,
+    ).urn()
 
 
 def chart_urn_to_key(chart_urn: str) -> Optional[ChartKeyClass]:
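The practical effect of the mce_builder changes above is that the URN helpers now build URNs through the typed classes in datahub.metadata.urns and run raw names through UrnEncoder. A minimal illustrative sketch (not part of the diff) of the expected behaviour:

# Sketch only: platform ids are wrapped via DataPlatformUrn rather than manual
# string formatting; user and group names are encoded, so reserved characters
# are escaped instead of being embedded verbatim in the URN.
from datahub.emitter.mce_builder import (
    make_data_platform_urn,
    make_group_urn,
    make_user_urn,
)

assert make_data_platform_urn("snowflake") == "urn:li:dataPlatform:snowflake"

# Inputs that are already URNs are passed through unchanged.
assert make_user_urn("urn:li:corpuser:jdoe") == "urn:li:corpuser:jdoe"

print(make_user_urn("jane.doe"))        # urn:li:corpuser:jane.doe
print(make_group_urn("data-platform"))  # urn:li:corpGroup:data-platform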
datahub/emitter/mcp_builder.py
CHANGED

@@ -36,7 +36,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
     TagAssociationClass,
 )
-from datahub.metadata.urns import StructuredPropertyUrn
+from datahub.metadata.urns import ContainerUrn, StructuredPropertyUrn
 
 # In https://github.com/datahub-project/datahub/pull/11214, we added a
 # new env field to container properties. However, populating this field
@@ -87,6 +87,9 @@ class ContainerKey(DatahubKey):
     def property_dict(self) -> Dict[str, str]:
         return self.dict(by_alias=True, exclude_none=True)
 
+    def as_urn_typed(self) -> ContainerUrn:
+        return ContainerUrn.from_string(self.as_urn())
+
     def as_urn(self) -> str:
         return make_container_urn(guid=self.guid())
 
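With the new ContainerKey.as_urn_typed() helper, code that previously passed container URN strings around can work with a typed ContainerUrn instead. A small illustrative sketch (using DatabaseKey, one of the existing ContainerKey subclasses; not part of the diff):

# as_urn() still returns the container URN as a plain string, while
# as_urn_typed() parses that same string into a ContainerUrn object.
from datahub.emitter.mcp_builder import DatabaseKey
from datahub.metadata.urns import ContainerUrn

key = DatabaseKey(platform="snowflake", database="analytics")

container_str = key.as_urn()
container_urn = key.as_urn_typed()

assert isinstance(container_urn, ContainerUrn)
assert container_urn.urn() == container_str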
datahub/emitter/response_helper.py
ADDED

@@ -0,0 +1,145 @@
+import json
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Union
+
+from requests import Response
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+    MetadataChangeProposal,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TraceData:
+    trace_id: str
+    data: Dict[str, List[str]]
+
+    def __post_init__(self) -> None:
+        if not self.trace_id:
+            raise ValueError("trace_id cannot be empty")
+        if not isinstance(self.data, dict):
+            raise TypeError("data must be a dictionary")
+
+
+def _extract_trace_id(
+    response: Response, trace_header: str = "traceparent"
+) -> Optional[str]:
+    """
+    Extract trace ID from response headers.
+    Args:
+        response: HTTP response object
+        trace_header: Name of the trace header to use
+    Returns:
+        Trace ID if found and response is valid, None otherwise
+    """
+    if not 200 <= response.status_code < 300:
+        logger.debug(f"Invalid status code: {response.status_code}")
+        return None
+
+    trace_id = response.headers.get(trace_header)
+    if not trace_id:
+        logger.debug(f"Missing trace header: {trace_header}")
+        return None
+
+    return trace_id
+
+
+def extract_trace_data(
+    response: Response,
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object.
+    Args:
+        response: HTTP response object
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    Raises:
+        JSONDecodeError: If response body cannot be decoded as JSON
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    try:
+        json_data = response.json()
+        if not isinstance(json_data, list):
+            logger.debug("JSON data is not a list")
+            return None
+
+        data: Dict[str, List[str]] = {}
+
+        for item in json_data:
+            urn = item.get("urn")
+            if not urn:
+                logger.debug(f"Skipping item without URN: {item}")
+                continue
+
+            if aspects_to_trace is None:
+                aspect_names = [
+                    k for k, v in item.items() if k != "urn" and v is not None
+                ]
+            else:
+                aspect_names = [
+                    field for field in aspects_to_trace if item.get(field) is not None
+                ]
+
+            data[urn] = aspect_names
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to decode JSON response: {e}")
+        return None
+
+
+def extract_trace_data_from_mcps(
+    response: Response,
+    mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object and populate data from provided MCPs.
+    Args:
+        response: HTTP response object used only for trace_id extraction
+        mcps: List of MCP URN and aspect data
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    data: Dict[str, List[str]] = {}
+    try:
+        for mcp in mcps:
+            entity_urn = getattr(mcp, "entityUrn", None)
+            aspect_name = getattr(mcp, "aspectName", None)
+
+            if not entity_urn or not aspect_name:
+                logger.debug(f"Skipping MCP with missing URN or aspect name: {mcp}")
+                continue
+
+            if aspects_to_trace is not None and aspect_name not in aspects_to_trace:
+                continue
+
+            if entity_urn not in data:
+                data[entity_urn] = []
+
+            data[entity_urn].append(aspect_name)
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except AttributeError as e:
+        logger.error(f"Error processing MCPs: {e}")
+        return None
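The new response_helper module is what the REST emitter uses to turn an ingest response into the set of URN/aspect pairs whose asynchronous persistence can still be polled. A rough usage sketch (assuming a requests.Response from an OpenAPI ingest call that carries a traceparent header and a JSON list body; not part of the diff):

from datahub.emitter.response_helper import extract_trace_data


def report_pending_aspects(response) -> None:
    # Returns None for non-2xx responses or when no traceparent header is present.
    trace = extract_trace_data(response)
    if trace is None:
        return
    # trace.trace_id comes from the traceparent header; trace.data maps each
    # entity URN to the aspect names that were accepted for async processing.
    for urn, aspects in trace.data.items():
        print(f"{urn}: waiting on {aspects} (trace {trace.trace_id})")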
datahub/emitter/rest_emitter.py
CHANGED

@@ -4,6 +4,11 @@ import functools
 import json
 import logging
 import os
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import auto
 from json.decoder import JSONDecodeError
 from typing import (
     TYPE_CHECKING,
@@ -17,6 +22,7 @@ from typing import (
     Union,
 )
 
+import pydantic
 import requests
 from deprecated import deprecated
 from requests.adapters import HTTPAdapter, Retry
@@ -27,13 +33,22 @@ from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
+    ConfigEnum,
     ConfigModel,
     ConfigurationError,
     OperationalError,
+    TraceTimeoutError,
+    TraceValidationError,
 )
+from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.response_helper import (
+    TraceData,
+    extract_trace_data,
+    extract_trace_data_from_mcps,
+)
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
@@ -63,6 +78,11 @@ _DEFAULT_RETRY_MAX_TIMES = int(
 
 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
 
+TRACE_PENDING_STATUS = "PENDING"
+TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
+TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
+TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -77,6 +97,29 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RestTraceMode(ConfigEnum):
+    ENABLED = auto()
+    DISABLED = auto()
+
+
+class RestSinkEndpoint(ConfigEnum):
+    RESTLI = auto()
+    OPENAPI = auto()
+
+
+DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
+    RestSinkEndpoint,
+    os.getenv("DATAHUB_REST_SINK_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
+)
+
+
+# Supported with v1.0
+DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
+    RestTraceMode,
+    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+)
+
+
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
 
@@ -143,10 +186,32 @@ class RequestsSessionConfig(ConfigModel):
         return session
 
 
+@dataclass
+class _Chunk:
+    items: List[str]
+    total_bytes: int = 0
+
+    def add_item(self, item: str) -> bool:
+        item_bytes = len(item.encode())
+        if not self.items:  # Always add at least one item even if over byte limit
+            self.items.append(item)
+            self.total_bytes += item_bytes
+            return True
+        self.items.append(item)
+        self.total_bytes += item_bytes
+        return True
+
+    @staticmethod
+    def join(chunk: "_Chunk") -> str:
+        return "[" + ",".join(chunk.items) + "]"
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
+    _openapi_ingestion: bool
+    _default_trace_mode: bool
 
     def __init__(
         self,
@@ -162,6 +227,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
+        openapi_ingestion: bool = False,
+        default_trace_mode: bool = False,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -174,9 +241,17 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
         self.server_config: Dict[str, Any] = {}
-
+        self._openapi_ingestion = openapi_ingestion
+        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
 
+        logger.debug(
+            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        )
+
+        if self._default_trace_mode:
+            logger.debug("Using API Tracing for ingestion.")
+
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "X-DataHub-Py-Cli-Version": nice_version_name(),
@@ -264,6 +339,43 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         return DataHubGraph.from_emitter(self)
 
+    def _to_openapi_request(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
+        if mcp.aspect and mcp.aspectName:
+            resolved_async_flag = (
+                async_flag if async_flag is not None else async_default
+            )
+            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+
+            if isinstance(mcp, MetadataChangeProposalWrapper):
+                aspect_value = pre_json_transform(
+                    mcp.to_obj(simplified_structure=True)
+                )["aspect"]["json"]
+            else:
+                obj = mcp.aspect.to_obj()
+                if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
+                    obj = json.loads(obj["value"])
+                aspect_value = pre_json_transform(obj)
+            return (
+                url,
+                [
+                    {
+                        "urn": mcp.entityUrn,
+                        mcp.aspectName: {
+                            "value": aspect_value,
+                            "systemMetadata": mcp.systemMetadata.to_obj()
+                            if mcp.systemMetadata
+                            else None,
+                        },
+                    }
+                ],
+            )
+        return None
+
     def emit(
         self,
         item: Union[
@@ -316,31 +428,135 @@ class DataHubRestEmitter(Closeable, Emitter):
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
-        url = f"{self._gms_server}/aspects?action=ingestProposal"
         ensure_has_system_metadata(mcp)
 
-
-        payload_dict = {"proposal": mcp_obj}
+        trace_data = None
 
-        if
-
+        if self._openapi_ingestion:
+            request = self._to_openapi_request(mcp, async_flag, async_default=False)
+            if request:
+                response = self._emit_generic(request[0], payload=request[1])
 
-
+                if self._should_trace(async_flag, trace_flag):
+                    trace_data = extract_trace_data(response) if response else None
 
-
+        else:
+            url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+            mcp_obj = pre_json_transform(mcp.to_obj())
+            payload_dict = {"proposal": mcp_obj}
+
+            if async_flag is not None:
+                payload_dict["async"] = "true" if async_flag else "false"
+
+            payload = json.dumps(payload_dict)
+
+            response = self._emit_generic(url, payload)
+
+            if self._should_trace(async_flag, trace_flag):
+                trace_data = (
+                    extract_trace_data_from_mcps(response, [mcp]) if response else None
+                )
+
+        if trace_data:
+            self._await_status(
+                [trace_data],
+                trace_timeout,
+            )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
    ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
-
+
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
 
+        if self._openapi_ingestion:
+            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
+        else:
+            return self._emit_restli_mcps(mcps, async_flag)
+
+    def _emit_openapi_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> int:
+        """
+        1. Grouping MCPs by their entity URL
+        2. Breaking down large batches into smaller chunks based on both:
+           * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
+           * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
+
+        The Chunk class encapsulates both the items and their byte size tracking
+        Serializing the items only once with json.dumps(request[1]) and reusing that
+        The chunking logic handles edge cases (always accepting at least one item per chunk)
+        The joining logic is efficient with a simple string concatenation
+
+        :param mcps: metadata change proposals to transmit
+        :param async_flag: the mode
+        :return: number of requests
+        """
+        # group by entity url
+        batches: Dict[str, List[_Chunk]] = defaultdict(
+            lambda: [_Chunk(items=[])]
+        )  # Initialize with one empty Chunk
+
+        for mcp in mcps:
+            request = self._to_openapi_request(mcp, async_flag, async_default=True)
+            if request:
+                current_chunk = batches[request[0]][-1]  # Get the last chunk
+                # Only serialize once
+                serialized_item = json.dumps(request[1][0])
+                item_bytes = len(serialized_item.encode())
+
+                # If adding this item would exceed max_bytes, create a new chunk
+                # Unless the chunk is empty (always add at least one item)
+                if current_chunk.items and (
+                    current_chunk.total_bytes + item_bytes > INGEST_MAX_PAYLOAD_BYTES
+                    or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+                ):
+                    new_chunk = _Chunk(items=[])
+                    batches[request[0]].append(new_chunk)
+                    current_chunk = new_chunk
+
+                current_chunk.add_item(serialized_item)
+
+        responses = []
+        for url, chunks in batches.items():
+            for chunk in chunks:
+                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                responses.append(response)
+
+        if self._should_trace(async_flag, trace_flag, async_default=True):
+            trace_data = []
+            for response in responses:
+                data = extract_trace_data(response) if response else None
+                if data is not None:
+                    trace_data.append(data)
+
+            if trace_data:
+                self._await_status(trace_data, trace_timeout)
+
+        return len(responses)
+
+    def _emit_restli_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+    ) -> int:
+        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
@@ -392,7 +608,10 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: str) ->
+    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+        if not isinstance(payload, str):
+            payload = json.dumps(payload)
+
         curl_command = make_curl_command(self._session, "POST", url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
@@ -408,6 +627,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         try:
             response = self._session.post(url, data=payload)
             response.raise_for_status()
+            return response
         except HTTPError as e:
             try:
                 info: Dict = response.json()
@@ -438,6 +658,99 @@ class DataHubRestEmitter(Closeable, Emitter):
                 "Unable to emit metadata to DataHub GMS", {"message": str(e)}
             ) from e
 
+    def _await_status(
+        self,
+        trace_data: List[TraceData],
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None:
+        """Verify the status of asynchronous write operations.
+        Args:
+            trace_data: List of trace data to verify
+            trace_timeout: Maximum time to wait for verification.
+        Raises:
+            TraceTimeoutError: If verification fails or times out
+            TraceValidationError: Expected write was not completed successfully
+        """
+        if trace_timeout is None:
+            raise ValueError("trace_timeout cannot be None")
+
+        try:
+            if not trace_data:
+                logger.debug("No trace data to verify")
+                return
+
+            start_time = datetime.now()
+
+            for trace in trace_data:
+                current_backoff = TRACE_INITIAL_BACKOFF
+
+                while trace.data:
+                    if datetime.now() - start_time > trace_timeout:
+                        raise TraceTimeoutError(
+                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        )
+
+                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
+                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"
+
+                    response = self._emit_generic(url, payload=trace.data)
+                    json_data = response.json()
+
+                    for urn, aspects in json_data.items():
+                        for aspect_name, aspect_status in aspects.items():
+                            if not aspect_status["success"]:
+                                error_msg = (
+                                    f"Unable to validate async write to DataHub GMS: "
+                                    f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
+                                    f"Status: {aspect_status}"
+                                )
+                                raise TraceValidationError(error_msg, aspect_status)
+
+                            primary_storage = aspect_status["primaryStorage"][
+                                "writeStatus"
+                            ]
+                            search_storage = aspect_status["searchStorage"][
+                                "writeStatus"
+                            ]
+
+                            # Remove resolved statuses
+                            if (
+                                primary_storage != TRACE_PENDING_STATUS
+                                and search_storage != TRACE_PENDING_STATUS
+                            ):
+                                trace.data[urn].remove(aspect_name)
+
+                        # Remove urns with all statuses resolved
+                        if not trace.data[urn]:
+                            trace.data.pop(urn)
+
+                    # Adjust backoff based on response
+                    if trace.data:
+                        # If we still have pending items, increase backoff
+                        current_backoff = min(
+                            current_backoff * TRACE_BACKOFF_FACTOR, TRACE_MAX_BACKOFF
+                        )
+                        logger.debug(
+                            f"Waiting {current_backoff} seconds before next check"
+                        )
+                        time.sleep(current_backoff)
+
+        except Exception as e:
+            logger.error(f"Error during status verification: {str(e)}")
+            raise
+
+    def _should_trace(
+        self,
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> bool:
+        resolved_trace_flag = (
+            trace_flag if trace_flag is not None else self._default_trace_mode
+        )
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+        return resolved_trace_flag and resolved_async_flag
+
     def __repr__(self) -> str:
         token_str = (
            f" with token: {self._token[:4]}**********{self._token[-4:]}"