acryl-datahub 1.0.0rc16__py3-none-any.whl → 1.0.0rc17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/METADATA +2571 -2541
- {acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/RECORD +17 -14
- {acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/configuration/common.py +8 -0
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +161 -3
- datahub/ingestion/graph/client.py +3 -0
- datahub/ingestion/sink/datahub_rest.py +4 -0
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/salesforce.py +529 -276
- datahub/ingestion/source/sql/hive.py +13 -0
- datahub/ingestion/source/vertexai.py +697 -0
- {acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=
+datahub/_version.py,sha256=Pm6pv1SEZW2WH_7EUSAIoR-6UVTQyPmr1utervGvb6M,322
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -93,7 +93,7 @@ datahub/cli/specific/structuredproperties_cli.py,sha256=Rb06XJKxqda5RSUo188B90Wn
 datahub/cli/specific/user_cli.py,sha256=jGAokb1NRu8obs6P2g4OL2NQdFgpUBa9De55TBBtun0,1897
 datahub/configuration/__init__.py,sha256=5TN3a7CWNsLRHpdj-sv2bxKWF2IslvJwE6EpNMFrIS4,123
 datahub/configuration/_config_enum.py,sha256=ul2hr5gMmdLvBINicFkMNMi1ApmnmZSwNdUYYted5nk,1447
-datahub/configuration/common.py,sha256=
+datahub/configuration/common.py,sha256=GKRNgRCBsFUmZb6huD6DPNYUKNRxUbcHfFazHLte7wk,10626
 datahub/configuration/config_loader.py,sha256=hRzPFxkz-w9IqkpSa5vwCzSra1p49DyfeJNeyqGa8-4,6827
 datahub/configuration/connection_resolver.py,sha256=n4-6MwMiOEDgTouxO0SMjTILKVhJPo6-naE6FuR5qMs,1516
 datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvrkosPg,2870
@@ -124,7 +124,8 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
 datahub/emitter/mcp_builder.py,sha256=Q1bX2BthNvZ7ae71XYF6ICoiN8IOqaAd_h3zOct57Q0,11752
 datahub/emitter/mcp_patch_builder.py,sha256=u7cpW6DkiN7KpLapmMaXgL_FneoN69boxiANbVgMdSI,4564
 datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
-datahub/emitter/rest_emitter.py,sha256=
+datahub/emitter/response_helper.py,sha256=lRMvzF-RPHNkN_ONl-N2uJjKh5XtRFrofrdGibVGn2U,4509
+datahub/emitter/rest_emitter.py,sha256=LyJuTZicSxzyLlwCyVzecjuyFDH0HkvQmSh037OLBc8,29777
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -169,7 +170,7 @@ datahub/ingestion/glossary/classifier.py,sha256=daLxnVv_JlfB_jBOxH5LrU_xQRndrsGo
 datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
 datahub/ingestion/glossary/datahub_classifier.py,sha256=O7wm6gQT1Jf2QSKdWjJQbS5oSzJwplXzfza26Gdq5Mg,7555
 datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/graph/client.py,sha256=
+datahub/ingestion/graph/client.py,sha256=4F-IMnz4DwIdLPL5yXMRqqlB3uonN2OHOUIKKHByS7o,65680
 datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
 datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
 datahub/ingestion/graph/entity_versioning.py,sha256=nrcNz0Qm6kpE6oTu_mrYUQDx14KPspBTc6R9SyFUY6c,6901
@@ -188,7 +189,7 @@ datahub/ingestion/sink/blackhole.py,sha256=-jYcWo4i8q7312bCIoHrGr7nT9JdPvA7c4jvS
 datahub/ingestion/sink/console.py,sha256=TZfhA0Ec2eNCrMH7RRy2JOdUE-U-hkoIQrPm1CmKLQs,591
 datahub/ingestion/sink/datahub_kafka.py,sha256=_cjuXu5I6G0zJ2UK7hMbaKjMPZXeIwRMgm7CVeTiNtc,2578
 datahub/ingestion/sink/datahub_lite.py,sha256=7u2aWm7ENLshKHl-PkjJg6Mrw4bWs8sTfKIBz4mm8Ak,1879
-datahub/ingestion/sink/datahub_rest.py,sha256=
+datahub/ingestion/sink/datahub_rest.py,sha256=4hvMDUxHMJXGgk3Iy7fcYGKixjvVd9DHD03X-F3kOg0,12976
 datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
 datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
 datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -212,10 +213,11 @@ datahub/ingestion/source/openapi_parser.py,sha256=K3Z4aLXBQX8eR7tvk9iZakOjghjezx
 datahub/ingestion/source/preset.py,sha256=bbh0ZWiAZMy2zuJDmaRY07_OuGJ9tdtKjwvIxqbY5II,3964
 datahub/ingestion/source/pulsar.py,sha256=u5F8QnCLJsht5-7XCiUTsnfhCPIpKVB_l32CgMCU-As,20187
 datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99WdvcYiA,30653
-datahub/ingestion/source/salesforce.py,sha256=
+datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
 datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
 datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
 datahub/ingestion/source/superset.py,sha256=WrpCiZEC17cmFGcfUTTqUdnKASq7ZpT0ih-4xqB9qt4,30976
+datahub/ingestion/source/vertexai.py,sha256=uOtIgHwsH--hkAFqspXGoNN-jHip16s6m5lyvwi-jrg,27735
 datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
 datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -245,7 +247,7 @@ datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeu
 datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=fADrYPiQaiJYvLOrltgv8RJIV9VV2y7vjh3s0zHW6Cw,13950
 datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=kEwWhq3ch6WT4q4hcX8-fvQh28KgrNfspFwIytO3vQA,25103
 datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
-datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=
+datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=66c7VPPcX1Eamm14VoEZzSPngMiwsNZOeymWBBRsznk,24420
 datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py,sha256=DeT3v_Z82__8En0FcZ0kavBAWQoRvSZ5Rppm9eeDAb8,2393
 datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7cBso7ZzrWl_vO5PYSa6CpvqNx8,1554
 datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256=9_sfX8BE2vt9RjBMyq27UxCxBaSlD5o3L4gQxrwlPvA,4961
@@ -268,6 +270,7 @@ datahub/ingestion/source/cassandra/cassandra_profiling.py,sha256=DkSIryZNwLei5Pa
 datahub/ingestion/source/cassandra/cassandra_utils.py,sha256=j-LidYkaCTmGnpUVNLsax_c3z32PsQbsbHeYojygd1s,5105
 datahub/ingestion/source/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/common/data_reader.py,sha256=XbSxiRTYrk6seOz0ZjVjzSpGvP8lEjmqXrNI4cdYYmQ,1819
+datahub/ingestion/source/common/gcp_credentials_config.py,sha256=MM902YksVGv-yCmUTimBXF9YreHUDo3PW3o1Wasb2FE,2212
 datahub/ingestion/source/common/subtypes.py,sha256=LCJefUZ9o8yyhNXOy_HJefBOt93Cmn9r3m4VtCiK4iM,2643
 datahub/ingestion/source/data_lake_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/data_lake_common/config.py,sha256=qUk83B01hjuBKHvVz8SmXnVCy5eFj-2-2QLEOrAdbgk,359
@@ -461,7 +464,7 @@ datahub/ingestion/source/sql/clickhouse.py,sha256=uSRy-HKAiGFTHVLoVtGoh23X0O1lwy
 datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2qkzjH6CpTOgkx4,1443
 datahub/ingestion/source/sql/druid.py,sha256=IjGZdntb5hubkIzzT9qDRDpyfbckEg2GwRncvC5mDSs,2722
 datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
-datahub/ingestion/source/sql/hive.py,sha256=
+datahub/ingestion/source/sql/hive.py,sha256=n0XCGkNkVAe-TEyXbxlefvohbmtALbWaC1a0_B9rlG8,30670
 datahub/ingestion/source/sql/hive_metastore.py,sha256=HW0zoHKarBYb8oVCy5fHvPOn-pTo25LctW_AusmH0hQ,36252
 datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
 datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
@@ -1022,9 +1025,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0rc16.dist-info/LICENSE,sha256=
-acryl_datahub-1.0.0rc16.dist-info/METADATA,sha256=
-acryl_datahub-1.0.0rc16.dist-info/WHEEL,sha256=
-acryl_datahub-1.0.0rc16.dist-info/entry_points.txt,sha256=
-acryl_datahub-1.0.0rc16.dist-info/top_level.txt,sha256=
-acryl_datahub-1.0.0rc16.dist-info/RECORD,,
+acryl_datahub-1.0.0rc17.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+acryl_datahub-1.0.0rc17.dist-info/METADATA,sha256=MDcGSuVfOxVWXtrGFaLcnCPABOS1ZCnehvPDHCWJtrk,176898
+acryl_datahub-1.0.0rc17.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+acryl_datahub-1.0.0rc17.dist-info/entry_points.txt,sha256=7-eDilp0OACUtlmmZ-LF6H9MF_SWD_bWHKNG7Dvhhos,9652
+acryl_datahub-1.0.0rc17.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0rc17.dist-info/RECORD,,
{acryl_datahub-1.0.0rc16.dist-info → acryl_datahub-1.0.0rc17.dist-info}/entry_points.txt
CHANGED

@@ -101,6 +101,7 @@ tableau = datahub.ingestion.source.tableau.tableau:TableauSource
 teradata = datahub.ingestion.source.sql.teradata:TeradataSource
 trino = datahub.ingestion.source.sql.trino:TrinoSource
 unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource
+vertexai = datahub.ingestion.source.vertexai:VertexAISource
 vertica = datahub.ingestion.source.sql.vertica:VerticaSource
 
 [datahub.ingestion.transformer.plugins]
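The new vertexai entry point registers the Vertex AI source under the wheel's source-plugin group so recipes can refer to it by name. A minimal sketch of resolving that registration with the standard library; the group name datahub.ingestion.source.plugins is an assumption inferred from the datahub.ingestion.transformer.plugins group visible above, and the group= keyword requires Python 3.10+:

from importlib.metadata import entry_points

# Group name is an assumption inferred from the transformer group above.
source_eps = entry_points(group="datahub.ingestion.source.plugins")
vertexai_ep = next(ep for ep in source_eps if ep.name == "vertexai")

# Loading the entry point imports datahub.ingestion.source.vertexai and
# returns the VertexAISource class declared in entry_points.txt.
source_cls = vertexai_ep.load()
print(source_cls.__name__)  # -> "VertexAISource"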
datahub/_version.py
CHANGED
datahub/configuration/common.py
CHANGED

@@ -198,6 +198,14 @@ class IgnorableError(MetaError):
     """An error that can be ignored."""
 
 
+class TraceTimeoutError(OperationalError):
+    """Failure to complete an API Trace within the timeout."""
+
+
+class TraceValidationError(OperationalError):
+    """Failure to complete the expected write operation."""
+
+
 @runtime_checkable
 class ExceptionWithProps(Protocol):
     def get_telemetry_props(self) -> Dict[str, Any]: ...
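Both new exception types subclass OperationalError, so existing handlers that catch OperationalError keep working, while callers that care about tracing can distinguish a timeout from a failed write. A minimal sketch of guarding an emit call, assuming an already-constructed emitter and MCP; the trace_flag parameter is introduced in the rest_emitter.py diff below:

from datahub.configuration.common import (
    OperationalError,
    TraceTimeoutError,
    TraceValidationError,
)

try:
    # emitter and mcp are assumed to exist; see the rest_emitter.py diff below.
    emitter.emit_mcp(mcp, async_flag=True, trace_flag=True)
except TraceTimeoutError:
    # The write was accepted but did not leave PENDING within trace_timeout.
    raise
except TraceValidationError:
    # GMS reported a persistence failure for at least one aspect.
    raise
except OperationalError:
    # Any other operational failure; both trace errors would also match this
    # branch if the two handlers above were removed.
    raise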
datahub/emitter/response_helper.py
ADDED

@@ -0,0 +1,145 @@
+import json
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Union
+
+from requests import Response
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+    MetadataChangeProposal,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TraceData:
+    trace_id: str
+    data: Dict[str, List[str]]
+
+    def __post_init__(self) -> None:
+        if not self.trace_id:
+            raise ValueError("trace_id cannot be empty")
+        if not isinstance(self.data, dict):
+            raise TypeError("data must be a dictionary")
+
+
+def _extract_trace_id(
+    response: Response, trace_header: str = "traceparent"
+) -> Optional[str]:
+    """
+    Extract trace ID from response headers.
+    Args:
+        response: HTTP response object
+        trace_header: Name of the trace header to use
+    Returns:
+        Trace ID if found and response is valid, None otherwise
+    """
+    if not 200 <= response.status_code < 300:
+        logger.debug(f"Invalid status code: {response.status_code}")
+        return None
+
+    trace_id = response.headers.get(trace_header)
+    if not trace_id:
+        logger.debug(f"Missing trace header: {trace_header}")
+        return None
+
+    return trace_id
+
+
+def extract_trace_data(
+    response: Response,
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object.
+    Args:
+        response: HTTP response object
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    Raises:
+        JSONDecodeError: If response body cannot be decoded as JSON
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    try:
+        json_data = response.json()
+        if not isinstance(json_data, list):
+            logger.debug("JSON data is not a list")
+            return None
+
+        data: Dict[str, List[str]] = {}
+
+        for item in json_data:
+            urn = item.get("urn")
+            if not urn:
+                logger.debug(f"Skipping item without URN: {item}")
+                continue
+
+            if aspects_to_trace is None:
+                aspect_names = [
+                    k for k, v in item.items() if k != "urn" and v is not None
+                ]
+            else:
+                aspect_names = [
+                    field for field in aspects_to_trace if item.get(field) is not None
+                ]
+
+            data[urn] = aspect_names
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to decode JSON response: {e}")
+        return None
+
+
+def extract_trace_data_from_mcps(
+    response: Response,
+    mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object and populate data from provided MCPs.
+    Args:
+        response: HTTP response object used only for trace_id extraction
+        mcps: List of MCP URN and aspect data
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    data: Dict[str, List[str]] = {}
+    try:
+        for mcp in mcps:
+            entity_urn = getattr(mcp, "entityUrn", None)
+            aspect_name = getattr(mcp, "aspectName", None)
+
+            if not entity_urn or not aspect_name:
+                logger.debug(f"Skipping MCP with missing URN or aspect name: {mcp}")
+                continue
+
+            if aspects_to_trace is not None and aspect_name not in aspects_to_trace:
+                continue
+
+            if entity_urn not in data:
+                data[entity_urn] = []
+
+            data[entity_urn].append(aspect_name)
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except AttributeError as e:
+        logger.error(f"Error processing MCPs: {e}")
+        return None
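A short sketch of what extract_trace_data returns for a successful OpenAPI ingestion response: the trace ID comes from the W3C traceparent header, and the per-URN aspect lists come from the non-null keys of each item in the JSON body. The Response here is built by hand purely for illustration (including the private _content attribute); in practice it comes from the emitter's HTTP session:

import json

from requests import Response

from datahub.emitter.response_helper import extract_trace_data

# Hand-built response for illustration only.
resp = Response()
resp.status_code = 200
resp.headers["traceparent"] = "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"
resp._content = json.dumps(
    [{"urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)", "status": {}}]
).encode("utf-8")

trace = extract_trace_data(resp)
assert trace is not None
print(trace.trace_id)  # the full traceparent header value
print(trace.data)      # {"urn:li:dataset:(...)": ["status"]}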
datahub/emitter/rest_emitter.py
CHANGED

@@ -4,8 +4,10 @@ import functools
 import json
 import logging
 import os
+import time
 from collections import defaultdict
 from dataclasses import dataclass
+from datetime import datetime, timedelta
 from enum import auto
 from json.decoder import JSONDecodeError
 from typing import (
@@ -35,11 +37,18 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
     OperationalError,
+    TraceTimeoutError,
+    TraceValidationError,
 )
 from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.response_helper import (
+    TraceData,
+    extract_trace_data,
+    extract_trace_data_from_mcps,
+)
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
@@ -69,6 +78,11 @@ _DEFAULT_RETRY_MAX_TIMES = int(
 
 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
 
+TRACE_PENDING_STATUS = "PENDING"
+TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
+TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
+TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -83,6 +97,11 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RestTraceMode(ConfigEnum):
+    ENABLED = auto()
+    DISABLED = auto()
+
+
 class RestSinkEndpoint(ConfigEnum):
     RESTLI = auto()
     OPENAPI = auto()
@@ -94,6 +113,13 @@ DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
 )
 
 
+# Supported with v1.0
+DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
+    RestTraceMode,
+    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+)
+
+
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
 
@@ -185,6 +211,7 @@ class DataHubRestEmitter(Closeable, Emitter):
     _token: Optional[str]
     _session: requests.Session
     _openapi_ingestion: bool
+    _default_trace_mode: bool
 
     def __init__(
         self,
@@ -201,6 +228,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
         openapi_ingestion: bool = False,
+        default_trace_mode: bool = False,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -214,12 +242,16 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._token = token
         self.server_config: Dict[str, Any] = {}
         self._openapi_ingestion = openapi_ingestion
+        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
 
         logger.debug(
             f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
         )
 
+        if self._default_trace_mode:
+            logger.debug("Using API Tracing for ingestion.")
+
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "X-DataHub-Py-Cli-Version": nice_version_name(),
@@ -396,13 +428,21 @@ class DataHubRestEmitter(Closeable, Emitter):
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
         ensure_has_system_metadata(mcp)
 
+        trace_data = None
+
         if self._openapi_ingestion:
             request = self._to_openapi_request(mcp, async_flag, async_default=False)
             if request:
-                self._emit_generic(request[0], payload=request[1])
+                response = self._emit_generic(request[0], payload=request[1])
+
+                if self._should_trace(async_flag, trace_flag):
+                    trace_data = extract_trace_data(response) if response else None
+
         else:
             url = f"{self._gms_server}/aspects?action=ingestProposal"
 
@@ -414,12 +454,25 @@ class DataHubRestEmitter(Closeable, Emitter):
 
             payload = json.dumps(payload_dict)
 
-            self._emit_generic(url, payload)
+            response = self._emit_generic(url, payload)
+
+            if self._should_trace(async_flag, trace_flag):
+                trace_data = (
+                    extract_trace_data_from_mcps(response, [mcp]) if response else None
+                )
+
+        if trace_data:
+            self._await_status(
+                [trace_data],
+                trace_timeout,
+            )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
@@ -428,7 +481,7 @@ class DataHubRestEmitter(Closeable, Emitter):
             ensure_has_system_metadata(mcp)
 
         if self._openapi_ingestion:
-            return self._emit_openapi_mcps(mcps, async_flag)
+            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
         else:
             return self._emit_restli_mcps(mcps, async_flag)
 
@@ -436,6 +489,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
         1. Grouping MCPs by their entity URL
@@ -483,6 +538,16 @@ class DataHubRestEmitter(Closeable, Emitter):
             response = self._emit_generic(url, payload=_Chunk.join(chunk))
             responses.append(response)
 
+        if self._should_trace(async_flag, trace_flag, async_default=True):
+            trace_data = []
+            for response in responses:
+                data = extract_trace_data(response) if response else None
+                if data is not None:
+                    trace_data.append(data)
+
+            if trace_data:
+                self._await_status(trace_data, trace_timeout)
+
         return len(responses)
 
     def _emit_restli_mcps(
@@ -593,6 +658,99 @@ class DataHubRestEmitter(Closeable, Emitter):
                 "Unable to emit metadata to DataHub GMS", {"message": str(e)}
             ) from e
 
+    def _await_status(
+        self,
+        trace_data: List[TraceData],
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None:
+        """Verify the status of asynchronous write operations.
+        Args:
+            trace_data: List of trace data to verify
+            trace_timeout: Maximum time to wait for verification.
+        Raises:
+            TraceTimeoutError: If verification fails or times out
+            TraceValidationError: Expected write was not completed successfully
+        """
+        if trace_timeout is None:
+            raise ValueError("trace_timeout cannot be None")
+
+        try:
+            if not trace_data:
+                logger.debug("No trace data to verify")
+                return
+
+            start_time = datetime.now()
+
+            for trace in trace_data:
+                current_backoff = TRACE_INITIAL_BACKOFF
+
+                while trace.data:
+                    if datetime.now() - start_time > trace_timeout:
+                        raise TraceTimeoutError(
+                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        )
+
+                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
+                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"
+
+                    response = self._emit_generic(url, payload=trace.data)
+                    json_data = response.json()
+
+                    for urn, aspects in json_data.items():
+                        for aspect_name, aspect_status in aspects.items():
+                            if not aspect_status["success"]:
+                                error_msg = (
+                                    f"Unable to validate async write to DataHub GMS: "
+                                    f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
+                                    f"Status: {aspect_status}"
+                                )
+                                raise TraceValidationError(error_msg, aspect_status)
+
+                            primary_storage = aspect_status["primaryStorage"][
+                                "writeStatus"
+                            ]
+                            search_storage = aspect_status["searchStorage"][
+                                "writeStatus"
+                            ]
+
+                            # Remove resolved statuses
+                            if (
+                                primary_storage != TRACE_PENDING_STATUS
+                                and search_storage != TRACE_PENDING_STATUS
+                            ):
+                                trace.data[urn].remove(aspect_name)
+
+                        # Remove urns with all statuses resolved
+                        if not trace.data[urn]:
+                            trace.data.pop(urn)
+
+                    # Adjust backoff based on response
+                    if trace.data:
+                        # If we still have pending items, increase backoff
+                        current_backoff = min(
+                            current_backoff * TRACE_BACKOFF_FACTOR, TRACE_MAX_BACKOFF
+                        )
+                        logger.debug(
+                            f"Waiting {current_backoff} seconds before next check"
+                        )
+                        time.sleep(current_backoff)
+
+        except Exception as e:
+            logger.error(f"Error during status verification: {str(e)}")
+            raise
+
+    def _should_trace(
+        self,
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> bool:
+        resolved_trace_flag = (
+            trace_flag if trace_flag is not None else self._default_trace_mode
+        )
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+        return resolved_trace_flag and resolved_async_flag
+
     def __repr__(self) -> str:
         token_str = (
             f" with token: {self._token[:4]}**********{self._token[-4:]}"
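Tracing only engages for asynchronous writes: _should_trace resolves trace_flag against the emitter-wide default and async_flag against the endpoint's default, and _await_status then polls {gms}/openapi/v1/trace/write/{trace_id} with exponential backoff (1 s, doubling up to a 5 min cap) until every aspect leaves PENDING in both primary and search storage, raising TraceTimeoutError or TraceValidationError otherwise. A minimal sketch of opting in per call; the server URL, token, and mcps list are placeholders:

from datetime import timedelta

from datahub.emitter.rest_emitter import DataHubRestEmitter

# Placeholder connection details; openapi_ingestion selects the OpenAPI
# endpoint, whose batch path supports trace extraction (see diff above).
emitter = DataHubRestEmitter(
    "http://localhost:8080",
    token="<token>",
    openapi_ingestion=True,
    default_trace_mode=False,  # or export DATAHUB_REST_TRACE_MODE=ENABLED
)

# mcps is an assumed, already-built list of MetadataChangeProposalWrapper.
emitter.emit_mcps(
    mcps,
    async_flag=True,   # tracing is skipped for synchronous writes
    trace_flag=True,   # overrides default_trace_mode for this call
    trace_timeout=timedelta(minutes=10),
)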
datahub/ingestion/graph/client.py
CHANGED

@@ -34,8 +34,10 @@ from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.rest_emitter import (
     DEFAULT_REST_SINK_ENDPOINT,
+    DEFAULT_REST_TRACE_MODE,
     DatahubRestEmitter,
     RestSinkEndpoint,
+    RestTraceMode,
 )
 from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.graph.config import (
@@ -146,6 +148,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
             openapi_ingestion=DEFAULT_REST_SINK_ENDPOINT == RestSinkEndpoint.OPENAPI,
+            default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
         )
 
         self.server_id = _MISSING_SERVER_ID
datahub/ingestion/sink/datahub_rest.py
CHANGED

@@ -21,8 +21,10 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
     DEFAULT_REST_SINK_ENDPOINT,
+    DEFAULT_REST_TRACE_MODE,
     DataHubRestEmitter,
     RestSinkEndpoint,
+    RestTraceMode,
 )
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
@@ -69,6 +71,7 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
     endpoint: RestSinkEndpoint = DEFAULT_REST_SINK_ENDPOINT
+    default_trace_mode: RestTraceMode = DEFAULT_REST_TRACE_MODE
 
     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -176,6 +179,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             client_certificate_path=config.client_certificate_path,
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
+            default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
         )
 
     @property