acryl-datahub 1.0.0rc16-py3-none-any.whl → 1.0.0rc18-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.


@@ -1,6 +1,6 @@
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=E9Ai-iBRxYw3ysQIRJYlsLsIOcjgGBzeqp6OJwZHeAg,322
+datahub/_version.py,sha256=cadwaIVSYjldq1gLBbOrAkoALM9-SUSsE4xECju3hJw,322
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -93,7 +93,7 @@ datahub/cli/specific/structuredproperties_cli.py,sha256=Rb06XJKxqda5RSUo188B90Wn
 datahub/cli/specific/user_cli.py,sha256=jGAokb1NRu8obs6P2g4OL2NQdFgpUBa9De55TBBtun0,1897
 datahub/configuration/__init__.py,sha256=5TN3a7CWNsLRHpdj-sv2bxKWF2IslvJwE6EpNMFrIS4,123
 datahub/configuration/_config_enum.py,sha256=ul2hr5gMmdLvBINicFkMNMi1ApmnmZSwNdUYYted5nk,1447
-datahub/configuration/common.py,sha256=bt_kiy2blqHbxbG-aM_8RNAZoIKMfaMzOZhtknnyLXg,10410
+datahub/configuration/common.py,sha256=GKRNgRCBsFUmZb6huD6DPNYUKNRxUbcHfFazHLte7wk,10626
 datahub/configuration/config_loader.py,sha256=hRzPFxkz-w9IqkpSa5vwCzSra1p49DyfeJNeyqGa8-4,6827
 datahub/configuration/connection_resolver.py,sha256=n4-6MwMiOEDgTouxO0SMjTILKVhJPo6-naE6FuR5qMs,1516
 datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvrkosPg,2870
@@ -124,7 +124,8 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
 datahub/emitter/mcp_builder.py,sha256=Q1bX2BthNvZ7ae71XYF6ICoiN8IOqaAd_h3zOct57Q0,11752
 datahub/emitter/mcp_patch_builder.py,sha256=u7cpW6DkiN7KpLapmMaXgL_FneoN69boxiANbVgMdSI,4564
 datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
-datahub/emitter/rest_emitter.py,sha256=yJ_QCVe4K-ILXQOhS7CiTHG5Gw2xu4H9mscAnOvfUY4,23633
+datahub/emitter/response_helper.py,sha256=lRMvzF-RPHNkN_ONl-N2uJjKh5XtRFrofrdGibVGn2U,4509
+datahub/emitter/rest_emitter.py,sha256=LyJuTZicSxzyLlwCyVzecjuyFDH0HkvQmSh037OLBc8,29777
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -169,7 +170,7 @@ datahub/ingestion/glossary/classifier.py,sha256=daLxnVv_JlfB_jBOxH5LrU_xQRndrsGo
 datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
 datahub/ingestion/glossary/datahub_classifier.py,sha256=O7wm6gQT1Jf2QSKdWjJQbS5oSzJwplXzfza26Gdq5Mg,7555
 datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/graph/client.py,sha256=DpGenZBQ5yziaDRNzKDSlMnE2GWoJe_yk2MdyU3UnLM,65551
+datahub/ingestion/graph/client.py,sha256=4F-IMnz4DwIdLPL5yXMRqqlB3uonN2OHOUIKKHByS7o,65680
 datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
 datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
 datahub/ingestion/graph/entity_versioning.py,sha256=nrcNz0Qm6kpE6oTu_mrYUQDx14KPspBTc6R9SyFUY6c,6901
@@ -188,7 +189,7 @@ datahub/ingestion/sink/blackhole.py,sha256=-jYcWo4i8q7312bCIoHrGr7nT9JdPvA7c4jvS
 datahub/ingestion/sink/console.py,sha256=TZfhA0Ec2eNCrMH7RRy2JOdUE-U-hkoIQrPm1CmKLQs,591
 datahub/ingestion/sink/datahub_kafka.py,sha256=_cjuXu5I6G0zJ2UK7hMbaKjMPZXeIwRMgm7CVeTiNtc,2578
 datahub/ingestion/sink/datahub_lite.py,sha256=7u2aWm7ENLshKHl-PkjJg6Mrw4bWs8sTfKIBz4mm8Ak,1879
-datahub/ingestion/sink/datahub_rest.py,sha256=KLUFteqGPmMvKaMbZG055uBYNyNUDkt_ziuJcjaNl1o,12781
+datahub/ingestion/sink/datahub_rest.py,sha256=4hvMDUxHMJXGgk3Iy7fcYGKixjvVd9DHD03X-F3kOg0,12976
 datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
 datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
 datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -212,10 +213,11 @@ datahub/ingestion/source/openapi_parser.py,sha256=K3Z4aLXBQX8eR7tvk9iZakOjghjezx
 datahub/ingestion/source/preset.py,sha256=bbh0ZWiAZMy2zuJDmaRY07_OuGJ9tdtKjwvIxqbY5II,3964
 datahub/ingestion/source/pulsar.py,sha256=u5F8QnCLJsht5-7XCiUTsnfhCPIpKVB_l32CgMCU-As,20187
 datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99WdvcYiA,30653
-datahub/ingestion/source/salesforce.py,sha256=d56tfYqg1rGDvMkLznmBJII55B1Zs8XTaQrrW-wHdLo,32679
+datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
 datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
 datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
 datahub/ingestion/source/superset.py,sha256=WrpCiZEC17cmFGcfUTTqUdnKASq7ZpT0ih-4xqB9qt4,30976
+datahub/ingestion/source/vertexai.py,sha256=uOtIgHwsH--hkAFqspXGoNN-jHip16s6m5lyvwi-jrg,27735
 datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
 datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -245,7 +247,7 @@ datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeu
 datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=fADrYPiQaiJYvLOrltgv8RJIV9VV2y7vjh3s0zHW6Cw,13950
 datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=kEwWhq3ch6WT4q4hcX8-fvQh28KgrNfspFwIytO3vQA,25103
 datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
-datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=UwGCngVm6GEUYvXvdquK-inNIruUoTB_pN6qsrjW5c4,26291
+datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=66c7VPPcX1Eamm14VoEZzSPngMiwsNZOeymWBBRsznk,24420
 datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py,sha256=DeT3v_Z82__8En0FcZ0kavBAWQoRvSZ5Rppm9eeDAb8,2393
 datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7cBso7ZzrWl_vO5PYSa6CpvqNx8,1554
 datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256=9_sfX8BE2vt9RjBMyq27UxCxBaSlD5o3L4gQxrwlPvA,4961
@@ -268,6 +270,7 @@ datahub/ingestion/source/cassandra/cassandra_profiling.py,sha256=DkSIryZNwLei5Pa
 datahub/ingestion/source/cassandra/cassandra_utils.py,sha256=j-LidYkaCTmGnpUVNLsax_c3z32PsQbsbHeYojygd1s,5105
 datahub/ingestion/source/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/common/data_reader.py,sha256=XbSxiRTYrk6seOz0ZjVjzSpGvP8lEjmqXrNI4cdYYmQ,1819
+datahub/ingestion/source/common/gcp_credentials_config.py,sha256=MM902YksVGv-yCmUTimBXF9YreHUDo3PW3o1Wasb2FE,2212
 datahub/ingestion/source/common/subtypes.py,sha256=LCJefUZ9o8yyhNXOy_HJefBOt93Cmn9r3m4VtCiK4iM,2643
 datahub/ingestion/source/data_lake_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/data_lake_common/config.py,sha256=qUk83B01hjuBKHvVz8SmXnVCy5eFj-2-2QLEOrAdbgk,359
@@ -444,7 +447,7 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDi
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=R3QxWtdR8T_8YV_3aqt3rJdto1gAij_mEHlSYKqdCfA,28326
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=gX9E1Z_CemAZsuTDmtvqrxY7vBL2da75j7X8Xwhaf8Y,28441
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=qz_rhRMNCXxHd23bePbb3YxhFgN7eRpV4s6g58hQ5bU,39678
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=ahea-bwpW6T0iDehGo0Qq_J7wKxPkV61aYHm8bGwDqo,6651
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=qkGgk6WdKSPThFjexXHrxUPYiVtzDk2MbGX3b281A4c,26044
@@ -461,7 +464,7 @@ datahub/ingestion/source/sql/clickhouse.py,sha256=uSRy-HKAiGFTHVLoVtGoh23X0O1lwy
 datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2qkzjH6CpTOgkx4,1443
 datahub/ingestion/source/sql/druid.py,sha256=IjGZdntb5hubkIzzT9qDRDpyfbckEg2GwRncvC5mDSs,2722
 datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
-datahub/ingestion/source/sql/hive.py,sha256=tfRgzatF4cDb3F7gNXF9zEjFOFrcI318K6yGgykW_EQ,30212
+datahub/ingestion/source/sql/hive.py,sha256=n0XCGkNkVAe-TEyXbxlefvohbmtALbWaC1a0_B9rlG8,30670
 datahub/ingestion/source/sql/hive_metastore.py,sha256=HW0zoHKarBYb8oVCy5fHvPOn-pTo25LctW_AusmH0hQ,36252
 datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
 datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
@@ -1022,9 +1025,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0rc16.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
-acryl_datahub-1.0.0rc16.dist-info/METADATA,sha256=GLR8w248XWb0HwHx4mWmrC_lVq4qTRyn2YiFAixM1qU,175337
-acryl_datahub-1.0.0rc16.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-acryl_datahub-1.0.0rc16.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
-acryl_datahub-1.0.0rc16.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-1.0.0rc16.dist-info/RECORD,,
+acryl_datahub-1.0.0rc18.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+acryl_datahub-1.0.0rc18.dist-info/METADATA,sha256=paUotX7MdMGKowqzTD7mt93-f2Hl6uB-og_wV_VbmhU,176898
+acryl_datahub-1.0.0rc18.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+acryl_datahub-1.0.0rc18.dist-info/entry_points.txt,sha256=7-eDilp0OACUtlmmZ-LF6H9MF_SWD_bWHKNG7Dvhhos,9652
+acryl_datahub-1.0.0rc18.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0rc18.dist-info/RECORD,,
@@ -101,6 +101,7 @@ tableau = datahub.ingestion.source.tableau.tableau:TableauSource
 teradata = datahub.ingestion.source.sql.teradata:TeradataSource
 trino = datahub.ingestion.source.sql.trino:TrinoSource
 unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource
+vertexai = datahub.ingestion.source.vertexai:VertexAISource
 vertica = datahub.ingestion.source.sql.vertica:VerticaSource

 [datahub.ingestion.transformer.plugins]
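The `vertexai = datahub.ingestion.source.vertexai:VertexAISource` line above is what makes the new source discoverable by name. A hedged sketch of how such an entry point resolves (not part of this diff; assumes Python 3.10+ and that the source-plugin group is named `datahub.ingestion.source.plugins`, which is not shown here):

import importlib.metadata

# Look up the newly registered source plugin by name; group name assumed.
eps = importlib.metadata.entry_points(group="datahub.ingestion.source.plugins")
vertexai_ep = next(ep for ep in eps if ep.name == "vertexai")
source_cls = vertexai_ep.load()  # -> datahub.ingestion.source.vertexai:VertexAISource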
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "1.0.0rc16"
+__version__ = "1.0.0rc18"


 def is_dev_mode() -> bool:
datahub/configuration/common.py CHANGED
@@ -198,6 +198,14 @@ class IgnorableError(MetaError):
     """An error that can be ignored."""


+class TraceTimeoutError(OperationalError):
+    """Failure to complete an API Trace within the timeout."""
+
+
+class TraceValidationError(OperationalError):
+    """Failure to complete the expected write operation."""
+
+
 @runtime_checkable
 class ExceptionWithProps(Protocol):
     def get_telemetry_props(self) -> Dict[str, Any]: ...
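Both new exceptions subclass OperationalError, so existing `except OperationalError` handlers keep working while trace-aware callers can branch more precisely. A minimal sketch (not from this diff; `emitter` and `mcp` are assumed to exist):

from datahub.configuration.common import (
    OperationalError,
    TraceTimeoutError,
    TraceValidationError,
)

try:
    # Tracing only engages on async writes (see rest_emitter.py below).
    emitter.emit_mcp(mcp, async_flag=True, trace_flag=True)
except TraceTimeoutError:
    pass  # trace polling exceeded trace_timeout; the write may still land
except TraceValidationError:
    pass  # GMS reported a failed write for at least one URN/aspect
except OperationalError:
    pass  # any other emitter failure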
datahub/emitter/response_helper.py ADDED
@@ -0,0 +1,145 @@
+import json
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Union
+
+from requests import Response
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+    MetadataChangeProposal,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TraceData:
+    trace_id: str
+    data: Dict[str, List[str]]
+
+    def __post_init__(self) -> None:
+        if not self.trace_id:
+            raise ValueError("trace_id cannot be empty")
+        if not isinstance(self.data, dict):
+            raise TypeError("data must be a dictionary")
+
+
+def _extract_trace_id(
+    response: Response, trace_header: str = "traceparent"
+) -> Optional[str]:
+    """
+    Extract trace ID from response headers.
+    Args:
+        response: HTTP response object
+        trace_header: Name of the trace header to use
+    Returns:
+        Trace ID if found and response is valid, None otherwise
+    """
+    if not 200 <= response.status_code < 300:
+        logger.debug(f"Invalid status code: {response.status_code}")
+        return None
+
+    trace_id = response.headers.get(trace_header)
+    if not trace_id:
+        logger.debug(f"Missing trace header: {trace_header}")
+        return None
+
+    return trace_id
+
+
+def extract_trace_data(
+    response: Response,
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object.
+    Args:
+        response: HTTP response object
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    Raises:
+        JSONDecodeError: If response body cannot be decoded as JSON
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    try:
+        json_data = response.json()
+        if not isinstance(json_data, list):
+            logger.debug("JSON data is not a list")
+            return None
+
+        data: Dict[str, List[str]] = {}
+
+        for item in json_data:
+            urn = item.get("urn")
+            if not urn:
+                logger.debug(f"Skipping item without URN: {item}")
+                continue
+
+            if aspects_to_trace is None:
+                aspect_names = [
+                    k for k, v in item.items() if k != "urn" and v is not None
+                ]
+            else:
+                aspect_names = [
+                    field for field in aspects_to_trace if item.get(field) is not None
+                ]
+
+            data[urn] = aspect_names
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to decode JSON response: {e}")
+        return None
+
+
+def extract_trace_data_from_mcps(
+    response: Response,
+    mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object and populate data from provided MCPs.
+    Args:
+        response: HTTP response object used only for trace_id extraction
+        mcps: List of MCP URN and aspect data
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    data: Dict[str, List[str]] = {}
+    try:
+        for mcp in mcps:
+            entity_urn = getattr(mcp, "entityUrn", None)
+            aspect_name = getattr(mcp, "aspectName", None)
+
+            if not entity_urn or not aspect_name:
+                logger.debug(f"Skipping MCP with missing URN or aspect name: {mcp}")
+                continue
+
+            if aspects_to_trace is not None and aspect_name not in aspects_to_trace:
+                continue
+
+            if entity_urn not in data:
+                data[entity_urn] = []
+
+            data[entity_urn].append(aspect_name)
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except AttributeError as e:
+        logger.error(f"Error processing MCPs: {e}")
+        return None
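A minimal sketch of what extract_trace_data expects (not part of the diff): a 2xx response carrying a W3C traceparent header and a JSON array of per-URN objects, as returned by the OpenAPI ingestion endpoint. The URN and trace ID below are illustrative only.

import json
from requests import Response

from datahub.emitter.response_helper import extract_trace_data

resp = Response()
resp.status_code = 200
resp.headers["traceparent"] = "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"
resp._content = json.dumps(
    [{"urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)", "status": {}}]
).encode("utf-8")

trace = extract_trace_data(resp)  # aspects_to_trace=None -> collect all non-null aspects
assert trace is not None
assert trace.data == {"urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)": ["status"]}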
datahub/emitter/rest_emitter.py CHANGED
@@ -4,8 +4,10 @@ import functools
 import json
 import logging
 import os
+import time
 from collections import defaultdict
 from dataclasses import dataclass
+from datetime import datetime, timedelta
 from enum import auto
 from json.decoder import JSONDecodeError
 from typing import (
@@ -35,11 +37,18 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
     OperationalError,
+    TraceTimeoutError,
+    TraceValidationError,
 )
 from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.response_helper import (
+    TraceData,
+    extract_trace_data,
+    extract_trace_data_from_mcps,
+)
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
@@ -69,6 +78,11 @@ _DEFAULT_RETRY_MAX_TIMES = int(

 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)

+TRACE_PENDING_STATUS = "PENDING"
+TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
+TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
+TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -83,6 +97,11 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )


+class RestTraceMode(ConfigEnum):
+    ENABLED = auto()
+    DISABLED = auto()
+
+
 class RestSinkEndpoint(ConfigEnum):
     RESTLI = auto()
     OPENAPI = auto()
@@ -94,6 +113,13 @@ DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
 )


+# Supported with v1.0
+DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
+    RestTraceMode,
+    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+)
+
+
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC

@@ -185,6 +211,7 @@ class DataHubRestEmitter(Closeable, Emitter):
     _token: Optional[str]
     _session: requests.Session
     _openapi_ingestion: bool
+    _default_trace_mode: bool

     def __init__(
         self,
@@ -201,6 +228,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
         openapi_ingestion: bool = False,
+        default_trace_mode: bool = False,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -214,12 +242,16 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._token = token
         self.server_config: Dict[str, Any] = {}
         self._openapi_ingestion = openapi_ingestion
+        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()

         logger.debug(
             f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
         )

+        if self._default_trace_mode:
+            logger.debug("Using API Tracing for ingestion.")
+
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "X-DataHub-Py-Cli-Version": nice_version_name(),
@@ -396,13 +428,21 @@
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
         ensure_has_system_metadata(mcp)

+        trace_data = None
+
         if self._openapi_ingestion:
             request = self._to_openapi_request(mcp, async_flag, async_default=False)
             if request:
-                self._emit_generic(request[0], payload=request[1])
+                response = self._emit_generic(request[0], payload=request[1])
+
+                if self._should_trace(async_flag, trace_flag):
+                    trace_data = extract_trace_data(response) if response else None
+
         else:
             url = f"{self._gms_server}/aspects?action=ingestProposal"
@@ -414,12 +454,25 @@

             payload = json.dumps(payload_dict)

-            self._emit_generic(url, payload)
+            response = self._emit_generic(url, payload)
+
+            if self._should_trace(async_flag, trace_flag):
+                trace_data = (
+                    extract_trace_data_from_mcps(response, [mcp]) if response else None
+                )
+
+        if trace_data:
+            self._await_status(
+                [trace_data],
+                trace_timeout,
+            )

     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
@@ -428,7 +481,7 @@
             ensure_has_system_metadata(mcp)

         if self._openapi_ingestion:
-            return self._emit_openapi_mcps(mcps, async_flag)
+            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
         else:
             return self._emit_restli_mcps(mcps, async_flag)

@@ -436,6 +489,8 @@
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
         1. Grouping MCPs by their entity URL
@@ -483,6 +538,16 @@
                 response = self._emit_generic(url, payload=_Chunk.join(chunk))
                 responses.append(response)

+        if self._should_trace(async_flag, trace_flag, async_default=True):
+            trace_data = []
+            for response in responses:
+                data = extract_trace_data(response) if response else None
+                if data is not None:
+                    trace_data.append(data)
+
+            if trace_data:
+                self._await_status(trace_data, trace_timeout)
+
         return len(responses)

     def _emit_restli_mcps(
@@ -593,6 +658,99 @@
                 "Unable to emit metadata to DataHub GMS", {"message": str(e)}
             ) from e

+    def _await_status(
+        self,
+        trace_data: List[TraceData],
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None:
+        """Verify the status of asynchronous write operations.
+        Args:
+            trace_data: List of trace data to verify
+            trace_timeout: Maximum time to wait for verification.
+        Raises:
+            TraceTimeoutError: If verification fails or times out
+            TraceValidationError: Expected write was not completed successfully
+        """
+        if trace_timeout is None:
+            raise ValueError("trace_timeout cannot be None")
+
+        try:
+            if not trace_data:
+                logger.debug("No trace data to verify")
+                return
+
+            start_time = datetime.now()
+
+            for trace in trace_data:
+                current_backoff = TRACE_INITIAL_BACKOFF
+
+                while trace.data:
+                    if datetime.now() - start_time > trace_timeout:
+                        raise TraceTimeoutError(
+                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        )
+
+                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
+                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"
+
+                    response = self._emit_generic(url, payload=trace.data)
+                    json_data = response.json()
+
+                    for urn, aspects in json_data.items():
+                        for aspect_name, aspect_status in aspects.items():
+                            if not aspect_status["success"]:
+                                error_msg = (
+                                    f"Unable to validate async write to DataHub GMS: "
+                                    f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
+                                    f"Status: {aspect_status}"
+                                )
+                                raise TraceValidationError(error_msg, aspect_status)

+                            primary_storage = aspect_status["primaryStorage"][
+                                "writeStatus"
+                            ]
+                            search_storage = aspect_status["searchStorage"][
+                                "writeStatus"
+                            ]
+
+                            # Remove resolved statuses
+                            if (
+                                primary_storage != TRACE_PENDING_STATUS
+                                and search_storage != TRACE_PENDING_STATUS
+                            ):
+                                trace.data[urn].remove(aspect_name)
+
+                        # Remove urns with all statuses resolved
+                        if not trace.data[urn]:
+                            trace.data.pop(urn)
+
+                    # Adjust backoff based on response
+                    if trace.data:
+                        # If we still have pending items, increase backoff
+                        current_backoff = min(
+                            current_backoff * TRACE_BACKOFF_FACTOR, TRACE_MAX_BACKOFF
+                        )
+                        logger.debug(
+                            f"Waiting {current_backoff} seconds before next check"
+                        )
+                        time.sleep(current_backoff)
+
+        except Exception as e:
+            logger.error(f"Error during status verification: {str(e)}")
+            raise
+
+    def _should_trace(
+        self,
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> bool:
+        resolved_trace_flag = (
+            trace_flag if trace_flag is not None else self._default_trace_mode
+        )
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+        return resolved_trace_flag and resolved_async_flag
+
     def __repr__(self) -> str:
         token_str = (
             f" with token: {self._token[:4]}**********{self._token[-4:]}"
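Taken together: _should_trace only engages tracing for asynchronous writes (async_flag, or the async-by-default OpenAPI batch path) when either the per-call trace_flag or the emitter-level default_trace_mode requests it, and _await_status then polls /openapi/v1/trace/write with exponential backoff (1s doubling up to the 300s cap) until every aspect leaves PENDING in both primary and search storage. A hedged usage sketch (server URL and URN are placeholders, not from this diff):

from datetime import timedelta

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DataHubRestEmitter
from datahub.metadata.schema_classes import StatusClass

emitter = DataHubRestEmitter(
    gms_server="http://localhost:8080",
    openapi_ingestion=True,   # tracing is wired to both request paths; OpenAPI shown here
    default_trace_mode=True,  # or leave False and opt in per call via trace_flag
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)",
    aspect=StatusClass(removed=False),
)
# Blocks until GMS confirms persistence, or raises TraceTimeoutError /
# TraceValidationError (see datahub/configuration/common.py above).
emitter.emit_mcp(mcp, async_flag=True, trace_flag=True, trace_timeout=timedelta(minutes=10))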
datahub/ingestion/graph/client.py CHANGED
@@ -34,8 +34,10 @@ from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.rest_emitter import (
     DEFAULT_REST_SINK_ENDPOINT,
+    DEFAULT_REST_TRACE_MODE,
     DatahubRestEmitter,
     RestSinkEndpoint,
+    RestTraceMode,
 )
 from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.graph.config import (
@@ -146,6 +148,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
             openapi_ingestion=DEFAULT_REST_SINK_ENDPOINT == RestSinkEndpoint.OPENAPI,
+            default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
         )

         self.server_id = _MISSING_SERVER_ID
datahub/ingestion/sink/datahub_rest.py CHANGED
@@ -21,8 +21,10 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
     DEFAULT_REST_SINK_ENDPOINT,
+    DEFAULT_REST_TRACE_MODE,
     DataHubRestEmitter,
     RestSinkEndpoint,
+    RestTraceMode,
 )
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
@@ -69,6 +71,7 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
     endpoint: RestSinkEndpoint = DEFAULT_REST_SINK_ENDPOINT
+    default_trace_mode: RestTraceMode = DEFAULT_REST_TRACE_MODE

     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -176,6 +179,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             client_certificate_path=config.client_certificate_path,
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
+            default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
         )

     @property
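At the sink level the same switch is exposed as a config field, so it can be set per recipe or globally via the DATAHUB_REST_TRACE_MODE environment variable shown in rest_emitter.py above. A sketch of the equivalent programmatic config (field names per this diff; the server URL is a placeholder):

from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

config = DatahubRestSinkConfig.parse_obj(
    {
        "server": "http://localhost:8080",
        "endpoint": "OPENAPI",            # RestSinkEndpoint
        "default_trace_mode": "ENABLED",  # RestTraceMode; defaults to DISABLED
    }
)
# The sink then constructs its emitter with
# default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED.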