acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub was flagged by the registry scanner.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
--- a/datahub/emitter/mce_builder.py
+++ b/datahub/emitter/mce_builder.py
@@ -52,7 +52,15 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     _Aspect as AspectAbstract,
 )
-from datahub.metadata.urns import DataFlowUrn, DatasetUrn, TagUrn
+from datahub.metadata.urns import (
+    ChartUrn,
+    DashboardUrn,
+    DataFlowUrn,
+    DataJobUrn,
+    DataPlatformUrn,
+    DatasetUrn,
+    TagUrn,
+)
 from datahub.utilities.urn_encoder import UrnEncoder
 
 logger = logging.getLogger(__name__)
@@ -119,7 +127,7 @@ def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
 def make_data_platform_urn(platform: str) -> str:
     if platform.startswith("urn:li:dataPlatform:"):
         return platform
-    return f"urn:li:dataPlatform:{platform}"
+    return DataPlatformUrn.create_from_id(platform).urn()
 
 
 def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
@@ -236,7 +244,7 @@ def make_user_urn(username: str) -> str:
     Makes a user urn if the input is not a user or group urn already
     """
     return (
-        f"urn:li:corpuser:{username}"
+        f"urn:li:corpuser:{UrnEncoder.encode_string(username)}"
         if not username.startswith(("urn:li:corpuser:", "urn:li:corpGroup:"))
         else username
     )
@@ -249,7 +257,7 @@ def make_group_urn(groupname: str) -> str:
     if groupname and groupname.startswith(("urn:li:corpGroup:", "urn:li:corpuser:")):
         return groupname
     else:
-        return f"urn:li:corpGroup:{groupname}"
+        return f"urn:li:corpGroup:{UrnEncoder.encode_string(groupname)}"
 
 
 def make_tag_urn(tag: str) -> str:
@@ -301,7 +309,12 @@ def make_data_flow_urn(
 
 
 def make_data_job_urn_with_flow(flow_urn: str, job_id: str) -> str:
-    return f"urn:li:dataJob:({flow_urn},{job_id})"
+    data_flow_urn = DataFlowUrn.from_string(flow_urn)
+    data_job_urn = DataJobUrn.create_from_ids(
+        data_flow_urn=data_flow_urn.urn(),
+        job_id=job_id,
+    )
+    return data_job_urn.urn()
 
 
 def make_data_process_instance_urn(dataProcessInstanceId: str) -> str:
@@ -324,10 +337,11 @@ def make_dashboard_urn(
     platform: str, name: str, platform_instance: Optional[str] = None
 ) -> str:
     # FIXME: dashboards don't currently include data platform urn prefixes.
-    if platform_instance:
-        return f"urn:li:dashboard:({platform},{platform_instance}.{name})"
-    else:
-        return f"urn:li:dashboard:({platform},{name})"
+    return DashboardUrn.create_from_ids(
+        platform=platform,
+        name=name,
+        platform_instance=platform_instance,
+    ).urn()
 
 
 def dashboard_urn_to_key(dashboard_urn: str) -> Optional[DashboardKeyClass]:
@@ -342,10 +356,11 @@ def make_chart_urn(
     platform: str, name: str, platform_instance: Optional[str] = None
 ) -> str:
     # FIXME: charts don't currently include data platform urn prefixes.
-    if platform_instance:
-        return f"urn:li:chart:({platform},{platform_instance}.{name})"
-    else:
-        return f"urn:li:chart:({platform},{name})"
+    return ChartUrn.create_from_ids(
+        platform=platform,
+        name=name,
+        platform_instance=platform_instance,
+    ).urn()
 
 
 def chart_urn_to_key(chart_urn: str) -> Optional[ChartKeyClass]:
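Note (not part of the diff): the helpers above now delegate to the typed Urn classes and run names through UrnEncoder instead of interpolating raw strings. A minimal usage sketch, assuming a standard acryl-datahub install; exact encoded output for names with reserved characters is not asserted here:

    from datahub.emitter.mce_builder import (
        make_dashboard_urn,
        make_data_platform_urn,
        make_user_urn,
    )

    # Output is unchanged for simple inputs ...
    assert make_data_platform_urn("snowflake") == "urn:li:dataPlatform:snowflake"
    assert make_user_urn("jdoe") == "urn:li:corpuser:jdoe"
    # ... while names containing URN-reserved characters (e.g. commas) are now encoded.
    print(make_user_urn("doe,jane"))
    print(make_dashboard_urn("looker", "sales_overview", platform_instance="prod"))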
--- a/datahub/emitter/mcp_builder.py
+++ b/datahub/emitter/mcp_builder.py
@@ -36,7 +36,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
     TagAssociationClass,
 )
-from datahub.metadata.urns import StructuredPropertyUrn
+from datahub.metadata.urns import ContainerUrn, StructuredPropertyUrn
 
 # In https://github.com/datahub-project/datahub/pull/11214, we added a
 # new env field to container properties. However, populating this field
@@ -87,6 +87,9 @@ class ContainerKey(DatahubKey):
     def property_dict(self) -> Dict[str, str]:
         return self.dict(by_alias=True, exclude_none=True)
 
+    def as_urn_typed(self) -> ContainerUrn:
+        return ContainerUrn.from_string(self.as_urn())
+
     def as_urn(self) -> str:
         return make_container_urn(guid=self.guid())
 
--- /dev/null
+++ b/datahub/emitter/response_helper.py
@@ -0,0 +1,145 @@
+import json
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Union
+
+from requests import Response
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+    MetadataChangeProposal,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TraceData:
+    trace_id: str
+    data: Dict[str, List[str]]
+
+    def __post_init__(self) -> None:
+        if not self.trace_id:
+            raise ValueError("trace_id cannot be empty")
+        if not isinstance(self.data, dict):
+            raise TypeError("data must be a dictionary")
+
+
+def _extract_trace_id(
+    response: Response, trace_header: str = "traceparent"
+) -> Optional[str]:
+    """
+    Extract trace ID from response headers.
+    Args:
+        response: HTTP response object
+        trace_header: Name of the trace header to use
+    Returns:
+        Trace ID if found and response is valid, None otherwise
+    """
+    if not 200 <= response.status_code < 300:
+        logger.debug(f"Invalid status code: {response.status_code}")
+        return None
+
+    trace_id = response.headers.get(trace_header)
+    if not trace_id:
+        logger.debug(f"Missing trace header: {trace_header}")
+        return None
+
+    return trace_id
+
+
+def extract_trace_data(
+    response: Response,
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object.
+    Args:
+        response: HTTP response object
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    Raises:
+        JSONDecodeError: If response body cannot be decoded as JSON
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    try:
+        json_data = response.json()
+        if not isinstance(json_data, list):
+            logger.debug("JSON data is not a list")
+            return None
+
+        data: Dict[str, List[str]] = {}
+
+        for item in json_data:
+            urn = item.get("urn")
+            if not urn:
+                logger.debug(f"Skipping item without URN: {item}")
+                continue
+
+            if aspects_to_trace is None:
+                aspect_names = [
+                    k for k, v in item.items() if k != "urn" and v is not None
+                ]
+            else:
+                aspect_names = [
+                    field for field in aspects_to_trace if item.get(field) is not None
+                ]
+
+            data[urn] = aspect_names
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to decode JSON response: {e}")
+        return None
+
+
+def extract_trace_data_from_mcps(
+    response: Response,
+    mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+    aspects_to_trace: Optional[List[str]] = None,
+    trace_header: str = "traceparent",
+) -> Optional[TraceData]:
+    """
+    Extract trace data from a response object and populate data from provided MCPs.
+    Args:
+        response: HTTP response object used only for trace_id extraction
+        mcps: List of MCP URN and aspect data
+        aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
+        trace_header: Name of the trace header to use (default: "traceparent")
+    Returns:
+        TraceData object if successful, None otherwise
+    """
+    trace_id = _extract_trace_id(response, trace_header)
+    if not trace_id:
+        return None
+
+    data: Dict[str, List[str]] = {}
+    try:
+        for mcp in mcps:
+            entity_urn = getattr(mcp, "entityUrn", None)
+            aspect_name = getattr(mcp, "aspectName", None)
+
+            if not entity_urn or not aspect_name:
+                logger.debug(f"Skipping MCP with missing URN or aspect name: {mcp}")
+                continue
+
+            if aspects_to_trace is not None and aspect_name not in aspects_to_trace:
+                continue
+
+            if entity_urn not in data:
+                data[entity_urn] = []
+
+            data[entity_urn].append(aspect_name)
+
+        return TraceData(trace_id=trace_id, data=data)
+
+    except AttributeError as e:
+        logger.error(f"Error processing MCPs: {e}")
+        return None
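Note (not part of the diff): a minimal sketch of how this helper is used, fabricating a response that carries a W3C traceparent header; the dataset URN and aspect are illustrative:

    import requests

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.response_helper import extract_trace_data_from_mcps
    from datahub.metadata.schema_classes import StatusClass

    # Fabricated response: only the status code and the trace header matter here.
    resp = requests.Response()
    resp.status_code = 200
    resp.headers["traceparent"] = "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
        aspect=StatusClass(removed=False),
    )
    trace = extract_trace_data_from_mcps(resp, [mcp])
    # trace.data maps each entity URN to the aspect names awaiting confirmation,
    # e.g. {"urn:li:dataset:(...)": ["status"]}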
--- a/datahub/emitter/rest_emitter.py
+++ b/datahub/emitter/rest_emitter.py
@@ -4,6 +4,11 @@ import functools
 import json
 import logging
 import os
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import auto
 from json.decoder import JSONDecodeError
 from typing import (
     TYPE_CHECKING,
@@ -17,6 +22,7 @@ from typing import (
     Union,
 )
 
+import pydantic
 import requests
 from deprecated import deprecated
 from requests.adapters import HTTPAdapter, Retry
@@ -27,13 +33,22 @@ from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
+    ConfigEnum,
     ConfigModel,
     ConfigurationError,
     OperationalError,
+    TraceTimeoutError,
+    TraceValidationError,
 )
+from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.response_helper import (
+    TraceData,
+    extract_trace_data,
+    extract_trace_data_from_mcps,
+)
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
@@ -63,6 +78,11 @@ _DEFAULT_RETRY_MAX_TIMES = int(
 
 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
 
+TRACE_PENDING_STATUS = "PENDING"
+TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
+TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
+TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -77,6 +97,29 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RestTraceMode(ConfigEnum):
+    ENABLED = auto()
+    DISABLED = auto()
+
+
+class RestSinkEndpoint(ConfigEnum):
+    RESTLI = auto()
+    OPENAPI = auto()
+
+
+DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
+    RestSinkEndpoint,
+    os.getenv("DATAHUB_REST_SINK_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
+)
+
+
+# Supported with v1.0
+DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
+    RestTraceMode,
+    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+)
+
+
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
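Note (not part of the diff): these module-level defaults are resolved from the environment at import time, so overrides have to be exported before the module is imported. A sketch, assuming ConfigEnum accepts the upper-case member names as it does elsewhere in the codebase:

    import os

    # Override the defaults read by rest_emitter at import time.
    os.environ["DATAHUB_REST_SINK_DEFAULT_ENDPOINT"] = "OPENAPI"
    os.environ["DATAHUB_REST_TRACE_MODE"] = "ENABLED"

    from datahub.emitter.rest_emitter import (
        DEFAULT_REST_SINK_ENDPOINT,
        DEFAULT_REST_TRACE_MODE,
    )

    print(DEFAULT_REST_SINK_ENDPOINT, DEFAULT_REST_TRACE_MODE)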
 
@@ -143,10 +186,32 @@ class RequestsSessionConfig(ConfigModel):
         return session
 
 
+@dataclass
+class _Chunk:
+    items: List[str]
+    total_bytes: int = 0
+
+    def add_item(self, item: str) -> bool:
+        item_bytes = len(item.encode())
+        if not self.items:  # Always add at least one item even if over byte limit
+            self.items.append(item)
+            self.total_bytes += item_bytes
+            return True
+        self.items.append(item)
+        self.total_bytes += item_bytes
+        return True
+
+    @staticmethod
+    def join(chunk: "_Chunk") -> str:
+        return "[" + ",".join(chunk.items) + "]"
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
+    _openapi_ingestion: bool
+    _default_trace_mode: bool
 
     def __init__(
         self,
@@ -162,6 +227,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
+        openapi_ingestion: bool = False,
+        default_trace_mode: bool = False,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -174,9 +241,17 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
         self.server_config: Dict[str, Any] = {}
-
+        self._openapi_ingestion = openapi_ingestion
+        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
 
+        logger.debug(
+            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        )
+
+        if self._default_trace_mode:
+            logger.debug("Using API Tracing for ingestion.")
+
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "X-DataHub-Py-Cli-Version": nice_version_name(),
@@ -264,6 +339,43 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         return DataHubGraph.from_emitter(self)
 
+    def _to_openapi_request(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
+        if mcp.aspect and mcp.aspectName:
+            resolved_async_flag = (
+                async_flag if async_flag is not None else async_default
+            )
+            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+
+            if isinstance(mcp, MetadataChangeProposalWrapper):
+                aspect_value = pre_json_transform(
+                    mcp.to_obj(simplified_structure=True)
+                )["aspect"]["json"]
+            else:
+                obj = mcp.aspect.to_obj()
+                if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
+                    obj = json.loads(obj["value"])
+                aspect_value = pre_json_transform(obj)
+            return (
+                url,
+                [
+                    {
+                        "urn": mcp.entityUrn,
+                        mcp.aspectName: {
+                            "value": aspect_value,
+                            "systemMetadata": mcp.systemMetadata.to_obj()
+                            if mcp.systemMetadata
+                            else None,
+                        },
+                    }
+                ],
+            )
+        return None
+
     def emit(
         self,
         item: Union[
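Note (not part of the diff): roughly, the request that _to_openapi_request builds for a single dataset status aspect looks like the sketch below; the host, URN, and aspect values are illustrative:

    url = "http://localhost:8080/openapi/v3/entity/dataset?async=false"
    payload = [
        {
            "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
            "status": {
                "value": {"removed": False},
                "systemMetadata": {},  # filled in by ensure_has_system_metadata()
            },
        }
    ]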
@@ -316,31 +428,135 @@ class DataHubRestEmitter(Closeable, Emitter):
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
-        url = f"{self._gms_server}/aspects?action=ingestProposal"
         ensure_has_system_metadata(mcp)
 
-        mcp_obj = pre_json_transform(mcp.to_obj())
-        payload_dict = {"proposal": mcp_obj}
+        trace_data = None
 
-        if async_flag is not None:
-            payload_dict["async"] = "true" if async_flag else "false"
+        if self._openapi_ingestion:
+            request = self._to_openapi_request(mcp, async_flag, async_default=False)
+            if request:
+                response = self._emit_generic(request[0], payload=request[1])
 
-        payload = json.dumps(payload_dict)
+                if self._should_trace(async_flag, trace_flag):
+                    trace_data = extract_trace_data(response) if response else None
 
-        self._emit_generic(url, payload)
+        else:
+            url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+            mcp_obj = pre_json_transform(mcp.to_obj())
+            payload_dict = {"proposal": mcp_obj}
+
+            if async_flag is not None:
+                payload_dict["async"] = "true" if async_flag else "false"
+
+            payload = json.dumps(payload_dict)
+
+            response = self._emit_generic(url, payload)
+
+            if self._should_trace(async_flag, trace_flag):
+                trace_data = (
+                    extract_trace_data_from_mcps(response, [mcp]) if response else None
+                )
+
+        if trace_data:
+            self._await_status(
+                [trace_data],
+                trace_timeout,
+            )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
-        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
 
+        if self._openapi_ingestion:
+            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
+        else:
+            return self._emit_restli_mcps(mcps, async_flag)
+
+    def _emit_openapi_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> int:
+        """
+        1. Grouping MCPs by their entity URL
+        2. Breaking down large batches into smaller chunks based on both:
+           * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
+           * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
+
+        The Chunk class encapsulates both the items and their byte size tracking
+        Serializing the items only once with json.dumps(request[1]) and reusing that
+        The chunking logic handles edge cases (always accepting at least one item per chunk)
+        The joining logic is efficient with a simple string concatenation
+
+        :param mcps: metadata change proposals to transmit
+        :param async_flag: the mode
+        :return: number of requests
+        """
+        # group by entity url
+        batches: Dict[str, List[_Chunk]] = defaultdict(
+            lambda: [_Chunk(items=[])]
+        )  # Initialize with one empty Chunk
+
+        for mcp in mcps:
+            request = self._to_openapi_request(mcp, async_flag, async_default=True)
+            if request:
+                current_chunk = batches[request[0]][-1]  # Get the last chunk
+                # Only serialize once
+                serialized_item = json.dumps(request[1][0])
+                item_bytes = len(serialized_item.encode())
+
+                # If adding this item would exceed max_bytes, create a new chunk
+                # Unless the chunk is empty (always add at least one item)
+                if current_chunk.items and (
+                    current_chunk.total_bytes + item_bytes > INGEST_MAX_PAYLOAD_BYTES
+                    or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+                ):
+                    new_chunk = _Chunk(items=[])
+                    batches[request[0]].append(new_chunk)
+                    current_chunk = new_chunk
+
+                current_chunk.add_item(serialized_item)
+
+        responses = []
+        for url, chunks in batches.items():
+            for chunk in chunks:
+                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                responses.append(response)
+
+        if self._should_trace(async_flag, trace_flag, async_default=True):
+            trace_data = []
+            for response in responses:
+                data = extract_trace_data(response) if response else None
+                if data is not None:
+                    trace_data.append(data)
+
+            if trace_data:
+                self._await_status(trace_data, trace_timeout)
+
+        return len(responses)
+
+    def _emit_restli_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+    ) -> int:
+        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
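Note (not part of the diff): a sketch of a batch emit over the OpenAPI path with trace verification enabled; the URN and aspect are illustrative:

    from datetime import timedelta

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import DataHubRestEmitter
    from datahub.metadata.schema_classes import StatusClass

    emitter = DataHubRestEmitter("http://localhost:8080", openapi_ingestion=True)
    mcps = [
        MetadataChangeProposalWrapper(
            entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
            aspect=StatusClass(removed=False),
        )
    ]
    # async_flag=True plus trace_flag=True: emit asynchronously, then block until
    # GMS reports the writes as persisted (or trace_timeout elapses).
    n_requests = emitter.emit_mcps(
        mcps,
        async_flag=True,
        trace_flag=True,
        trace_timeout=timedelta(minutes=10),
    )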
@@ -392,7 +608,10 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: str) -> None:
+    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+        if not isinstance(payload, str):
+            payload = json.dumps(payload)
+
         curl_command = make_curl_command(self._session, "POST", url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
@@ -408,6 +627,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         try:
             response = self._session.post(url, data=payload)
             response.raise_for_status()
+            return response
         except HTTPError as e:
             try:
                 info: Dict = response.json()
@@ -438,6 +658,99 @@ class DataHubRestEmitter(Closeable, Emitter):
                 "Unable to emit metadata to DataHub GMS", {"message": str(e)}
             ) from e
 
+    def _await_status(
+        self,
+        trace_data: List[TraceData],
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None:
+        """Verify the status of asynchronous write operations.
+        Args:
+            trace_data: List of trace data to verify
+            trace_timeout: Maximum time to wait for verification.
+        Raises:
+            TraceTimeoutError: If verification fails or times out
+            TraceValidationError: Expected write was not completed successfully
+        """
+        if trace_timeout is None:
+            raise ValueError("trace_timeout cannot be None")
+
+        try:
+            if not trace_data:
+                logger.debug("No trace data to verify")
+                return
+
+            start_time = datetime.now()
+
+            for trace in trace_data:
+                current_backoff = TRACE_INITIAL_BACKOFF
+
+                while trace.data:
+                    if datetime.now() - start_time > trace_timeout:
+                        raise TraceTimeoutError(
+                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        )
+
+                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
+                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"
+
+                    response = self._emit_generic(url, payload=trace.data)
+                    json_data = response.json()
+
+                    for urn, aspects in json_data.items():
+                        for aspect_name, aspect_status in aspects.items():
+                            if not aspect_status["success"]:
+                                error_msg = (
+                                    f"Unable to validate async write to DataHub GMS: "
+                                    f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
+                                    f"Status: {aspect_status}"
+                                )
+                                raise TraceValidationError(error_msg, aspect_status)
+
+                            primary_storage = aspect_status["primaryStorage"][
+                                "writeStatus"
+                            ]
+                            search_storage = aspect_status["searchStorage"][
+                                "writeStatus"
+                            ]
+
+                            # Remove resolved statuses
+                            if (
+                                primary_storage != TRACE_PENDING_STATUS
+                                and search_storage != TRACE_PENDING_STATUS
+                            ):
+                                trace.data[urn].remove(aspect_name)
+
+                        # Remove urns with all statuses resolved
+                        if not trace.data[urn]:
+                            trace.data.pop(urn)
+
+                    # Adjust backoff based on response
+                    if trace.data:
+                        # If we still have pending items, increase backoff
+                        current_backoff = min(
+                            current_backoff * TRACE_BACKOFF_FACTOR, TRACE_MAX_BACKOFF
+                        )
+                        logger.debug(
+                            f"Waiting {current_backoff} seconds before next check"
+                        )
+                        time.sleep(current_backoff)
+
+        except Exception as e:
+            logger.error(f"Error during status verification: {str(e)}")
+            raise
+
+    def _should_trace(
+        self,
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> bool:
+        resolved_trace_flag = (
+            trace_flag if trace_flag is not None else self._default_trace_mode
+        )
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+        return resolved_trace_flag and resolved_async_flag
+
     def __repr__(self) -> str:
         token_str = (
             f" with token: {self._token[:4]}**********{self._token[-4:]}"