acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py

@@ -4,6 +4,11 @@ import functools
 import json
 import logging
 import os
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import auto
 from json.decoder import JSONDecodeError
 from typing import (
     TYPE_CHECKING,
@@ -17,6 +22,7 @@ from typing import (
     Union,
 )
 
+import pydantic
 import requests
 from deprecated import deprecated
 from requests.adapters import HTTPAdapter, Retry
@@ -27,13 +33,22 @@ from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
+    ConfigEnum,
     ConfigModel,
     ConfigurationError,
     OperationalError,
+    TraceTimeoutError,
+    TraceValidationError,
 )
+from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.response_helper import (
+    TraceData,
+    extract_trace_data,
+    extract_trace_data_from_mcps,
+)
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
@@ -63,6 +78,11 @@ _DEFAULT_RETRY_MAX_TIMES = int(
 
 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
 
+TRACE_PENDING_STATUS = "PENDING"
+TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
+TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
+TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -77,6 +97,29 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+class RestTraceMode(ConfigEnum):
+    ENABLED = auto()
+    DISABLED = auto()
+
+
+class RestSinkEndpoint(ConfigEnum):
+    RESTLI = auto()
+    OPENAPI = auto()
+
+
+DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
+    RestSinkEndpoint,
+    os.getenv("DATAHUB_REST_SINK_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
+)
+
+
+# Supported with v1.0
+DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
+    RestTraceMode,
+    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
+)
+
+
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
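
Note: the two new defaults above are resolved from environment variables at import time. A minimal sketch of how a deployment might opt into the OpenAPI endpoint and API tracing, assuming (as the RESTLI/DISABLED defaults above imply) that ConfigEnum accepts the member name as a string:

    import os

    # Illustrative only: the variable names come from the diff above; the string
    # values are assumed to parse into the corresponding ConfigEnum members.
    os.environ["DATAHUB_REST_SINK_DEFAULT_ENDPOINT"] = "OPENAPI"  # default: RESTLI
    os.environ["DATAHUB_REST_TRACE_MODE"] = "ENABLED"             # default: DISABLED

    # Defaults are computed when the module is imported, so set the variables first.
    from datahub.emitter import rest_emitter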
@@ -143,10 +186,32 @@ class RequestsSessionConfig(ConfigModel):
         return session
 
 
+@dataclass
+class _Chunk:
+    items: List[str]
+    total_bytes: int = 0
+
+    def add_item(self, item: str) -> bool:
+        item_bytes = len(item.encode())
+        if not self.items:  # Always add at least one item even if over byte limit
+            self.items.append(item)
+            self.total_bytes += item_bytes
+            return True
+        self.items.append(item)
+        self.total_bytes += item_bytes
+        return True
+
+    @staticmethod
+    def join(chunk: "_Chunk") -> str:
+        return "[" + ",".join(chunk.items) + "]"
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
+    _openapi_ingestion: bool
+    _default_trace_mode: bool
 
     def __init__(
         self,
@@ -162,6 +227,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
+        openapi_ingestion: bool = False,
+        default_trace_mode: bool = False,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -174,9 +241,17 @@ class DataHubRestEmitter(Closeable, Emitter):
         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
         self.server_config: Dict[str, Any] = {}
-
+        self._openapi_ingestion = openapi_ingestion
+        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
 
+        logger.debug(
+            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        )
+
+        if self._default_trace_mode:
+            logger.debug("Using API Tracing for ingestion.")
+
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "X-DataHub-Py-Cli-Version": nice_version_name(),
@@ -264,6 +339,43 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         return DataHubGraph.from_emitter(self)
 
+    def _to_openapi_request(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
+        if mcp.aspect and mcp.aspectName:
+            resolved_async_flag = (
+                async_flag if async_flag is not None else async_default
+            )
+            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+
+            if isinstance(mcp, MetadataChangeProposalWrapper):
+                aspect_value = pre_json_transform(
+                    mcp.to_obj(simplified_structure=True)
+                )["aspect"]["json"]
+            else:
+                obj = mcp.aspect.to_obj()
+                if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
+                    obj = json.loads(obj["value"])
+                aspect_value = pre_json_transform(obj)
+            return (
+                url,
+                [
+                    {
+                        "urn": mcp.entityUrn,
+                        mcp.aspectName: {
+                            "value": aspect_value,
+                            "systemMetadata": mcp.systemMetadata.to_obj()
+                            if mcp.systemMetadata
+                            else None,
+                        },
+                    }
+                ],
+            )
+        return None
+
     def emit(
         self,
         item: Union[
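
For a single MCP, _to_openapi_request pairs a per-entity-type URL with a one-element list payload keyed by the aspect name. A hedged illustration of that tuple (the URL pattern and field names come from the code above; the URN and aspect values are made up):

    # Hypothetical (url, payload) pair for a dataset aspect.
    url = "http://localhost:8080/openapi/v3/entity/dataset?async=false"
    payload = [
        {
            "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
            "datasetProperties": {  # keyed by mcp.aspectName
                "value": {"description": "example"},  # the pre_json_transform-ed aspect
                "systemMetadata": None,
            },
        }
    ]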
@@ -316,31 +428,135 @@ class DataHubRestEmitter(Closeable, Emitter):
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
-        url = f"{self._gms_server}/aspects?action=ingestProposal"
         ensure_has_system_metadata(mcp)
 
-        mcp_obj = pre_json_transform(mcp.to_obj())
-        payload_dict = {"proposal": mcp_obj}
+        trace_data = None
 
-        if async_flag is not None:
-            payload_dict["async"] = "true" if async_flag else "false"
+        if self._openapi_ingestion:
+            request = self._to_openapi_request(mcp, async_flag, async_default=False)
+            if request:
+                response = self._emit_generic(request[0], payload=request[1])
 
-        payload = json.dumps(payload_dict)
+                if self._should_trace(async_flag, trace_flag):
+                    trace_data = extract_trace_data(response) if response else None
 
-        self._emit_generic(url, payload)
+        else:
+            url = f"{self._gms_server}/aspects?action=ingestProposal"
+
+            mcp_obj = pre_json_transform(mcp.to_obj())
+            payload_dict = {"proposal": mcp_obj}
+
+            if async_flag is not None:
+                payload_dict["async"] = "true" if async_flag else "false"
+
+            payload = json.dumps(payload_dict)
+
+            response = self._emit_generic(url, payload)
+
+            if self._should_trace(async_flag, trace_flag):
+                trace_data = (
+                    extract_trace_data_from_mcps(response, [mcp]) if response else None
+                )
+
+        if trace_data:
+            self._await_status(
+                [trace_data],
+                trace_timeout,
+            )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
-        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
 
+        if self._openapi_ingestion:
+            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
+        else:
+            return self._emit_restli_mcps(mcps, async_flag)
+
+    def _emit_openapi_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> int:
+        """
+        1. Grouping MCPs by their entity URL
+        2. Breaking down large batches into smaller chunks based on both:
+           * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
+           * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
+
+        The Chunk class encapsulates both the items and their byte size tracking
+        Serializing the items only once with json.dumps(request[1]) and reusing that
+        The chunking logic handles edge cases (always accepting at least one item per chunk)
+        The joining logic is efficient with a simple string concatenation
+
+        :param mcps: metadata change proposals to transmit
+        :param async_flag: the mode
+        :return: number of requests
+        """
+        # group by entity url
+        batches: Dict[str, List[_Chunk]] = defaultdict(
+            lambda: [_Chunk(items=[])]
+        )  # Initialize with one empty Chunk
+
+        for mcp in mcps:
+            request = self._to_openapi_request(mcp, async_flag, async_default=True)
+            if request:
+                current_chunk = batches[request[0]][-1]  # Get the last chunk
+                # Only serialize once
+                serialized_item = json.dumps(request[1][0])
+                item_bytes = len(serialized_item.encode())
+
+                # If adding this item would exceed max_bytes, create a new chunk
+                # Unless the chunk is empty (always add at least one item)
+                if current_chunk.items and (
+                    current_chunk.total_bytes + item_bytes > INGEST_MAX_PAYLOAD_BYTES
+                    or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+                ):
+                    new_chunk = _Chunk(items=[])
+                    batches[request[0]].append(new_chunk)
+                    current_chunk = new_chunk
+
+                current_chunk.add_item(serialized_item)
+
+        responses = []
+        for url, chunks in batches.items():
+            for chunk in chunks:
+                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                responses.append(response)
+
+        if self._should_trace(async_flag, trace_flag, async_default=True):
+            trace_data = []
+            for response in responses:
+                data = extract_trace_data(response) if response else None
+                if data is not None:
+                    trace_data.append(data)
+
+            if trace_data:
+                self._await_status(trace_data, trace_timeout)
+
+        return len(responses)
+
+    def _emit_restli_mcps(
+        self,
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
+        async_flag: Optional[bool] = None,
+    ) -> int:
+        url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
+
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
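
Putting the new knobs together, a minimal usage sketch (the constructor and emit_mcps parameters come from this diff; the URN and aspect are illustrative):

    from datetime import timedelta

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import DataHubRestEmitter
    from datahub.metadata.schema_classes import StatusClass

    emitter = DataHubRestEmitter(
        gms_server="http://localhost:8080",
        openapi_ingestion=True,   # route writes through /openapi/v3/entity/...
        default_trace_mode=True,  # trace async writes unless trace_flag overrides it
    )

    mcps = [
        MetadataChangeProposalWrapper(
            entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
            aspect=StatusClass(removed=False),
        )
    ]

    # Batched OpenAPI ingestion; with async_flag=True the emitter polls the trace
    # endpoint until each write lands in both primary and search storage.
    emitter.emit_mcps(
        mcps,
        async_flag=True,
        trace_flag=True,
        trace_timeout=timedelta(minutes=10),
    )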
@@ -392,7 +608,10 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: str) -> None:
+    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+        if not isinstance(payload, str):
+            payload = json.dumps(payload)
+
         curl_command = make_curl_command(self._session, "POST", url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
@@ -408,6 +627,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         try:
             response = self._session.post(url, data=payload)
             response.raise_for_status()
+            return response
         except HTTPError as e:
             try:
                 info: Dict = response.json()
@@ -438,6 +658,99 @@ class DataHubRestEmitter(Closeable, Emitter):
                 "Unable to emit metadata to DataHub GMS", {"message": str(e)}
             ) from e
 
+    def _await_status(
+        self,
+        trace_data: List[TraceData],
+        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None:
+        """Verify the status of asynchronous write operations.
+        Args:
+            trace_data: List of trace data to verify
+            trace_timeout: Maximum time to wait for verification.
+        Raises:
+            TraceTimeoutError: If verification fails or times out
+            TraceValidationError: Expected write was not completed successfully
+        """
+        if trace_timeout is None:
+            raise ValueError("trace_timeout cannot be None")
+
+        try:
+            if not trace_data:
+                logger.debug("No trace data to verify")
+                return
+
+            start_time = datetime.now()
+
+            for trace in trace_data:
+                current_backoff = TRACE_INITIAL_BACKOFF
+
+                while trace.data:
+                    if datetime.now() - start_time > trace_timeout:
+                        raise TraceTimeoutError(
+                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
+                        )
+
+                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
+                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"
+
+                    response = self._emit_generic(url, payload=trace.data)
+                    json_data = response.json()
+
+                    for urn, aspects in json_data.items():
+                        for aspect_name, aspect_status in aspects.items():
+                            if not aspect_status["success"]:
+                                error_msg = (
+                                    f"Unable to validate async write to DataHub GMS: "
+                                    f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
+                                    f"Status: {aspect_status}"
+                                )
+                                raise TraceValidationError(error_msg, aspect_status)
+
+                            primary_storage = aspect_status["primaryStorage"][
+                                "writeStatus"
+                            ]
+                            search_storage = aspect_status["searchStorage"][
+                                "writeStatus"
+                            ]
+
+                            # Remove resolved statuses
+                            if (
+                                primary_storage != TRACE_PENDING_STATUS
+                                and search_storage != TRACE_PENDING_STATUS
+                            ):
+                                trace.data[urn].remove(aspect_name)
+
+                        # Remove urns with all statuses resolved
+                        if not trace.data[urn]:
+                            trace.data.pop(urn)
+
+                    # Adjust backoff based on response
+                    if trace.data:
+                        # If we still have pending items, increase backoff
+                        current_backoff = min(
+                            current_backoff * TRACE_BACKOFF_FACTOR, TRACE_MAX_BACKOFF
+                        )
+                        logger.debug(
+                            f"Waiting {current_backoff} seconds before next check"
+                        )
+                        time.sleep(current_backoff)
+
+        except Exception as e:
+            logger.error(f"Error during status verification: {str(e)}")
+            raise
+
+    def _should_trace(
+        self,
+        async_flag: Optional[bool] = None,
+        trace_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> bool:
+        resolved_trace_flag = (
+            trace_flag if trace_flag is not None else self._default_trace_mode
+        )
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+        return resolved_trace_flag and resolved_async_flag
+
     def __repr__(self) -> str:
         token_str = (
             f" with token: {self._token[:4]}**********{self._token[-4:]}"
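
_await_status polls /openapi/v1/trace/write/{trace_id} and walks a nested urn-to-aspect status map. Judging only from the keys the loop reads, a response is shaped roughly like this (a sketch with illustrative values; any writeStatus other than "PENDING" counts as resolved):

    # Assumed shape of the trace status response that _await_status iterates over.
    trace_status = {
        "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)": {
            "status": {  # aspect name
                "success": True,
                "primaryStorage": {"writeStatus": "COMPLETED"},  # hypothetical value
                "searchStorage": {"writeStatus": "PENDING"},  # still pending, keep polling
            }
        }
    }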
datahub/ingestion/api/decorators.py

@@ -3,7 +3,7 @@ from enum import Enum, auto
 from typing import Callable, Dict, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.api.source import (  # noqa: I250
+from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )

datahub/ingestion/api/source_helpers.py

@@ -250,6 +250,10 @@ def auto_browse_path_v2(
     emitted_urns: Set[str] = set()
     containers_used_as_parent: Set[str] = set()
     for urn, batch in _batch_workunits_by_urn(stream):
+        # Do not generate browse path v2 for entities that do not support it
+        if not entity_supports_aspect(guess_entity_type(urn), BrowsePathsV2Class):
+            yield from batch
+            continue
         container_path: Optional[List[BrowsePathEntryClass]] = None
         legacy_path: Optional[List[BrowsePathEntryClass]] = None
         browse_path_v2: Optional[List[BrowsePathEntryClass]] = None

datahub/ingestion/fs/s3_fs.py

@@ -48,12 +48,12 @@ class S3ListIterator(Iterator):
     def __next__(self) -> FileInfo:
         try:
             return next(self._file_statuses)
-        except StopIteration:
+        except StopIteration as e:
             if self._token:
                 self.fetch()
                 return next(self._file_statuses)
             else:
-                raise StopIteration()
+                raise e
 
     def fetch(self):
         params = dict(Bucket=self._bucket, Prefix=self._prefix, MaxKeys=self._max_keys)

datahub/ingestion/glossary/classification_mixin.py

@@ -279,11 +279,7 @@ class ClassificationHandler:
                     "Dataset_Name": dataset_name,
                 }
             ),
-            values=(
-                sample_data[schema_field.fieldPath]
-                if schema_field.fieldPath in sample_data.keys()
-                else []
-            ),
+            values=sample_data.get(schema_field.fieldPath, []),
         )
     )