acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,6 @@ import json
  import logging
  import os
  import time
- import warnings
  from collections import defaultdict
  from dataclasses import dataclass
  from datetime import datetime, timedelta
@@ -41,23 +40,26 @@ from datahub.configuration.common import (
      TraceTimeoutError,
      TraceValidationError,
  )
- from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
  from datahub.emitter.generic_emitter import Emitter
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
- from datahub.emitter.request_helper import make_curl_command
+ from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
  from datahub.emitter.response_helper import (
      TraceData,
      extract_trace_data,
      extract_trace_data_from_mcps,
  )
  from datahub.emitter.serialization_helper import pre_json_transform
- from datahub.errors import APITracingWarning
  from datahub.ingestion.api.closeable import Closeable
+ from datahub.ingestion.graph.config import (
+     DATAHUB_COMPONENT_ENV,
+     ClientMode,
+ )
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
      MetadataChangeEvent,
      MetadataChangeProposal,
  )
  from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
+ from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature

  if TYPE_CHECKING:
      from datahub.ingestion.graph.client import DataHubGraph
@@ -80,6 +82,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(

  _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)

+ _DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
+
  TRACE_PENDING_STATUS = "PENDING"
  TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
  TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
@@ -134,12 +138,24 @@ class RequestsSessionConfig(ConfigModel):
      ca_certificate_path: Optional[str] = None
      client_certificate_path: Optional[str] = None
      disable_ssl_verification: bool = False
+     client_mode: Optional[ClientMode] = _DEFAULT_CLIENT_MODE
+     datahub_component: Optional[str] = None

      def build_session(self) -> requests.Session:
          session = requests.Session()

-         if self.extra_headers:
-             session.headers.update(self.extra_headers)
+         user_agent = self._get_user_agent_string(session)
+
+         base_headers = {
+             "User-Agent": user_agent,
+             "X-DataHub-Client-Mode": self.client_mode.name
+             if self.client_mode
+             else _DEFAULT_CLIENT_MODE.name,
+             "X-DataHub-Py-Cli-Version": nice_version_name(),
+         }
+
+         headers = {**base_headers, **self.extra_headers}
+         session.headers.update(headers)

          if self.client_certificate_path:
              session.cert = self.client_certificate_path
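The net effect of this change is that every session built by RequestsSessionConfig now carries a User-Agent, X-DataHub-Client-Mode, and X-DataHub-Py-Cli-Version header, with extra_headers still merged on top (and able to override the defaults). A minimal usage sketch, assuming the remaining RequestsSessionConfig fields keep their defaults and that "managed-ingestion" is just an illustrative component name:

    from datahub.emitter.rest_emitter import RequestsSessionConfig
    from datahub.ingestion.graph.config import ClientMode

    config = RequestsSessionConfig(
        extra_headers={"X-Custom": "value"},      # merged last, so it can override the defaults
        client_mode=ClientMode.INGESTION,         # becomes the X-DataHub-Client-Mode header
        datahub_component="managed-ingestion",    # illustrative; defaults to DATAHUB_COMPONENT_ENV
    )
    session = config.build_session()
    # session.headers now contains User-Agent, X-DataHub-Client-Mode ("INGESTION"),
    # X-DataHub-Py-Cli-Version, and X-Custom.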
@@ -187,6 +203,59 @@ class RequestsSessionConfig(ConfigModel):

          return session

+     @classmethod
+     def get_client_mode_from_session(
+         cls, session: requests.Session
+     ) -> Optional[ClientMode]:
+         """
+         Extract the ClientMode enum from a requests Session by checking the headers.
+
+         Args:
+             session: The requests.Session object to check
+
+         Returns:
+             The corresponding ClientMode enum value if found, None otherwise
+         """
+         # Check if the session has the X-DataHub-Client-Mode header
+         mode_str = session.headers.get("X-DataHub-Client-Mode")
+
+         if not mode_str:
+             return None
+
+         # Try to convert the string value to enum
+         try:
+             # First ensure we're working with a str value
+             if isinstance(mode_str, bytes):
+                 mode_str = mode_str.decode("utf-8")
+
+             # Then find the matching enum value
+             for mode in ClientMode:
+                 if mode.name == mode_str:
+                     return mode
+
+             # If we got here, no matching enum was found
+             return None
+         except Exception:
+             # Handle any other errors
+             return None
+
+     def _get_user_agent_string(self, session: requests.Session) -> str:
+         """Generate appropriate user agent string based on client mode"""
+         version = nice_version_name()
+         client_mode = self.client_mode if self.client_mode else _DEFAULT_CLIENT_MODE
+
+         if "User-Agent" in session.headers:
+             user_agent = session.headers["User-Agent"]
+             if isinstance(user_agent, bytes):
+                 requests_user_agent = " " + user_agent.decode("utf-8")
+             else:
+                 requests_user_agent = " " + user_agent
+         else:
+             requests_user_agent = ""
+
+         # 1.0 refers to the user agent string version
+         return f"DataHub-Client/1.0 ({client_mode.name.lower()}; {self.datahub_component if self.datahub_component else DATAHUB_COMPONENT_ENV}; {version}){requests_user_agent}"
+

  @dataclass
  class _Chunk:
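A hedged round-trip sketch for the two helpers added above: build a session, inspect the generated User-Agent, and recover the ClientMode from the session headers. The exact header values depend on the installed client version:

    from datahub.emitter.rest_emitter import RequestsSessionConfig
    from datahub.ingestion.graph.config import ClientMode

    session = RequestsSessionConfig(client_mode=ClientMode.CLI).build_session()

    # Follows the "DataHub-Client/1.0 (<mode>; <component>; <version>)" pattern,
    # with the pre-existing requests User-Agent appended at the end.
    print(session.headers["User-Agent"])

    # Parses the X-DataHub-Client-Mode header back into the enum (None if absent).
    assert RequestsSessionConfig.get_client_mode_from_session(session) == ClientMode.CLI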
@@ -212,8 +281,9 @@ class DataHubRestEmitter(Closeable, Emitter):
      _gms_server: str
      _token: Optional[str]
      _session: requests.Session
-     _openapi_ingestion: bool
+     _openapi_ingestion: Optional[bool]
      _default_trace_mode: bool
+     server_config: RestServiceConfig

      def __init__(
          self,
@@ -229,10 +299,10 @@ class DataHubRestEmitter(Closeable, Emitter):
          ca_certificate_path: Optional[str] = None,
          client_certificate_path: Optional[str] = None,
          disable_ssl_verification: bool = False,
-         openapi_ingestion: bool = (
-             DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
-         ),
+         openapi_ingestion: Optional[bool] = None,
          default_trace_mode: bool = False,
+         client_mode: Optional[ClientMode] = None,
+         datahub_component: Optional[str] = None,
      ):
          if not gms_server:
              raise ConfigurationError("gms server is required")
@@ -244,13 +314,10 @@ class DataHubRestEmitter(Closeable, Emitter):

          self._gms_server = fixup_gms_url(gms_server)
          self._token = token
-         self.server_config: Dict[str, Any] = {}
-         self._openapi_ingestion = openapi_ingestion
          self._default_trace_mode = default_trace_mode
          self._session = requests.Session()
-
-         logger.debug(
-             f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+         self._openapi_ingestion = (
+             openapi_ingestion  # Re-evaluated after test connection
          )

          if self._default_trace_mode:
@@ -258,7 +325,6 @@ class DataHubRestEmitter(Closeable, Emitter):

          headers = {
              "X-RestLi-Protocol-Version": "2.0.0",
-             "X-DataHub-Py-Cli-Version": nice_version_name(),
              "Content-Type": "application/json",
          }
          if token:
@@ -304,37 +370,54 @@ class DataHubRestEmitter(Closeable, Emitter):
              ca_certificate_path=ca_certificate_path,
              client_certificate_path=client_certificate_path,
              disable_ssl_verification=disable_ssl_verification,
+             client_mode=client_mode,
+             datahub_component=datahub_component,
          )

          self._session = self._session_config.build_session()

      def test_connection(self) -> None:
          url = f"{self._gms_server}/config"
-         response = self._session.get(url)
-         if response.status_code == 200:
-             config: dict = response.json()
-             if config.get("noCode") == "true":
-                 self.server_config = config
-                 return
+         try:
+             # Create a config instance with session and URL
+             config = RestServiceConfig(session=self._session, url=url)
+             # Attempt to load config, which will throw ConfigurationError if there's an issue
+             config.fetch_config()
+             self.server_config = config
+
+             # Determine OpenAPI mode
+             if self._openapi_ingestion is None:
+                 # No constructor parameter
+                 if (
+                     not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
+                     and self._session_config.client_mode == ClientMode.SDK
+                     and self.server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
+                 ):
+                     # Enable if SDK client and no environment variable specified
+                     self._openapi_ingestion = True
+                 else:
+                     # The system env is specifying the value
+                     self._openapi_ingestion = (
+                         DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+                     )

-             else:
-                 raise ConfigurationError(
-                     "You seem to have connected to the frontend service instead of the GMS endpoint. "
-                     "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
-                     "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
-                 )
-         else:
              logger.debug(
-                 f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
+                 f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
              )
-             if response.status_code == 401:
-                 message = f"Unable to connect to {url} - got an authentication error: {response.text}."
-             else:
-                 message = f"Unable to connect to {url} with status_code: {response.status_code}."
-             message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
-             raise ConfigurationError(message)

-     def get_server_config(self) -> dict:
+             # Set default tracing for SDK
+             if (
+                 self._session_config.client_mode == ClientMode.SDK
+                 and self.server_config.supports_feature(ServiceFeature.API_TRACING)
+             ):
+                 # Enable tracing if using SDK & server supported
+                 self._default_trace_mode = True
+
+         except ConfigurationError as e:
+             # Just re-raise the exception
+             raise e
+
+     def get_server_config(self) -> RestServiceConfig:
          self.test_connection()
          return self.server_config

@@ -348,43 +431,24 @@ class DataHubRestEmitter(Closeable, Emitter):
          mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
          async_flag: Optional[bool] = None,
          async_default: bool = False,
-     ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
-         if mcp.aspect and mcp.aspectName:
-             resolved_async_flag = (
-                 async_flag if async_flag is not None else async_default
-             )
-             url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+     ) -> Optional[OpenApiRequest]:
+         """
+         Convert a MetadataChangeProposal to an OpenAPI request format.

-             if isinstance(mcp, MetadataChangeProposalWrapper):
-                 aspect_value = pre_json_transform(
-                     mcp.to_obj(simplified_structure=True)
-                 )["aspect"]["json"]
-             else:
-                 obj = mcp.aspect.to_obj()
-                 content_type = obj.get("contentType")
-                 if obj.get("value") and content_type == JSON_CONTENT_TYPE:
-                     # Undo double serialization.
-                     obj = json.loads(obj["value"])
-                 elif content_type == JSON_PATCH_CONTENT_TYPE:
-                     raise NotImplementedError(
-                         "Patches are not supported for OpenAPI ingestion. Set the endpoint to RESTLI."
-                     )
-                 aspect_value = pre_json_transform(obj)
-             return (
-                 url,
-                 [
-                     {
-                         "urn": mcp.entityUrn,
-                         mcp.aspectName: {
-                             "value": aspect_value,
-                             "systemMetadata": mcp.systemMetadata.to_obj()
-                             if mcp.systemMetadata
-                             else None,
-                         },
-                     }
-                 ],
-             )
-         return None
+         Args:
+             mcp: The metadata change proposal
+             async_flag: Optional flag to override async behavior
+             async_default: Default async behavior if not specified
+
+         Returns:
+             An OpenApiRequest object or None if the MCP doesn't have required fields
+         """
+         return OpenApiRequest.from_mcp(
+             mcp=mcp,
+             gms_server=self._gms_server,
+             async_flag=async_flag,
+             async_default=async_default,
+         )

      def emit(
          self,
448
512
  if self._openapi_ingestion:
449
513
  request = self._to_openapi_request(mcp, async_flag, async_default=False)
450
514
  if request:
451
- response = self._emit_generic(request[0], payload=request[1])
515
+ response = self._emit_generic(
516
+ request.url, payload=request.payload, method=request.method
517
+ )
452
518
 
453
519
  if self._should_trace(async_flag, trace_flag):
454
520
  trace_data = extract_trace_data(response) if response else None
@@ -503,31 +569,36 @@ class DataHubRestEmitter(Closeable, Emitter):
503
569
  trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
504
570
  ) -> int:
505
571
  """
506
- 1. Grouping MCPs by their entity URL
572
+ 1. Grouping MCPs by their HTTP method and entity URL and HTTP method
507
573
  2. Breaking down large batches into smaller chunks based on both:
508
574
  * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
509
575
  * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
510
576
 
511
577
  The Chunk class encapsulates both the items and their byte size tracking
512
- Serializing the items only once with json.dumps(request[1]) and reusing that
578
+ Serializing the items only once with json.dumps(request.payload) and reusing that
513
579
  The chunking logic handles edge cases (always accepting at least one item per chunk)
514
580
  The joining logic is efficient with a simple string concatenation
515
581
 
516
582
  :param mcps: metadata change proposals to transmit
517
583
  :param async_flag: the mode
584
+ :param trace_flag: whether to trace the requests
585
+ :param trace_timeout: timeout for tracing
518
586
  :return: number of requests
519
587
  """
520
- # group by entity url
521
- batches: Dict[str, List[_Chunk]] = defaultdict(
588
+ # Group by entity URL and HTTP method
589
+ batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
522
590
  lambda: [_Chunk(items=[])]
523
591
  ) # Initialize with one empty Chunk
524
592
 
525
593
  for mcp in mcps:
526
594
  request = self._to_openapi_request(mcp, async_flag, async_default=True)
527
595
  if request:
528
- current_chunk = batches[request[0]][-1] # Get the last chunk
529
- # Only serialize once
530
- serialized_item = json.dumps(request[1][0])
596
+ # Create a composite key with both method and URL
597
+ key = (request.method, request.url)
598
+ current_chunk = batches[key][-1] # Get the last chunk
599
+
600
+ # Only serialize once - we're serializing a single payload item
601
+ serialized_item = json.dumps(request.payload[0])
531
602
  item_bytes = len(serialized_item.encode())
532
603
 
533
604
  # If adding this item would exceed max_bytes, create a new chunk
@@ -537,15 +608,17 @@ class DataHubRestEmitter(Closeable, Emitter):
537
608
  or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
538
609
  ):
539
610
  new_chunk = _Chunk(items=[])
540
- batches[request[0]].append(new_chunk)
611
+ batches[key].append(new_chunk)
541
612
  current_chunk = new_chunk
542
613
 
543
614
  current_chunk.add_item(serialized_item)
544
615
 
545
616
  responses = []
546
- for url, chunks in batches.items():
617
+ for (method, url), chunks in batches.items():
547
618
  for chunk in chunks:
548
- response = self._emit_generic(url, payload=_Chunk.join(chunk))
619
+ response = self._emit_generic(
620
+ url, payload=_Chunk.join(chunk), method=method
621
+ )
549
622
  responses.append(response)
550
623
 
551
624
  if self._should_trace(async_flag, trace_flag, async_default=True):
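A compact, self-contained sketch of the grouping-and-chunking strategy described in the docstring above; the constants and the request type are illustrative stand-ins, not the real INGEST_MAX_PAYLOAD_BYTES / BATCH_INGEST_MAX_PAYLOAD_LENGTH or OpenApiRequest:

    import json
    from collections import defaultdict
    from dataclasses import dataclass, field
    from typing import Any, Dict, List, Tuple

    MAX_BYTES = 15_000_000   # stand-in for INGEST_MAX_PAYLOAD_BYTES
    MAX_ITEMS = 200          # stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH

    @dataclass
    class Chunk:
        items: List[str] = field(default_factory=list)
        total_bytes: int = 0

        def add_item(self, serialized: str) -> None:
            self.items.append(serialized)
            self.total_bytes += len(serialized.encode())

    def chunk_requests(requests_: List[Any]) -> Dict[Tuple[str, str], List[Chunk]]:
        # Group requests by (HTTP method, URL); each group starts with one empty chunk.
        batches: Dict[Tuple[str, str], List[Chunk]] = defaultdict(lambda: [Chunk()])
        for req in requests_:
            key = (req.method, req.url)
            chunk = batches[key][-1]
            serialized = json.dumps(req.payload[0])  # serialize the single payload item once
            size = len(serialized.encode())
            # Start a new chunk if this item would exceed either limit,
            # but always accept at least one item per chunk.
            if chunk.items and (
                chunk.total_bytes + size > MAX_BYTES or len(chunk.items) >= MAX_ITEMS
            ):
                chunk = Chunk()
                batches[key].append(chunk)
            chunk.add_item(serialized)
        return batches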
@@ -618,11 +691,13 @@ class DataHubRestEmitter(Closeable, Emitter):
          payload = json.dumps(snapshot)
          self._emit_generic(url, payload)

-     def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+     def _emit_generic(
+         self, url: str, payload: Union[str, Any], method: str = "POST"
+     ) -> requests.Response:
          if not isinstance(payload, str):
              payload = json.dumps(payload)

-         curl_command = make_curl_command(self._session, "POST", url, payload)
+         curl_command = make_curl_command(self._session, method, url, payload)
          payload_size = len(payload)
          if payload_size > INGEST_MAX_PAYLOAD_BYTES:
              # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
@@ -635,7 +710,8 @@ class DataHubRestEmitter(Closeable, Emitter):
                  curl_command,
              )
          try:
-             response = self._session.post(url, data=payload)
+             method_func = getattr(self._session, method.lower())
+             response = method_func(url, data=payload) if payload else method_func(url)
              response.raise_for_status()
              return response
          except HTTPError as e:
@@ -759,12 +835,6 @@ class DataHubRestEmitter(Closeable, Emitter):
              trace_flag if trace_flag is not None else self._default_trace_mode
          )
          resolved_async_flag = async_flag if async_flag is not None else async_default
-         if resolved_trace_flag and not resolved_async_flag:
-             warnings.warn(
-                 "API tracing is only available with async ingestion. For sync mode, API errors will be surfaced as exceptions.",
-                 APITracingWarning,
-                 stacklevel=3,
-             )
          return resolved_trace_flag and resolved_async_flag

      def __repr__(self) -> str:
datahub/entrypoints.py CHANGED
@@ -37,6 +37,7 @@ from datahub.cli.telemetry import telemetry as telemetry_cli
  from datahub.cli.timeline_cli import timeline
  from datahub.configuration.common import should_show_stack_trace
  from datahub.ingestion.graph.client import get_default_graph
+ from datahub.ingestion.graph.config import ClientMode
  from datahub.telemetry import telemetry
  from datahub.utilities._custom_package_loader import model_version_name
  from datahub.utilities.logging_manager import configure_logging
@@ -117,7 +118,7 @@ def version(include_server: bool = False) -> None:
      click.echo(f"Models: {model_version_name()}")
      click.echo(f"Python version: {sys.version}")
      if include_server:
-         server_config = get_default_graph().get_config()
+         server_config = get_default_graph(ClientMode.CLI).get_config()
          click.echo(f"Server config: {server_config}")


datahub/errors.py CHANGED
@@ -41,3 +41,7 @@ class ExperimentalWarning(Warning):

  class APITracingWarning(Warning):
      pass
+
+
+ class DataHubDeprecationWarning(DeprecationWarning):
+     pass
@@ -23,6 +23,7 @@ class EnsureAspectSizeProcessor:
      ):
          self.report = report
          self.payload_constraint = payload_constraint
+         self.schema_size_constraint = int(self.payload_constraint * 0.985)

      def ensure_dataset_profile_size(
          self, dataset_urn: str, profile: DatasetProfileClass
@@ -68,7 +69,7 @@
          for field in schema.fields:
              field_size = len(json.dumps(pre_json_transform(field.to_obj())))
              logger.debug(f"Field {field.fieldPath} takes total {field_size}")
-             if total_fields_size + field_size < self.payload_constraint:
+             if total_fields_size + field_size < self.schema_size_constraint:
                  accepted_fields.append(field)
                  total_fields_size += field_size
              else:
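For context, the new schema_size_constraint reserves about 1.5% of the payload budget for the non-field parts of the serialized aspect; an illustrative calculation with a made-up 16,000,000-byte constraint (the real limit is whatever payload_constraint is passed to the processor):

    payload_constraint = 16_000_000                            # illustrative value only
    schema_size_constraint = int(payload_constraint * 0.985)   # 15_760_000 bytes for fields
    headroom = payload_constraint - schema_size_constraint     # 240_000 bytes kept in reserve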
@@ -420,12 +420,9 @@
          Run in order, first in list is applied first. Be careful with order when overriding.
          """
          browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
-         if (
-             self.ctx.pipeline_config
-             and self.ctx.pipeline_config.flags.generate_browse_path_v2
-         ):
+         if self.ctx.flags.generate_browse_path_v2:
              browse_path_processor = self._get_browse_path_processor(
-                 self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run
+                 self.ctx.flags.generate_browse_path_v2_dry_run
              )

          auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
@@ -92,6 +92,7 @@ def create_dataset_props_patch_builder(
      patch_builder.set_last_modified(dataset_properties.lastModified)
      patch_builder.set_qualified_name(dataset_properties.qualifiedName)
      patch_builder.add_custom_properties(dataset_properties.customProperties)
+     patch_builder.set_external_url(dataset_properties.externalUrl)

      return patch_builder

@@ -319,8 +319,10 @@ def classification_workunit_processor(
          partial(
              data_reader.get_sample_data_for_table,
              table_id,
-             classification_handler.config.classification.sample_size
-             * SAMPLE_SIZE_MULTIPLIER,
+             int(
+                 classification_handler.config.classification.sample_size
+                 * SAMPLE_SIZE_MULTIPLIER
+             ),
              **(data_reader_kwargs or {}),
          )
          if data_reader
@@ -34,14 +34,13 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
  from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.rest_emitter import (
-     DEFAULT_REST_EMITTER_ENDPOINT,
      DEFAULT_REST_TRACE_MODE,
      DatahubRestEmitter,
-     RestSinkEndpoint,
      RestTraceMode,
  )
  from datahub.emitter.serialization_helper import post_json_transform
  from datahub.ingestion.graph.config import (
+     ClientMode,
      DatahubClientConfig as DatahubClientConfig,
  )
  from datahub.ingestion.graph.connections import (
@@ -55,6 +54,7 @@ from datahub.ingestion.graph.filters import (
      RemovedStatusFilter,
      generate_filter,
  )
+ from datahub.ingestion.graph.links import make_url_for_urn
  from datahub.ingestion.source.state.checkpoint import Checkpoint
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
      MetadataChangeEvent,
@@ -158,11 +158,12 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
              ca_certificate_path=self.config.ca_certificate_path,
              client_certificate_path=self.config.client_certificate_path,
              disable_ssl_verification=self.config.disable_ssl_verification,
-             openapi_ingestion=DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI,
+             openapi_ingestion=self.config.openapi_ingestion,
              default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
+             client_mode=config.client_mode,
+             datahub_component=config.datahub_component,
          )
-
-         self.server_id = _MISSING_SERVER_ID
+         self.server_id: str = _MISSING_SERVER_ID

      def test_connection(self) -> None:
          super().test_connection()
@@ -187,20 +188,36 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
          """Get the public-facing base url of the frontend

          This url can be used to construct links to the frontend. The url will not include a trailing slash.
+
          Note: Only supported with DataHub Cloud.
          """

          if not self.server_config:
              self.test_connection()

-         base_url = self.server_config.get("baseUrl")
+         base_url = self.server_config.raw_config.get("baseUrl")
          if not base_url:
              raise ValueError("baseUrl not found in server config")
          return base_url

+     def url_for(self, entity_urn: Union[str, Urn]) -> str:
+         """Get the UI url for an entity.
+
+         Note: Only supported with DataHub Cloud.
+
+         Args:
+             entity_urn: The urn of the entity to get the url for.
+
+         Returns:
+             The public-facing url for the entity.
+         """
+
+         return make_url_for_urn(self.frontend_base_url, str(entity_urn))
+
      @classmethod
      def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
          session_config = emitter._session_config
+
          if isinstance(session_config.timeout, tuple):
              # TODO: This is slightly lossy. Eventually, we want to modify the emitter
              # to accept a tuple for timeout_sec, and then we'll be able to remove this.
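A hedged usage sketch for the new url_for() helper (DataHub Cloud only, since it relies on frontend_base_url); the server URL, token, and URN below are placeholders:

    from datahub.ingestion.graph.client import DataHubGraph
    from datahub.ingestion.graph.config import DatahubClientConfig

    graph = DataHubGraph(
        DatahubClientConfig(server="https://<name>.acryl.io/gms", token="<token>")
    )
    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"
    # Builds a UI link for the entity from frontend_base_url via make_url_for_urn().
    print(graph.url_for(dataset_urn))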
@@ -218,6 +235,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                  disable_ssl_verification=session_config.disable_ssl_verification,
                  ca_certificate_path=session_config.ca_certificate_path,
                  client_certificate_path=session_config.client_certificate_path,
+                 client_mode=session_config.client_mode,
+                 datahub_component=session_config.datahub_component,
              )
          )

@@ -358,7 +377,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
          )

      def get_config(self) -> Dict[str, Any]:
-         return self._get_generic(f"{self.config.server}/config")
+         return self.get_server_config().config

      def get_ownership(self, entity_urn: str) -> Optional[OwnershipClass]:
          return self.get_aspect(entity_urn=entity_urn, aspect_type=OwnershipClass)
@@ -1952,8 +1971,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
          super().close()


- def get_default_graph() -> DataHubGraph:
+ @functools.lru_cache(maxsize=None)
+ def get_default_graph(
+     client_mode: Optional[ClientMode] = None,
+     datahub_component: Optional[str] = None,
+ ) -> DataHubGraph:
      graph_config = config_utils.load_client_config()
+     graph_config.client_mode = client_mode
+     graph_config.datahub_component = datahub_component
      graph = DataHubGraph(graph_config)
      graph.test_connection()
      telemetry_instance.set_context(server=graph)
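A short hedged sketch of calling the now-cached get_default_graph() with an explicit client mode, mirroring the entrypoints.py change above; the graph is built from the locally stored client config (for example ~/.datahubenv or environment variables):

    from datahub.ingestion.graph.client import get_default_graph
    from datahub.ingestion.graph.config import ClientMode

    # Repeated calls with the same arguments reuse the lru_cache'd instance.
    graph = get_default_graph(ClientMode.CLI)
    print(graph.get_config())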
@@ -1,8 +1,19 @@
+ import os
+ from enum import Enum, auto
  from typing import Dict, List, Optional

  from datahub.configuration.common import ConfigModel


+ class ClientMode(Enum):
+     INGESTION = auto()
+     CLI = auto()
+     SDK = auto()
+
+
+ DATAHUB_COMPONENT_ENV: str = os.getenv("DATAHUB_COMPONENT", "datahub").lower()
+
+
  class DatahubClientConfig(ConfigModel):
      """Configuration class for holding connectivity to datahub gms"""

@@ -17,3 +28,6 @@ class DatahubClientConfig(ConfigModel):
      ca_certificate_path: Optional[str] = None
      client_certificate_path: Optional[str] = None
      disable_ssl_verification: bool = False
+     openapi_ingestion: Optional[bool] = None
+     client_mode: Optional[ClientMode] = None
+     datahub_component: Optional[str] = None
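A minimal sketch of the new DatahubClientConfig knobs; "airflow-plugin" is a hypothetical component label, and leaving openapi_ingestion as None lets the emitter decide after the /config handshake (see the rest_emitter.py changes above):

    from datahub.ingestion.graph.client import DataHubGraph
    from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig

    config = DatahubClientConfig(
        server="http://localhost:8080",
        client_mode=ClientMode.INGESTION,     # sent as the X-DataHub-Client-Mode header
        datahub_component="airflow-plugin",   # hypothetical label surfaced in the User-Agent
        openapi_ingestion=None,               # None -> auto-detected from server features
    )
    graph = DataHubGraph(config)
    graph.test_connection()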
@@ -18,7 +18,7 @@ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
  # This can be put directly into the orFilters parameter in GraphQL.
  RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]

- # Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
+ # Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
  FilterOperator: TypeAlias = Literal[
      "CONTAIN",
      "EQUAL",