acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (87)
  1. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2524 -2471
  2. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +87 -87
  3. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datajob/dataflow.py +3 -3
  7. datahub/api/entities/forms/forms.py +34 -34
  8. datahub/api/graphql/assertion.py +1 -1
  9. datahub/api/graphql/operation.py +4 -4
  10. datahub/cli/check_cli.py +3 -2
  11. datahub/cli/config_utils.py +2 -2
  12. datahub/cli/delete_cli.py +6 -5
  13. datahub/cli/docker_cli.py +2 -2
  14. datahub/cli/exists_cli.py +2 -1
  15. datahub/cli/get_cli.py +2 -1
  16. datahub/cli/iceberg_cli.py +6 -5
  17. datahub/cli/ingest_cli.py +9 -6
  18. datahub/cli/migrate.py +4 -3
  19. datahub/cli/migration_utils.py +4 -3
  20. datahub/cli/put_cli.py +3 -2
  21. datahub/cli/specific/assertions_cli.py +2 -1
  22. datahub/cli/specific/datacontract_cli.py +3 -2
  23. datahub/cli/specific/dataproduct_cli.py +10 -9
  24. datahub/cli/specific/dataset_cli.py +4 -3
  25. datahub/cli/specific/forms_cli.py +2 -1
  26. datahub/cli/specific/group_cli.py +2 -1
  27. datahub/cli/specific/structuredproperties_cli.py +4 -3
  28. datahub/cli/specific/user_cli.py +2 -1
  29. datahub/cli/state_cli.py +2 -1
  30. datahub/cli/timeline_cli.py +2 -1
  31. datahub/configuration/source_common.py +1 -1
  32. datahub/emitter/request_helper.py +116 -3
  33. datahub/emitter/rest_emitter.py +163 -93
  34. datahub/entrypoints.py +2 -1
  35. datahub/ingestion/api/source.py +2 -5
  36. datahub/ingestion/glossary/classification_mixin.py +4 -2
  37. datahub/ingestion/graph/client.py +16 -7
  38. datahub/ingestion/graph/config.py +14 -0
  39. datahub/ingestion/graph/filters.py +1 -1
  40. datahub/ingestion/run/pipeline.py +3 -2
  41. datahub/ingestion/run/pipeline_config.py +1 -1
  42. datahub/ingestion/sink/datahub_rest.py +5 -6
  43. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  44. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  45. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  46. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  47. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  48. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  49. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  50. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  51. datahub/ingestion/source/feast.py +4 -4
  52. datahub/ingestion/source/ge_data_profiler.py +2 -1
  53. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  54. datahub/ingestion/source/ldap.py +1 -1
  55. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  56. datahub/ingestion/source/looker/lookml_source.py +7 -1
  57. datahub/ingestion/source/metadata/lineage.py +2 -1
  58. datahub/ingestion/source/mode.py +74 -28
  59. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  60. datahub/ingestion/source/powerbi/config.py +1 -1
  61. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  62. datahub/ingestion/source/redshift/usage.py +10 -9
  63. datahub/ingestion/source/sql/clickhouse.py +5 -1
  64. datahub/ingestion/source/sql/druid.py +7 -2
  65. datahub/ingestion/source/sql/oracle.py +6 -2
  66. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  67. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  68. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  69. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  70. datahub/integrations/assertion/common.py +3 -2
  71. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
  72. datahub/metadata/_urns/urn_defs.py +1786 -1786
  73. datahub/metadata/schema.avsc +17364 -16988
  74. datahub/metadata/schema_classes.py +3 -3
  75. datahub/metadata/schemas/__init__.py +3 -3
  76. datahub/sdk/main_client.py +2 -2
  77. datahub/secret/datahub_secret_store.py +2 -1
  78. datahub/telemetry/telemetry.py +2 -2
  79. datahub/testing/check_imports.py +1 -1
  80. datahub/upgrade/upgrade.py +10 -12
  81. datahub/utilities/logging_manager.py +8 -1
  82. datahub/utilities/server_config_util.py +378 -10
  83. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  84. datahub/utilities/urn_encoder.py +1 -1
  85. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py CHANGED
@@ -5,7 +5,6 @@ import json
 import logging
 import os
 import time
-import warnings
 from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timedelta
@@ -41,23 +40,26 @@ from datahub.configuration.common import (
     TraceTimeoutError,
     TraceValidationError,
 )
-from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
 from datahub.emitter.response_helper import (
     TraceData,
     extract_trace_data,
     extract_trace_data_from_mcps,
 )
 from datahub.emitter.serialization_helper import pre_json_transform
-from datahub.errors import APITracingWarning
 from datahub.ingestion.api.closeable import Closeable
+from datahub.ingestion.graph.config import (
+    DATAHUB_COMPONENT_ENV,
+    ClientMode,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
+from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature

 if TYPE_CHECKING:
     from datahub.ingestion.graph.client import DataHubGraph
@@ -80,6 +82,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(

 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)

+_DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
+
 TRACE_PENDING_STATUS = "PENDING"
 TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
 TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
@@ -134,12 +138,24 @@ class RequestsSessionConfig(ConfigModel):
     ca_certificate_path: Optional[str] = None
     client_certificate_path: Optional[str] = None
     disable_ssl_verification: bool = False
+    client_mode: Optional[ClientMode] = _DEFAULT_CLIENT_MODE
+    datahub_component: Optional[str] = None

     def build_session(self) -> requests.Session:
         session = requests.Session()

-        if self.extra_headers:
-            session.headers.update(self.extra_headers)
+        user_agent = self._get_user_agent_string(session)
+
+        base_headers = {
+            "User-Agent": user_agent,
+            "X-DataHub-Client-Mode": self.client_mode.name
+            if self.client_mode
+            else _DEFAULT_CLIENT_MODE.name,
+            "X-DataHub-Py-Cli-Version": nice_version_name(),
+        }
+
+        headers = {**base_headers, **self.extra_headers}
+        session.headers.update(headers)

         if self.client_certificate_path:
             session.cert = self.client_certificate_path
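
Every session built by RequestsSessionConfig now identifies its caller. Note the merge order in build_session: user-supplied extra_headers are spread last, so they can override any of the base headers. A minimal sketch of the resulting defaults, assuming an otherwise default config (header values are illustrative):

    from datahub.emitter.rest_emitter import RequestsSessionConfig
    from datahub.ingestion.graph.config import ClientMode

    session = RequestsSessionConfig(client_mode=ClientMode.INGESTION).build_session()
    # session.headers now includes, roughly:
    #   User-Agent: DataHub-Client/1.0 (ingestion; datahub; <cli version>) python-requests/2.x
    #   X-DataHub-Client-Mode: INGESTION
    #   X-DataHub-Py-Cli-Version: <cli version>
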
@@ -187,6 +203,59 @@ class RequestsSessionConfig(ConfigModel):

         return session

+    @classmethod
+    def get_client_mode_from_session(
+        cls, session: requests.Session
+    ) -> Optional[ClientMode]:
+        """
+        Extract the ClientMode enum from a requests Session by checking the headers.
+
+        Args:
+            session: The requests.Session object to check
+
+        Returns:
+            The corresponding ClientMode enum value if found, None otherwise
+        """
+        # Check if the session has the X-DataHub-Client-Mode header
+        mode_str = session.headers.get("X-DataHub-Client-Mode")
+
+        if not mode_str:
+            return None
+
+        # Try to convert the string value to enum
+        try:
+            # First ensure we're working with a str value
+            if isinstance(mode_str, bytes):
+                mode_str = mode_str.decode("utf-8")
+
+            # Then find the matching enum value
+            for mode in ClientMode:
+                if mode.name == mode_str:
+                    return mode
+
+            # If we got here, no matching enum was found
+            return None
+        except Exception:
+            # Handle any other errors
+            return None
+
+    def _get_user_agent_string(self, session: requests.Session) -> str:
+        """Generate appropriate user agent string based on client mode"""
+        version = nice_version_name()
+        client_mode = self.client_mode if self.client_mode else _DEFAULT_CLIENT_MODE
+
+        if "User-Agent" in session.headers:
+            user_agent = session.headers["User-Agent"]
+            if isinstance(user_agent, bytes):
+                requests_user_agent = " " + user_agent.decode("utf-8")
+            else:
+                requests_user_agent = " " + user_agent
+        else:
+            requests_user_agent = ""
+
+        # 1.0 refers to the user agent string version
+        return f"DataHub-Client/1.0 ({client_mode.name.lower()}; {self.datahub_component if self.datahub_component else DATAHUB_COMPONENT_ENV}; {version}){requests_user_agent}"
+

 @dataclass
 class _Chunk:
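
get_client_mode_from_session is the inverse of build_session: it recovers the mode from the X-DataHub-Client-Mode header and returns None rather than raising when the header is absent or unrecognized. A quick round-trip sketch under the same assumptions as above:

    import requests

    config = RequestsSessionConfig(client_mode=ClientMode.SDK)
    session = config.build_session()
    assert RequestsSessionConfig.get_client_mode_from_session(session) is ClientMode.SDK

    # Sessions built elsewhere carry no mode header and simply yield None.
    assert RequestsSessionConfig.get_client_mode_from_session(requests.Session()) is None
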
@@ -212,8 +281,9 @@ class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
-    _openapi_ingestion: bool
+    _openapi_ingestion: Optional[bool]
     _default_trace_mode: bool
+    server_config: RestServiceConfig

     def __init__(
         self,
@@ -229,10 +299,10 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
-        openapi_ingestion: bool = (
-            DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
-        ),
+        openapi_ingestion: Optional[bool] = None,
         default_trace_mode: bool = False,
+        client_mode: Optional[ClientMode] = None,
+        datahub_component: Optional[str] = None,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -244,13 +314,10 @@

         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
-        self.server_config: Dict[str, Any] = {}
-        self._openapi_ingestion = openapi_ingestion
         self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
-
-        logger.debug(
-            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        self._openapi_ingestion = (
+            openapi_ingestion  # Re-evaluated after test connection
         )

         if self._default_trace_mode:
@@ -258,7 +325,6 @@

         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
-            "X-DataHub-Py-Cli-Version": nice_version_name(),
             "Content-Type": "application/json",
         }
         if token:
@@ -304,37 +370,54 @@ class DataHubRestEmitter(Closeable, Emitter):
             ca_certificate_path=ca_certificate_path,
             client_certificate_path=client_certificate_path,
             disable_ssl_verification=disable_ssl_verification,
+            client_mode=client_mode,
+            datahub_component=datahub_component,
         )

         self._session = self._session_config.build_session()

     def test_connection(self) -> None:
         url = f"{self._gms_server}/config"
-        response = self._session.get(url)
-        if response.status_code == 200:
-            config: dict = response.json()
-            if config.get("noCode") == "true":
-                self.server_config = config
-                return
+        try:
+            # Create a config instance with session and URL
+            config = RestServiceConfig(session=self._session, url=url)
+            # Attempt to load config, which will throw ConfigurationError if there's an issue
+            config.fetch_config()
+            self.server_config = config
+
+            # Determine OpenAPI mode
+            if self._openapi_ingestion is None:
+                # No constructor parameter
+                if (
+                    not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
+                    and self._session_config.client_mode == ClientMode.SDK
+                    and self.server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
+                ):
+                    # Enable if SDK client and no environment variable specified
+                    self._openapi_ingestion = True
+                else:
+                    # The system env is specifying the value
+                    self._openapi_ingestion = (
+                        DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+                    )

-            else:
-                raise ConfigurationError(
-                    "You seem to have connected to the frontend service instead of the GMS endpoint. "
-                    "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
-                    "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
-                )
-        else:
             logger.debug(
-                f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
+                f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
             )
-            if response.status_code == 401:
-                message = f"Unable to connect to {url} - got an authentication error: {response.text}."
-            else:
-                message = f"Unable to connect to {url} with status_code: {response.status_code}."
-            message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
-            raise ConfigurationError(message)

-    def get_server_config(self) -> dict:
+            # Set default tracing for SDK
+            if (
+                self._session_config.client_mode == ClientMode.SDK
+                and self.server_config.supports_feature(ServiceFeature.API_TRACING)
+            ):
+                # Enable tracing if using SDK & server supported
+                self._default_trace_mode = True
+
+        except ConfigurationError as e:
+            # Just re-raise the exception
+            raise e
+
+    def get_server_config(self) -> RestServiceConfig:
         self.test_connection()
         return self.server_config
@@ -348,43 +431,24 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
         async_default: bool = False,
-    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
-        if mcp.aspect and mcp.aspectName:
-            resolved_async_flag = (
-                async_flag if async_flag is not None else async_default
-            )
-            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+    ) -> Optional[OpenApiRequest]:
+        """
+        Convert a MetadataChangeProposal to an OpenAPI request format.

-            if isinstance(mcp, MetadataChangeProposalWrapper):
-                aspect_value = pre_json_transform(
-                    mcp.to_obj(simplified_structure=True)
-                )["aspect"]["json"]
-            else:
-                obj = mcp.aspect.to_obj()
-                content_type = obj.get("contentType")
-                if obj.get("value") and content_type == JSON_CONTENT_TYPE:
-                    # Undo double serialization.
-                    obj = json.loads(obj["value"])
-                elif content_type == JSON_PATCH_CONTENT_TYPE:
-                    raise NotImplementedError(
-                        "Patches are not supported for OpenAPI ingestion. Set the endpoint to RESTLI."
-                    )
-                aspect_value = pre_json_transform(obj)
-            return (
-                url,
-                [
-                    {
-                        "urn": mcp.entityUrn,
-                        mcp.aspectName: {
-                            "value": aspect_value,
-                            "systemMetadata": mcp.systemMetadata.to_obj()
-                            if mcp.systemMetadata
-                            else None,
-                        },
-                    }
-                ],
-            )
-        return None
+        Args:
+            mcp: The metadata change proposal
+            async_flag: Optional flag to override async behavior
+            async_default: Default async behavior if not specified
+
+        Returns:
+            An OpenApiRequest object or None if the MCP doesn't have required fields
+        """
+        return OpenApiRequest.from_mcp(
+            mcp=mcp,
+            gms_server=self._gms_server,
+            async_flag=async_flag,
+            async_default=async_default,
+        )

     def emit(
         self,
448
512
  if self._openapi_ingestion:
449
513
  request = self._to_openapi_request(mcp, async_flag, async_default=False)
450
514
  if request:
451
- response = self._emit_generic(request[0], payload=request[1])
515
+ response = self._emit_generic(
516
+ request.url, payload=request.payload, method=request.method
517
+ )
452
518
 
453
519
  if self._should_trace(async_flag, trace_flag):
454
520
  trace_data = extract_trace_data(response) if response else None
@@ -503,31 +569,36 @@ class DataHubRestEmitter(Closeable, Emitter):
         trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
-        1. Grouping MCPs by their entity URL
+        1. Grouping MCPs by their HTTP method and entity URL
         2. Breaking down large batches into smaller chunks based on both:
           * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
           * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)

         The Chunk class encapsulates both the items and their byte size tracking
-        Serializing the items only once with json.dumps(request[1]) and reusing that
+        Serializing the items only once with json.dumps(request.payload) and reusing that
         The chunking logic handles edge cases (always accepting at least one item per chunk)
         The joining logic is efficient with a simple string concatenation

         :param mcps: metadata change proposals to transmit
         :param async_flag: the mode
+        :param trace_flag: whether to trace the requests
+        :param trace_timeout: timeout for tracing
         :return: number of requests
         """
-        # group by entity url
-        batches: Dict[str, List[_Chunk]] = defaultdict(
+        # Group by entity URL and HTTP method
+        batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
             lambda: [_Chunk(items=[])]
         )  # Initialize with one empty Chunk

         for mcp in mcps:
             request = self._to_openapi_request(mcp, async_flag, async_default=True)
             if request:
-                current_chunk = batches[request[0]][-1]  # Get the last chunk
-                # Only serialize once
-                serialized_item = json.dumps(request[1][0])
+                # Create a composite key with both method and URL
+                key = (request.method, request.url)
+                current_chunk = batches[key][-1]  # Get the last chunk
+
+                # Only serialize once - we're serializing a single payload item
+                serialized_item = json.dumps(request.payload[0])
                 item_bytes = len(serialized_item.encode())

                 # If adding this item would exceed max_bytes, create a new chunk
@@ -537,15 +608,17 @@
                     or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
                 ):
                     new_chunk = _Chunk(items=[])
-                    batches[request[0]].append(new_chunk)
+                    batches[key].append(new_chunk)
                     current_chunk = new_chunk

                 current_chunk.add_item(serialized_item)

         responses = []
-        for url, chunks in batches.items():
+        for (method, url), chunks in batches.items():
             for chunk in chunks:
-                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                response = self._emit_generic(
+                    url, payload=_Chunk.join(chunk), method=method
+                )
                 responses.append(response)

         if self._should_trace(async_flag, trace_flag, async_default=True):
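
A self-contained sketch of the chunking strategy the docstring describes, with illustrative names and caps (the real code keys batches by (method, url), enforces both a byte cap and an item cap, and always accepts at least one item per chunk):

    from collections import defaultdict
    from dataclasses import dataclass, field
    from typing import Dict, List, Tuple

    MAX_BYTES = 15 * 1024 * 1024  # stand-in for INGEST_MAX_PAYLOAD_BYTES
    MAX_ITEMS = 200               # stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH

    @dataclass
    class Chunk:
        items: List[str] = field(default_factory=list)
        total_bytes: int = 0

        def add_item(self, item: str) -> None:
            self.items.append(item)
            self.total_bytes += len(item.encode())

    batches: Dict[Tuple[str, str], List[Chunk]] = defaultdict(lambda: [Chunk()])

    def add(method: str, url: str, serialized_item: str) -> None:
        chunk = batches[(method, url)][-1]
        item_bytes = len(serialized_item.encode())
        # Roll over to a new chunk when this item would overflow either cap,
        # but never leave a chunk empty.
        if chunk.items and (
            chunk.total_bytes + item_bytes > MAX_BYTES
            or len(chunk.items) >= MAX_ITEMS
        ):
            chunk = Chunk()
            batches[(method, url)].append(chunk)
        chunk.add_item(serialized_item)
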
@@ -618,11 +691,13 @@
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)

-    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+    def _emit_generic(
+        self, url: str, payload: Union[str, Any], method: str = "POST"
+    ) -> requests.Response:
         if not isinstance(payload, str):
             payload = json.dumps(payload)

-        curl_command = make_curl_command(self._session, "POST", url, payload)
+        curl_command = make_curl_command(self._session, method, url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
             # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
@@ -635,7 +710,8 @@
                 curl_command,
             )
         try:
-            response = self._session.post(url, data=payload)
+            method_func = getattr(self._session, method.lower())
+            response = method_func(url, data=payload) if payload else method_func(url)
             response.raise_for_status()
             return response
         except HTTPError as e:
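
_emit_generic now dispatches on the verb name instead of hard-coding session.post; getattr(session, method.lower()) resolves to the standard requests helpers (post, patch, put, ...), so nothing about request construction changes:

    import requests

    session = requests.Session()
    for method in ("POST", "PATCH", "PUT"):
        method_func = getattr(session, method.lower())  # session.post / session.patch / ...
        assert callable(method_func)
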
@@ -759,12 +835,6 @@
             trace_flag if trace_flag is not None else self._default_trace_mode
         )
         resolved_async_flag = async_flag if async_flag is not None else async_default
-        if resolved_trace_flag and not resolved_async_flag:
-            warnings.warn(
-                "API tracing is only available with async ingestion. For sync mode, API errors will be surfaced as exceptions.",
-                APITracingWarning,
-                stacklevel=3,
-            )
         return resolved_trace_flag and resolved_async_flag

     def __repr__(self) -> str:
datahub/entrypoints.py CHANGED
@@ -37,6 +37,7 @@ from datahub.cli.telemetry import telemetry as telemetry_cli
 from datahub.cli.timeline_cli import timeline
 from datahub.configuration.common import should_show_stack_trace
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.logging_manager import configure_logging
@@ -117,7 +118,7 @@ def version(include_server: bool = False) -> None:
     click.echo(f"Models: {model_version_name()}")
     click.echo(f"Python version: {sys.version}")
     if include_server:
-        server_config = get_default_graph().get_config()
+        server_config = get_default_graph(ClientMode.CLI).get_config()
         click.echo(f"Server config: {server_config}")
datahub/ingestion/api/source.py CHANGED
@@ -420,12 +420,9 @@ class Source(Closeable, metaclass=ABCMeta):
         Run in order, first in list is applied first. Be careful with order when overriding.
         """
         browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
-        if (
-            self.ctx.pipeline_config
-            and self.ctx.pipeline_config.flags.generate_browse_path_v2
-        ):
+        if self.ctx.flags.generate_browse_path_v2:
             browse_path_processor = self._get_browse_path_processor(
-                self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run
+                self.ctx.flags.generate_browse_path_v2_dry_run
             )

         auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
datahub/ingestion/glossary/classification_mixin.py CHANGED
@@ -319,8 +319,10 @@ def classification_workunit_processor(
         partial(
             data_reader.get_sample_data_for_table,
             table_id,
-            classification_handler.config.classification.sample_size
-            * SAMPLE_SIZE_MULTIPLIER,
+            int(
+                classification_handler.config.classification.sample_size
+                * SAMPLE_SIZE_MULTIPLIER
+            ),
             **(data_reader_kwargs or {}),
         )
         if data_reader
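
SAMPLE_SIZE_MULTIPLIER is a float, so the product was previously passed along as a float; wrapping it in int() truncates it back to the integer row count the data readers expect. For example (multiplier value illustrative):

    SAMPLE_SIZE_MULTIPLIER = 1.2  # illustrative value
    sample_size = 1000
    assert int(sample_size * SAMPLE_SIZE_MULTIPLIER) == 1200
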
datahub/ingestion/graph/client.py CHANGED
@@ -34,14 +34,13 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
 from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.rest_emitter import (
-    DEFAULT_REST_EMITTER_ENDPOINT,
     DEFAULT_REST_TRACE_MODE,
     DatahubRestEmitter,
-    RestSinkEndpoint,
     RestTraceMode,
 )
 from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.graph.config import (
+    ClientMode,
     DatahubClientConfig as DatahubClientConfig,
 )
 from datahub.ingestion.graph.connections import (
@@ -158,11 +157,12 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             ca_certificate_path=self.config.ca_certificate_path,
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
-            openapi_ingestion=DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI,
+            openapi_ingestion=self.config.openapi_ingestion,
             default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
+            client_mode=config.client_mode,
+            datahub_component=config.datahub_component,
         )
-
-        self.server_id = _MISSING_SERVER_ID
+        self.server_id: str = _MISSING_SERVER_ID

     def test_connection(self) -> None:
         super().test_connection()
@@ -193,7 +193,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         if not self.server_config:
             self.test_connection()

-        base_url = self.server_config.get("baseUrl")
+        base_url = self.server_config.raw_config.get("baseUrl")
         if not base_url:
             raise ValueError("baseUrl not found in server config")
         return base_url
@@ -201,6 +201,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     @classmethod
     def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
         session_config = emitter._session_config
+
         if isinstance(session_config.timeout, tuple):
             # TODO: This is slightly lossy. Eventually, we want to modify the emitter
             # to accept a tuple for timeout_sec, and then we'll be able to remove this.
@@ -218,6 +219,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 disable_ssl_verification=session_config.disable_ssl_verification,
                 ca_certificate_path=session_config.ca_certificate_path,
                 client_certificate_path=session_config.client_certificate_path,
+                client_mode=session_config.client_mode,
+                datahub_component=session_config.datahub_component,
             )
         )

@@ -1952,8 +1955,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         super().close()


-def get_default_graph() -> DataHubGraph:
+@functools.lru_cache(maxsize=None)
+def get_default_graph(
+    client_mode: Optional[ClientMode] = None,
+    datahub_component: Optional[str] = None,
+) -> DataHubGraph:
     graph_config = config_utils.load_client_config()
+    graph_config.client_mode = client_mode
+    graph_config.datahub_component = datahub_component
     graph = DataHubGraph(graph_config)
     graph.test_connection()
     telemetry_instance.set_context(server=graph)
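
get_default_graph is now memoized with functools.lru_cache, so repeated calls with the same arguments share one connected DataHubGraph instead of re-reading the client config and re-testing the connection each time. A sketch, assuming a working client config:

    from datahub.ingestion.graph.client import get_default_graph
    from datahub.ingestion.graph.config import ClientMode

    graph = get_default_graph(ClientMode.CLI)
    assert graph is get_default_graph(ClientMode.CLI)  # same cached instance
    # A different mode is a different cache key and yields a separate instance.
    sdk_graph = get_default_graph(ClientMode.SDK)
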
datahub/ingestion/graph/config.py CHANGED
@@ -1,8 +1,19 @@
+import os
+from enum import Enum, auto
 from typing import Dict, List, Optional

 from datahub.configuration.common import ConfigModel


+class ClientMode(Enum):
+    INGESTION = auto()
+    CLI = auto()
+    SDK = auto()
+
+
+DATAHUB_COMPONENT_ENV: str = os.getenv("DATAHUB_COMPONENT", "datahub").lower()
+
+
 class DatahubClientConfig(ConfigModel):
     """Configuration class for holding connectivity to datahub gms"""

@@ -17,3 +28,6 @@ class DatahubClientConfig(ConfigModel):
     ca_certificate_path: Optional[str] = None
     client_certificate_path: Optional[str] = None
     disable_ssl_verification: bool = False
+    openapi_ingestion: Optional[bool] = None
+    client_mode: Optional[ClientMode] = None
+    datahub_component: Optional[str] = None
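
The new fields are plumbed through DatahubClientConfig, so callers can tag themselves explicitly. A minimal sketch (server URL and component name are illustrative):

    from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig

    config = DatahubClientConfig(
        server="http://localhost:8080",  # illustrative GMS address
        client_mode=ClientMode.SDK,
        datahub_component="my-service",  # hypothetical component name
    )
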
datahub/ingestion/graph/filters.py CHANGED
@@ -18,7 +18,7 @@ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
 # This can be put directly into the orFilters parameter in GraphQL.
 RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]

-# Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
+# Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
 FilterOperator: TypeAlias = Literal[
     "CONTAIN",
     "EQUAL",
datahub/ingestion/run/pipeline.py CHANGED
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
 from datahub.ingestion.api.transform import Transformer
 from datahub.ingestion.extractor.extractor_registry import extractor_registry
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.reporting.reporting_provider_registry import (
     reporting_provider_registry,
 )
@@ -136,9 +137,8 @@ class CliReport(Report):


 def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
-    graph = get_default_graph()
+    graph = get_default_graph(ClientMode.INGESTION)
     sink_config = graph._make_rest_sink_config()
-
     return DatahubRestSink(ctx, sink_config)


@@ -175,6 +175,7 @@ class Pipeline:
         self.graph: Optional[DataHubGraph] = None
         with _add_init_error_context("connect to DataHub"):
             if self.config.datahub_api:
+                self.config.datahub_api.client_mode = ClientMode.INGESTION
                 self.graph = exit_stack.enter_context(
                     DataHubGraph(self.config.datahub_api)
                 )
datahub/ingestion/run/pipeline_config.py CHANGED
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
 from pydantic import Field, validator

 from datahub.configuration.common import ConfigModel, DynamicTypedConfig
-from datahub.ingestion.graph.client import DatahubClientConfig
+from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig

 logger = logging.getLogger(__name__)
datahub/ingestion/sink/datahub_rest.py CHANGED
@@ -34,7 +34,7 @@ from datahub.ingestion.api.sink import (
     WriteCallback,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.graph.client import DatahubClientConfig
+from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
@@ -140,11 +140,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
                 f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
             ) from exc

-        self.report.gms_version = (
-            gms_config.get("versions", {})
-            .get("acryldata/datahub", {})
-            .get("version", None)
-        )
+        self.report.gms_version = gms_config.service_version
         self.report.mode = self.config.mode
         self.report.max_threads = self.config.max_threads
         logger.debug("Setting env variables to override config")
@@ -180,6 +176,8 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
             default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
+            client_mode=config.client_mode,
+            datahub_component=config.datahub_component,
         )

     @property
@@ -190,6 +188,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         # https://github.com/psf/requests/issues/1871#issuecomment-32751346
         thread_local = self._emitter_thread_local
         if not hasattr(thread_local, "emitter"):
+            self.config.client_mode = ClientMode.INGESTION
             thread_local.emitter = DatahubRestSink._make_emitter(self.config)
         return thread_local.emitter