acryl-datahub 1.0.0.3rc10__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (45)
  1. {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2471 -2418
  2. {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +45 -45
  3. {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/forms/forms.py +2 -1
  6. datahub/cli/check_cli.py +3 -2
  7. datahub/cli/config_utils.py +2 -2
  8. datahub/cli/delete_cli.py +5 -4
  9. datahub/cli/exists_cli.py +2 -1
  10. datahub/cli/get_cli.py +2 -1
  11. datahub/cli/iceberg_cli.py +6 -5
  12. datahub/cli/ingest_cli.py +9 -6
  13. datahub/cli/migrate.py +4 -3
  14. datahub/cli/migration_utils.py +4 -3
  15. datahub/cli/put_cli.py +3 -2
  16. datahub/cli/specific/assertions_cli.py +2 -1
  17. datahub/cli/specific/datacontract_cli.py +3 -2
  18. datahub/cli/specific/dataproduct_cli.py +10 -9
  19. datahub/cli/specific/dataset_cli.py +4 -3
  20. datahub/cli/specific/forms_cli.py +2 -1
  21. datahub/cli/specific/group_cli.py +2 -1
  22. datahub/cli/specific/structuredproperties_cli.py +4 -3
  23. datahub/cli/specific/user_cli.py +2 -1
  24. datahub/cli/state_cli.py +2 -1
  25. datahub/cli/timeline_cli.py +2 -1
  26. datahub/emitter/rest_emitter.py +120 -42
  27. datahub/entrypoints.py +2 -1
  28. datahub/ingestion/graph/client.py +16 -9
  29. datahub/ingestion/graph/config.py +13 -0
  30. datahub/ingestion/run/pipeline.py +3 -2
  31. datahub/ingestion/run/pipeline_config.py +1 -1
  32. datahub/ingestion/sink/datahub_rest.py +5 -6
  33. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  34. datahub/ingestion/source/ge_data_profiler.py +2 -1
  35. datahub/ingestion/source/metadata/lineage.py +2 -1
  36. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  37. datahub/integrations/assertion/common.py +3 -2
  38. datahub/sdk/main_client.py +2 -2
  39. datahub/secret/datahub_secret_store.py +2 -1
  40. datahub/telemetry/telemetry.py +2 -2
  41. datahub/upgrade/upgrade.py +10 -12
  42. datahub/utilities/server_config_util.py +378 -10
  43. {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
  44. {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
  45. {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0
datahub/cli/specific/dataproduct_cli.py CHANGED
@@ -20,6 +20,7 @@ from datahub.emitter.mce_builder import (
     validate_ownership_type,
 )
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass
 from datahub.specific.dataproduct import DataProductPatchBuilder
 from datahub.telemetry import telemetry
@@ -81,7 +82,7 @@ def mutate(file: Path, validate_assets: bool, external_url: str, upsert: bool) -

     config_dict = load_file(pathlib.Path(file))
     id = config_dict.get("id") if isinstance(config_dict, dict) else None
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         data_product: DataProduct = DataProduct.from_yaml(file, graph)
         external_url_override = (
             external_url
@@ -162,7 +163,7 @@ def upsert(file: Path, validate_assets: bool, external_url: str) -> None:
 def diff(file: Path, update: bool) -> None:
     """Diff a Data Product file with its twin in DataHub"""

-    with get_default_graph() as emitter:
+    with get_default_graph(ClientMode.CLI) as emitter:
         id: Optional[str] = None
         try:
             data_product_local: DataProduct = DataProduct.from_yaml(file, emitter)
@@ -216,7 +217,7 @@ def delete(urn: str, file: Path, hard: bool) -> None:
         raise click.Abort()

     graph: DataHubGraph
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         data_product_urn = (
             urn if urn.startswith("urn:li:dataProduct") else f"urn:li:dataProduct:{urn}"
         )
@@ -248,7 +249,7 @@ def get(urn: str, to_file: str) -> None:
     if not urn.startswith("urn:li:dataProduct:"):
         urn = f"urn:li:dataProduct:{urn}"

-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         if graph.exists(urn):
             dataproduct: DataProduct = DataProduct.from_datahub(graph=graph, id=urn)
             click.secho(
@@ -306,7 +307,7 @@ def set_description(urn: str, description: str, md_file: Path) -> None:

     dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
     dataproduct_patcher.set_description(description)
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         _abort_if_non_existent_urn(graph, urn, "set description")
         for mcp in dataproduct_patcher.build():
             graph.emit(mcp)
@@ -342,7 +343,7 @@ def add_owner(urn: str, owner: str, owner_type: str) -> None:
             owner=_get_owner_urn(owner), type=owner_type, typeUrn=owner_type_urn
         )
     )
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         _abort_if_non_existent_urn(graph, urn, "add owners")
         for mcp in dataproduct_patcher.build():
             graph.emit(mcp)
@@ -360,7 +361,7 @@ def remove_owner(urn: str, owner_urn: str) -> None:
         urn = f"urn:li:dataProduct:{urn}"
     dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
     dataproduct_patcher.remove_owner(owner=_get_owner_urn(owner_urn))
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         _abort_if_non_existent_urn(graph, urn, "remove owners")
         for mcp in dataproduct_patcher.build():
             click.echo(json.dumps(mcp.to_obj()))
@@ -382,7 +383,7 @@ def add_asset(urn: str, asset: str, validate_assets: bool) -> None:
         urn = f"urn:li:dataProduct:{urn}"
     dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
     dataproduct_patcher.add_asset(asset)
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         _abort_if_non_existent_urn(graph, urn, "add assets")
         if validate_assets:
             _abort_if_non_existent_urn(
@@ -409,7 +410,7 @@ def remove_asset(urn: str, asset: str, validate_assets: bool) -> None:
         urn = f"urn:li:dataProduct:{urn}"
     dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
     dataproduct_patcher.remove_asset(asset)
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         _abort_if_non_existent_urn(graph, urn, "remove assets")
         if validate_assets:
             _abort_if_non_existent_urn(
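
Note: every dataproduct command now tags its connection with ClientMode.CLI so the server can distinguish CLI traffic from SDK and ingestion traffic. A minimal sketch of the new call pattern, assuming a configured ~/.datahubenv and a reachable GMS (the URN is a placeholder):

    from datahub.ingestion.graph.client import get_default_graph
    from datahub.ingestion.graph.config import ClientMode

    with get_default_graph(ClientMode.CLI) as graph:
        # Same existence guard these commands use before emitting patches
        print(graph.exists("urn:li:dataProduct:example"))
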
datahub/cli/specific/dataset_cli.py CHANGED
@@ -12,6 +12,7 @@ from click_default_group import DefaultGroup
 from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
@@ -54,7 +55,7 @@ def get(urn: str, to_file: str) -> None:
     if not urn.startswith("urn:li:dataset:"):
         urn = f"urn:li:dataset:{urn}"

-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         if graph.exists(urn):
             dataset: Dataset = Dataset.from_datahub(graph=graph, urn=urn)
             click.secho(
@@ -82,7 +83,7 @@ def add_sibling(urn: str, sibling_urns: Tuple[str]) -> None:
     all_urns.add(urn)
     for sibling_urn in sibling_urns:
         all_urns.add(sibling_urn)
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         for _urn in all_urns:
             _emit_sibling(graph, urn, _urn, all_urns)

@@ -181,7 +182,7 @@ def sync(file: str, to_datahub: bool, dry_run: bool) -> None:
     dry_run_prefix = "[dry-run]: " if dry_run else ""  # prefix to use in messages

     failures: List[str] = []
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         datasets = Dataset.from_yaml(file)
         for dataset in datasets:
             assert (
datahub/cli/specific/forms_cli.py CHANGED
@@ -7,6 +7,7 @@ from click_default_group import DefaultGroup

 from datahub.api.entities.forms.forms import Forms
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade

@@ -40,7 +41,7 @@ def upsert(file: Path) -> None:
 @telemetry.with_telemetry()
 def get(urn: str, to_file: str) -> None:
     """Get form from DataHub"""
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         if graph.exists(urn):
             form: Forms = Forms.from_datahub(graph=graph, urn=urn)
             click.secho(
datahub/cli/specific/group_cli.py CHANGED
@@ -10,6 +10,7 @@ from datahub.api.entities.corpgroup.corpgroup import (
 )
 from datahub.cli.specific.file_loader import load_file
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade

@@ -40,7 +41,7 @@ def upsert(file: Path, override_editable: bool) -> None:

     config_dict = load_file(file)
     group_configs = config_dict if isinstance(config_dict, list) else [config_dict]
-    with get_default_graph() as emitter:
+    with get_default_graph(ClientMode.CLI) as emitter:
         for group_config in group_configs:
             try:
                 datahub_group = CorpGroup.parse_obj(group_config)
datahub/cli/specific/structuredproperties_cli.py CHANGED
@@ -11,6 +11,7 @@ from datahub.api.entities.structuredproperties.structuredproperties import (
     StructuredProperties,
 )
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
 from datahub.utilities.urns.urn import Urn
@@ -33,7 +34,7 @@ def properties() -> None:
 def upsert(file: Path) -> None:
     """Upsert structured properties in DataHub."""

-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         StructuredProperties.create(str(file), graph)


@@ -48,7 +49,7 @@ def get(urn: str, to_file: str) -> None:
     """Get structured properties from DataHub"""
     urn = Urn.make_structured_property_urn(urn)

-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         if graph.exists(urn):
             structuredproperties: StructuredProperties = (
                 StructuredProperties.from_datahub(graph=graph, urn=urn)
@@ -117,7 +118,7 @@ def list(details: bool, to_file: str) -> None:
         with open(file, "w") as fp:
             yaml.dump(serialized_objects, fp)

-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         if details:
             logger.info(
                 "Listing structured properties with details. Use --no-details for urns only"
datahub/cli/specific/user_cli.py CHANGED
@@ -8,6 +8,7 @@ from click_default_group import DefaultGroup
 from datahub.api.entities.corpuser.corpuser import CorpUser, CorpUserGenerationConfig
 from datahub.cli.specific.file_loader import load_file
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade

@@ -38,7 +39,7 @@ def upsert(file: Path, override_editable: bool) -> None:

     config_dict = load_file(pathlib.Path(file))
     user_configs = config_dict if isinstance(config_dict, list) else [config_dict]
-    with get_default_graph() as emitter:
+    with get_default_graph(ClientMode.CLI) as emitter:
         for user_config in user_configs:
             try:
                 datahub_user: CorpUser = CorpUser.parse_obj(user_config)
datahub/cli/state_cli.py CHANGED
@@ -5,6 +5,7 @@ import click
 from click_default_group import DefaultGroup

 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade

@@ -28,7 +29,7 @@ def inspect(pipeline_name: str, platform: str) -> None:
     Only works for state entity removal for now.
     """

-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.CLI)
    checkpoint = datahub_graph.get_latest_pipeline_checkpoint(pipeline_name, platform)
    if not checkpoint:
        click.secho("No ingestion state found.", fg="red")
datahub/cli/timeline_cli.py CHANGED
@@ -9,6 +9,7 @@ from requests import Response

 from datahub.emitter.mce_builder import dataset_urn_to_key, schema_field_urn_to_key
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
 from datahub.utilities.urns.urn import Urn
@@ -63,7 +64,7 @@ def get_timeline(
     diff: bool,
     graph: Optional[DataHubGraph] = None,
 ) -> Any:
-    client = graph if graph else get_default_graph()
+    client = graph if graph else get_default_graph(ClientMode.CLI)
     session = client._session
     host = client.config.server
     if urn.startswith("urn%3A"):
datahub/emitter/rest_emitter.py CHANGED
@@ -5,7 +5,6 @@ import json
 import logging
 import os
 import time
-import warnings
 from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timedelta
@@ -50,13 +49,17 @@ from datahub.emitter.response_helper import (
     extract_trace_data_from_mcps,
 )
 from datahub.emitter.serialization_helper import pre_json_transform
-from datahub.errors import APITracingWarning
 from datahub.ingestion.api.closeable import Closeable
+from datahub.ingestion.graph.config import (
+    DATAHUB_COMPONENT_ENV,
+    ClientMode,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
+from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature

 if TYPE_CHECKING:
     from datahub.ingestion.graph.client import DataHubGraph
@@ -79,6 +82,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(

 _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)

+_DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
+
 TRACE_PENDING_STATUS = "PENDING"
 TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
 TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
@@ -133,12 +138,24 @@ class RequestsSessionConfig(ConfigModel):
     ca_certificate_path: Optional[str] = None
     client_certificate_path: Optional[str] = None
     disable_ssl_verification: bool = False
+    client_mode: Optional[ClientMode] = _DEFAULT_CLIENT_MODE
+    datahub_component: Optional[str] = None

     def build_session(self) -> requests.Session:
         session = requests.Session()

-        if self.extra_headers:
-            session.headers.update(self.extra_headers)
+        user_agent = self._get_user_agent_string(session)
+
+        base_headers = {
+            "User-Agent": user_agent,
+            "X-DataHub-Client-Mode": self.client_mode.name
+            if self.client_mode
+            else _DEFAULT_CLIENT_MODE.name,
+            "X-DataHub-Py-Cli-Version": nice_version_name(),
+        }
+
+        headers = {**base_headers, **self.extra_headers}
+        session.headers.update(headers)

         if self.client_certificate_path:
             session.cert = self.client_certificate_path
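
Note: build_session() now stamps identification headers on every request instead of only forwarding extra_headers. A sketch of the resulting defaults, using the header names from the hunk above and assuming the config's remaining fields keep their defaults:

    from datahub.emitter.rest_emitter import RequestsSessionConfig
    from datahub.ingestion.graph.config import ClientMode

    session = RequestsSessionConfig(client_mode=ClientMode.CLI).build_session()
    print(session.headers["X-DataHub-Client-Mode"])     # "CLI"
    print(session.headers["X-DataHub-Py-Cli-Version"])  # e.g. "1.0.0.3rc11"
    print(session.headers["User-Agent"])                # "DataHub-Client/1.0 (cli; datahub; ...)"
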
@@ -186,6 +203,59 @@ class RequestsSessionConfig(ConfigModel):

         return session

+    @classmethod
+    def get_client_mode_from_session(
+        cls, session: requests.Session
+    ) -> Optional[ClientMode]:
+        """
+        Extract the ClientMode enum from a requests Session by checking the headers.
+
+        Args:
+            session: The requests.Session object to check
+
+        Returns:
+            The corresponding ClientMode enum value if found, None otherwise
+        """
+        # Check if the session has the X-DataHub-Client-Mode header
+        mode_str = session.headers.get("X-DataHub-Client-Mode")
+
+        if not mode_str:
+            return None
+
+        # Try to convert the string value to enum
+        try:
+            # First ensure we're working with a str value
+            if isinstance(mode_str, bytes):
+                mode_str = mode_str.decode("utf-8")
+
+            # Then find the matching enum value
+            for mode in ClientMode:
+                if mode.name == mode_str:
+                    return mode
+
+            # If we got here, no matching enum was found
+            return None
+        except Exception:
+            # Handle any other errors
+            return None
+
+    def _get_user_agent_string(self, session: requests.Session) -> str:
+        """Generate appropriate user agent string based on client mode"""
+        version = nice_version_name()
+        client_mode = self.client_mode if self.client_mode else _DEFAULT_CLIENT_MODE
+
+        if "User-Agent" in session.headers:
+            user_agent = session.headers["User-Agent"]
+            if isinstance(user_agent, bytes):
+                requests_user_agent = " " + user_agent.decode("utf-8")
+            else:
+                requests_user_agent = " " + user_agent
+        else:
+            requests_user_agent = ""
+
+        # 1.0 refers to the user agent string version
+        return f"DataHub-Client/1.0 ({client_mode.name.lower()}; {self.datahub_component if self.datahub_component else DATAHUB_COMPONENT_ENV}; {version}){requests_user_agent}"
+

 @dataclass
 class _Chunk:
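
Note: the classmethod added above makes the client mode recoverable from any session, which a short sketch can round-trip (only the classmethod and header name shown in the hunk are used):

    import requests
    from datahub.emitter.rest_emitter import RequestsSessionConfig

    session = requests.Session()
    session.headers["X-DataHub-Client-Mode"] = "INGESTION"
    mode = RequestsSessionConfig.get_client_mode_from_session(session)
    assert mode is not None and mode.name == "INGESTION"

The generated User-Agent follows the pattern DataHub-Client/1.0 (<mode>; <component>; <version>), e.g. DataHub-Client/1.0 (cli; datahub; 1.0.0.3rc11), with any pre-existing requests User-Agent appended.
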
@@ -211,8 +281,9 @@ class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
-    _openapi_ingestion: bool
+    _openapi_ingestion: Optional[bool]
     _default_trace_mode: bool
+    server_config: RestServiceConfig

     def __init__(
         self,
@@ -228,10 +299,10 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
-        openapi_ingestion: bool = (
-            DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
-        ),
+        openapi_ingestion: Optional[bool] = None,
         default_trace_mode: bool = False,
+        client_mode: Optional[ClientMode] = None,
+        datahub_component: Optional[str] = None,
     ):
         if not gms_server:
             raise ConfigurationError("gms server is required")
@@ -243,13 +314,10 @@ class DataHubRestEmitter(Closeable, Emitter):

         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
-        self.server_config: Dict[str, Any] = {}
-        self._openapi_ingestion = openapi_ingestion
         self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
-
-        logger.debug(
-            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        self._openapi_ingestion = (
+            openapi_ingestion  # Re-evaluated after test connection
         )

         if self._default_trace_mode:
@@ -257,7 +325,6 @@ class DataHubRestEmitter(Closeable, Emitter):

         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
-            "X-DataHub-Py-Cli-Version": nice_version_name(),
             "Content-Type": "application/json",
         }
         if token:
@@ -303,37 +370,54 @@ class DataHubRestEmitter(Closeable, Emitter):
             ca_certificate_path=ca_certificate_path,
             client_certificate_path=client_certificate_path,
             disable_ssl_verification=disable_ssl_verification,
+            client_mode=client_mode,
+            datahub_component=datahub_component,
         )

         self._session = self._session_config.build_session()

     def test_connection(self) -> None:
         url = f"{self._gms_server}/config"
-        response = self._session.get(url)
-        if response.status_code == 200:
-            config: dict = response.json()
-            if config.get("noCode") == "true":
-                self.server_config = config
-                return
+        try:
+            # Create a config instance with session and URL
+            config = RestServiceConfig(session=self._session, url=url)
+            # Attempt to load config, which will throw ConfigurationError if there's an issue
+            config.fetch_config()
+            self.server_config = config
+
+            # Determine OpenAPI mode
+            if self._openapi_ingestion is None:
+                # No constructor parameter
+                if (
+                    not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
+                    and self._session_config.client_mode == ClientMode.SDK
+                    and self.server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
+                ):
+                    # Enable if SDK client and no environment variable specified
+                    self._openapi_ingestion = True
+                else:
+                    # The system env is specifying the value
+                    self._openapi_ingestion = (
+                        DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+                    )

-        else:
-            raise ConfigurationError(
-                "You seem to have connected to the frontend service instead of the GMS endpoint. "
-                "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
-                "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
-            )
-        else:
             logger.debug(
-                f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
+                f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
             )
-            if response.status_code == 401:
-                message = f"Unable to connect to {url} - got an authentication error: {response.text}."
-            else:
-                message = f"Unable to connect to {url} with status_code: {response.status_code}."
-            message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
-            raise ConfigurationError(message)

-    def get_server_config(self) -> dict:
+            # Set default tracing for SDK
+            if (
+                self._session_config.client_mode == ClientMode.SDK
+                and self.server_config.supports_feature(ServiceFeature.API_TRACING)
+            ):
+                # Enable tracing if using SDK & server supported
+                self._default_trace_mode = True
+
+        except ConfigurationError as e:
+            # Just re-raise the exception
+            raise e
+
+    def get_server_config(self) -> RestServiceConfig:
         self.test_connection()
         return self.server_config
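Note: get_server_config() now returns the RestServiceConfig wrapper rather than a raw dict; callers that need the raw /config payload go through its raw_config attribute, as the graph client does further down in this diff. A sketch (the server URL is a placeholder and must be reachable):

    from datahub.emitter.rest_emitter import DataHubRestEmitter

    emitter = DataHubRestEmitter(gms_server="http://localhost:8080")
    server_config = emitter.get_server_config()         # RestServiceConfig, not dict
    base_url = server_config.raw_config.get("baseUrl")  # raw payload still reachable
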
@@ -485,7 +569,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
-        1. Grouping MCPs by their HTTP method and entity URL
+        1. Grouping MCPs by their HTTP method and entity URL and HTTP method
         2. Breaking down large batches into smaller chunks based on both:
            * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
            * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
@@ -751,12 +835,6 @@ class DataHubRestEmitter(Closeable, Emitter):
             trace_flag if trace_flag is not None else self._default_trace_mode
         )
         resolved_async_flag = async_flag if async_flag is not None else async_default
-        if resolved_trace_flag and not resolved_async_flag:
-            warnings.warn(
-                "API tracing is only available with async ingestion. For sync mode, API errors will be surfaced as exceptions.",
-                APITracingWarning,
-                stacklevel=3,
-            )
         return resolved_trace_flag and resolved_async_flag

     def __repr__(self) -> str:
datahub/entrypoints.py CHANGED
@@ -37,6 +37,7 @@ from datahub.cli.telemetry import telemetry as telemetry_cli
 from datahub.cli.timeline_cli import timeline
 from datahub.configuration.common import should_show_stack_trace
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.telemetry import telemetry
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.logging_manager import configure_logging
@@ -117,7 +118,7 @@ def version(include_server: bool = False) -> None:
     click.echo(f"Models: {model_version_name()}")
     click.echo(f"Python version: {sys.version}")
     if include_server:
-        server_config = get_default_graph().get_config()
+        server_config = get_default_graph(ClientMode.CLI).get_config()
         click.echo(f"Server config: {server_config}")

datahub/ingestion/graph/client.py CHANGED
@@ -34,14 +34,13 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
 from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.rest_emitter import (
-    DEFAULT_REST_EMITTER_ENDPOINT,
     DEFAULT_REST_TRACE_MODE,
     DatahubRestEmitter,
-    RestSinkEndpoint,
     RestTraceMode,
 )
 from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.graph.config import (
+    ClientMode,
     DatahubClientConfig as DatahubClientConfig,
 )
 from datahub.ingestion.graph.connections import (
@@ -158,13 +157,12 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             ca_certificate_path=self.config.ca_certificate_path,
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
-            openapi_ingestion=self.config.openapi_ingestion
-            if self.config.openapi_ingestion is not None
-            else (DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI),
+            openapi_ingestion=self.config.openapi_ingestion,
             default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
+            client_mode=config.client_mode,
+            datahub_component=config.datahub_component,
         )
-
-        self.server_id = _MISSING_SERVER_ID
+        self.server_id: str = _MISSING_SERVER_ID

     def test_connection(self) -> None:
         super().test_connection()
@@ -195,7 +193,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         if not self.server_config:
             self.test_connection()

-        base_url = self.server_config.get("baseUrl")
+        base_url = self.server_config.raw_config.get("baseUrl")
         if not base_url:
             raise ValueError("baseUrl not found in server config")
         return base_url
@@ -203,6 +201,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     @classmethod
     def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
         session_config = emitter._session_config
+
         if isinstance(session_config.timeout, tuple):
             # TODO: This is slightly lossy. Eventually, we want to modify the emitter
             # to accept a tuple for timeout_sec, and then we'll be able to remove this.
@@ -220,6 +219,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 disable_ssl_verification=session_config.disable_ssl_verification,
                 ca_certificate_path=session_config.ca_certificate_path,
                 client_certificate_path=session_config.client_certificate_path,
+                client_mode=session_config.client_mode,
+                datahub_component=session_config.datahub_component,
             )
         )

@@ -1954,8 +1955,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         super().close()


-def get_default_graph() -> DataHubGraph:
+@functools.lru_cache(maxsize=None)
+def get_default_graph(
+    client_mode: Optional[ClientMode] = None,
+    datahub_component: Optional[str] = None,
+) -> DataHubGraph:
     graph_config = config_utils.load_client_config()
+    graph_config.client_mode = client_mode
+    graph_config.datahub_component = datahub_component
     graph = DataHubGraph(graph_config)
     graph.test_connection()
     telemetry_instance.set_context(server=graph)
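
Note: because get_default_graph() is now wrapped in functools.lru_cache, repeated calls with the same arguments share a single DataHubGraph instance (and a single connection test). A sketch, assuming a configured ~/.datahubenv and a reachable server:

    from datahub.ingestion.graph.client import get_default_graph
    from datahub.ingestion.graph.config import ClientMode

    g1 = get_default_graph(ClientMode.CLI)
    g2 = get_default_graph(ClientMode.CLI)
    assert g1 is g2  # memoized; a different ClientMode would create a separate graph
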
datahub/ingestion/graph/config.py CHANGED
@@ -1,8 +1,19 @@
+import os
+from enum import Enum, auto
 from typing import Dict, List, Optional

 from datahub.configuration.common import ConfigModel


+class ClientMode(Enum):
+    INGESTION = auto()
+    CLI = auto()
+    SDK = auto()
+
+
+DATAHUB_COMPONENT_ENV: str = os.getenv("DATAHUB_COMPONENT", "datahub").lower()
+
+
 class DatahubClientConfig(ConfigModel):
     """Configuration class for holding connectivity to datahub gms"""

@@ -18,3 +29,5 @@ class DatahubClientConfig(ConfigModel):
     client_certificate_path: Optional[str] = None
     disable_ssl_verification: bool = False
     openapi_ingestion: Optional[bool] = None
+    client_mode: Optional[ClientMode] = None
+    datahub_component: Optional[str] = None
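
Note: the new config module is self-contained, so the enum and component default can be exercised directly; the values below come straight from the hunk above:

    from datahub.ingestion.graph.config import DATAHUB_COMPONENT_ENV, ClientMode

    print([mode.name for mode in ClientMode])  # ['INGESTION', 'CLI', 'SDK']
    print(DATAHUB_COMPONENT_ENV)  # "datahub" unless DATAHUB_COMPONENT is set
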
datahub/ingestion/run/pipeline.py CHANGED
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
 from datahub.ingestion.api.transform import Transformer
 from datahub.ingestion.extractor.extractor_registry import extractor_registry
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.reporting.reporting_provider_registry import (
     reporting_provider_registry,
 )
@@ -136,9 +137,8 @@ class CliReport(Report):


 def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
-    graph = get_default_graph()
+    graph = get_default_graph(ClientMode.INGESTION)
     sink_config = graph._make_rest_sink_config()
-
     return DatahubRestSink(ctx, sink_config)


@@ -175,6 +175,7 @@ class Pipeline:
         self.graph: Optional[DataHubGraph] = None
         with _add_init_error_context("connect to DataHub"):
             if self.config.datahub_api:
+                self.config.datahub_api.client_mode = ClientMode.INGESTION
                 self.graph = exit_stack.enter_context(
                     DataHubGraph(self.config.datahub_api)
                 )
datahub/ingestion/run/pipeline_config.py CHANGED
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
 from pydantic import Field, validator

 from datahub.configuration.common import ConfigModel, DynamicTypedConfig
-from datahub.ingestion.graph.client import DatahubClientConfig
+from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig

 logger = logging.getLogger(__name__)