acryl-datahub 1.0.0.3rc10__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2471 -2418
- {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +45 -45
- {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/forms/forms.py +2 -1
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +5 -4
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/emitter/rest_emitter.py +120 -42
- datahub/entrypoints.py +2 -1
- datahub/ingestion/graph/client.py +16 -9
- datahub/ingestion/graph/config.py +13 -0
- datahub/ingestion/run/pipeline.py +3 -2
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +2 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/integrations/assertion/common.py +3 -2
- datahub/sdk/main_client.py +2 -2
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/telemetry/telemetry.py +2 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/server_config_util.py +378 -10
- {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc10.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0
|
@@ -20,6 +20,7 @@ from datahub.emitter.mce_builder import (
|
|
|
20
20
|
validate_ownership_type,
|
|
21
21
|
)
|
|
22
22
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
23
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
23
24
|
from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass
|
|
24
25
|
from datahub.specific.dataproduct import DataProductPatchBuilder
|
|
25
26
|
from datahub.telemetry import telemetry
|
|
@@ -81,7 +82,7 @@ def mutate(file: Path, validate_assets: bool, external_url: str, upsert: bool) -
|
|
|
81
82
|
|
|
82
83
|
config_dict = load_file(pathlib.Path(file))
|
|
83
84
|
id = config_dict.get("id") if isinstance(config_dict, dict) else None
|
|
84
|
-
with get_default_graph() as graph:
|
|
85
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
85
86
|
data_product: DataProduct = DataProduct.from_yaml(file, graph)
|
|
86
87
|
external_url_override = (
|
|
87
88
|
external_url
|
|
@@ -162,7 +163,7 @@ def upsert(file: Path, validate_assets: bool, external_url: str) -> None:
|
|
|
162
163
|
def diff(file: Path, update: bool) -> None:
|
|
163
164
|
"""Diff a Data Product file with its twin in DataHub"""
|
|
164
165
|
|
|
165
|
-
with get_default_graph() as emitter:
|
|
166
|
+
with get_default_graph(ClientMode.CLI) as emitter:
|
|
166
167
|
id: Optional[str] = None
|
|
167
168
|
try:
|
|
168
169
|
data_product_local: DataProduct = DataProduct.from_yaml(file, emitter)
|
|
@@ -216,7 +217,7 @@ def delete(urn: str, file: Path, hard: bool) -> None:
|
|
|
216
217
|
raise click.Abort()
|
|
217
218
|
|
|
218
219
|
graph: DataHubGraph
|
|
219
|
-
with get_default_graph() as graph:
|
|
220
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
220
221
|
data_product_urn = (
|
|
221
222
|
urn if urn.startswith("urn:li:dataProduct") else f"urn:li:dataProduct:{urn}"
|
|
222
223
|
)
|
|
@@ -248,7 +249,7 @@ def get(urn: str, to_file: str) -> None:
|
|
|
248
249
|
if not urn.startswith("urn:li:dataProduct:"):
|
|
249
250
|
urn = f"urn:li:dataProduct:{urn}"
|
|
250
251
|
|
|
251
|
-
with get_default_graph() as graph:
|
|
252
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
252
253
|
if graph.exists(urn):
|
|
253
254
|
dataproduct: DataProduct = DataProduct.from_datahub(graph=graph, id=urn)
|
|
254
255
|
click.secho(
|
|
@@ -306,7 +307,7 @@ def set_description(urn: str, description: str, md_file: Path) -> None:
|
|
|
306
307
|
|
|
307
308
|
dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
|
|
308
309
|
dataproduct_patcher.set_description(description)
|
|
309
|
-
with get_default_graph() as graph:
|
|
310
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
310
311
|
_abort_if_non_existent_urn(graph, urn, "set description")
|
|
311
312
|
for mcp in dataproduct_patcher.build():
|
|
312
313
|
graph.emit(mcp)
|
|
@@ -342,7 +343,7 @@ def add_owner(urn: str, owner: str, owner_type: str) -> None:
|
|
|
342
343
|
owner=_get_owner_urn(owner), type=owner_type, typeUrn=owner_type_urn
|
|
343
344
|
)
|
|
344
345
|
)
|
|
345
|
-
with get_default_graph() as graph:
|
|
346
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
346
347
|
_abort_if_non_existent_urn(graph, urn, "add owners")
|
|
347
348
|
for mcp in dataproduct_patcher.build():
|
|
348
349
|
graph.emit(mcp)
|
|
@@ -360,7 +361,7 @@ def remove_owner(urn: str, owner_urn: str) -> None:
|
|
|
360
361
|
urn = f"urn:li:dataProduct:{urn}"
|
|
361
362
|
dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
|
|
362
363
|
dataproduct_patcher.remove_owner(owner=_get_owner_urn(owner_urn))
|
|
363
|
-
with get_default_graph() as graph:
|
|
364
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
364
365
|
_abort_if_non_existent_urn(graph, urn, "remove owners")
|
|
365
366
|
for mcp in dataproduct_patcher.build():
|
|
366
367
|
click.echo(json.dumps(mcp.to_obj()))
|
|
@@ -382,7 +383,7 @@ def add_asset(urn: str, asset: str, validate_assets: bool) -> None:
|
|
|
382
383
|
urn = f"urn:li:dataProduct:{urn}"
|
|
383
384
|
dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
|
|
384
385
|
dataproduct_patcher.add_asset(asset)
|
|
385
|
-
with get_default_graph() as graph:
|
|
386
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
386
387
|
_abort_if_non_existent_urn(graph, urn, "add assets")
|
|
387
388
|
if validate_assets:
|
|
388
389
|
_abort_if_non_existent_urn(
|
|
@@ -409,7 +410,7 @@ def remove_asset(urn: str, asset: str, validate_assets: bool) -> None:
|
|
|
409
410
|
urn = f"urn:li:dataProduct:{urn}"
|
|
410
411
|
dataproduct_patcher: DataProductPatchBuilder = DataProduct.get_patch_builder(urn)
|
|
411
412
|
dataproduct_patcher.remove_asset(asset)
|
|
412
|
-
with get_default_graph() as graph:
|
|
413
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
413
414
|
_abort_if_non_existent_urn(graph, urn, "remove assets")
|
|
414
415
|
if validate_assets:
|
|
415
416
|
_abort_if_non_existent_urn(
|
|
@@ -12,6 +12,7 @@ from click_default_group import DefaultGroup
|
|
|
12
12
|
from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
|
|
13
13
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
14
14
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
15
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
15
16
|
from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
|
|
16
17
|
from datahub.telemetry import telemetry
|
|
17
18
|
from datahub.upgrade import upgrade
|
|
@@ -54,7 +55,7 @@ def get(urn: str, to_file: str) -> None:
|
|
|
54
55
|
if not urn.startswith("urn:li:dataset:"):
|
|
55
56
|
urn = f"urn:li:dataset:{urn}"
|
|
56
57
|
|
|
57
|
-
with get_default_graph() as graph:
|
|
58
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
58
59
|
if graph.exists(urn):
|
|
59
60
|
dataset: Dataset = Dataset.from_datahub(graph=graph, urn=urn)
|
|
60
61
|
click.secho(
|
|
@@ -82,7 +83,7 @@ def add_sibling(urn: str, sibling_urns: Tuple[str]) -> None:
|
|
|
82
83
|
all_urns.add(urn)
|
|
83
84
|
for sibling_urn in sibling_urns:
|
|
84
85
|
all_urns.add(sibling_urn)
|
|
85
|
-
with get_default_graph() as graph:
|
|
86
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
86
87
|
for _urn in all_urns:
|
|
87
88
|
_emit_sibling(graph, urn, _urn, all_urns)
|
|
88
89
|
|
|
@@ -181,7 +182,7 @@ def sync(file: str, to_datahub: bool, dry_run: bool) -> None:
|
|
|
181
182
|
dry_run_prefix = "[dry-run]: " if dry_run else "" # prefix to use in messages
|
|
182
183
|
|
|
183
184
|
failures: List[str] = []
|
|
184
|
-
with get_default_graph() as graph:
|
|
185
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
185
186
|
datasets = Dataset.from_yaml(file)
|
|
186
187
|
for dataset in datasets:
|
|
187
188
|
assert (
|
|
@@ -7,6 +7,7 @@ from click_default_group import DefaultGroup
|
|
|
7
7
|
|
|
8
8
|
from datahub.api.entities.forms.forms import Forms
|
|
9
9
|
from datahub.ingestion.graph.client import get_default_graph
|
|
10
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
10
11
|
from datahub.telemetry import telemetry
|
|
11
12
|
from datahub.upgrade import upgrade
|
|
12
13
|
|
|
@@ -40,7 +41,7 @@ def upsert(file: Path) -> None:
|
|
|
40
41
|
@telemetry.with_telemetry()
|
|
41
42
|
def get(urn: str, to_file: str) -> None:
|
|
42
43
|
"""Get form from DataHub"""
|
|
43
|
-
with get_default_graph() as graph:
|
|
44
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
44
45
|
if graph.exists(urn):
|
|
45
46
|
form: Forms = Forms.from_datahub(graph=graph, urn=urn)
|
|
46
47
|
click.secho(
|
|
@@ -10,6 +10,7 @@ from datahub.api.entities.corpgroup.corpgroup import (
|
|
|
10
10
|
)
|
|
11
11
|
from datahub.cli.specific.file_loader import load_file
|
|
12
12
|
from datahub.ingestion.graph.client import get_default_graph
|
|
13
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
13
14
|
from datahub.telemetry import telemetry
|
|
14
15
|
from datahub.upgrade import upgrade
|
|
15
16
|
|
|
@@ -40,7 +41,7 @@ def upsert(file: Path, override_editable: bool) -> None:
|
|
|
40
41
|
|
|
41
42
|
config_dict = load_file(file)
|
|
42
43
|
group_configs = config_dict if isinstance(config_dict, list) else [config_dict]
|
|
43
|
-
with get_default_graph() as emitter:
|
|
44
|
+
with get_default_graph(ClientMode.CLI) as emitter:
|
|
44
45
|
for group_config in group_configs:
|
|
45
46
|
try:
|
|
46
47
|
datahub_group = CorpGroup.parse_obj(group_config)
|
|
@@ -11,6 +11,7 @@ from datahub.api.entities.structuredproperties.structuredproperties import (
|
|
|
11
11
|
StructuredProperties,
|
|
12
12
|
)
|
|
13
13
|
from datahub.ingestion.graph.client import get_default_graph
|
|
14
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
14
15
|
from datahub.telemetry import telemetry
|
|
15
16
|
from datahub.upgrade import upgrade
|
|
16
17
|
from datahub.utilities.urns.urn import Urn
|
|
@@ -33,7 +34,7 @@ def properties() -> None:
|
|
|
33
34
|
def upsert(file: Path) -> None:
|
|
34
35
|
"""Upsert structured properties in DataHub."""
|
|
35
36
|
|
|
36
|
-
with get_default_graph() as graph:
|
|
37
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
37
38
|
StructuredProperties.create(str(file), graph)
|
|
38
39
|
|
|
39
40
|
|
|
@@ -48,7 +49,7 @@ def get(urn: str, to_file: str) -> None:
|
|
|
48
49
|
"""Get structured properties from DataHub"""
|
|
49
50
|
urn = Urn.make_structured_property_urn(urn)
|
|
50
51
|
|
|
51
|
-
with get_default_graph() as graph:
|
|
52
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
52
53
|
if graph.exists(urn):
|
|
53
54
|
structuredproperties: StructuredProperties = (
|
|
54
55
|
StructuredProperties.from_datahub(graph=graph, urn=urn)
|
|
@@ -117,7 +118,7 @@ def list(details: bool, to_file: str) -> None:
|
|
|
117
118
|
with open(file, "w") as fp:
|
|
118
119
|
yaml.dump(serialized_objects, fp)
|
|
119
120
|
|
|
120
|
-
with get_default_graph() as graph:
|
|
121
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
121
122
|
if details:
|
|
122
123
|
logger.info(
|
|
123
124
|
"Listing structured properties with details. Use --no-details for urns only"
|
datahub/cli/specific/user_cli.py
CHANGED
|
@@ -8,6 +8,7 @@ from click_default_group import DefaultGroup
|
|
|
8
8
|
from datahub.api.entities.corpuser.corpuser import CorpUser, CorpUserGenerationConfig
|
|
9
9
|
from datahub.cli.specific.file_loader import load_file
|
|
10
10
|
from datahub.ingestion.graph.client import get_default_graph
|
|
11
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
11
12
|
from datahub.telemetry import telemetry
|
|
12
13
|
from datahub.upgrade import upgrade
|
|
13
14
|
|
|
@@ -38,7 +39,7 @@ def upsert(file: Path, override_editable: bool) -> None:
|
|
|
38
39
|
|
|
39
40
|
config_dict = load_file(pathlib.Path(file))
|
|
40
41
|
user_configs = config_dict if isinstance(config_dict, list) else [config_dict]
|
|
41
|
-
with get_default_graph() as emitter:
|
|
42
|
+
with get_default_graph(ClientMode.CLI) as emitter:
|
|
42
43
|
for user_config in user_configs:
|
|
43
44
|
try:
|
|
44
45
|
datahub_user: CorpUser = CorpUser.parse_obj(user_config)
|
datahub/cli/state_cli.py
CHANGED
|
@@ -5,6 +5,7 @@ import click
|
|
|
5
5
|
from click_default_group import DefaultGroup
|
|
6
6
|
|
|
7
7
|
from datahub.ingestion.graph.client import get_default_graph
|
|
8
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
8
9
|
from datahub.telemetry import telemetry
|
|
9
10
|
from datahub.upgrade import upgrade
|
|
10
11
|
|
|
@@ -28,7 +29,7 @@ def inspect(pipeline_name: str, platform: str) -> None:
|
|
|
28
29
|
Only works for state entity removal for now.
|
|
29
30
|
"""
|
|
30
31
|
|
|
31
|
-
datahub_graph = get_default_graph()
|
|
32
|
+
datahub_graph = get_default_graph(ClientMode.CLI)
|
|
32
33
|
checkpoint = datahub_graph.get_latest_pipeline_checkpoint(pipeline_name, platform)
|
|
33
34
|
if not checkpoint:
|
|
34
35
|
click.secho("No ingestion state found.", fg="red")
|
datahub/cli/timeline_cli.py
CHANGED
|
@@ -9,6 +9,7 @@ from requests import Response
|
|
|
9
9
|
|
|
10
10
|
from datahub.emitter.mce_builder import dataset_urn_to_key, schema_field_urn_to_key
|
|
11
11
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
12
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
12
13
|
from datahub.telemetry import telemetry
|
|
13
14
|
from datahub.upgrade import upgrade
|
|
14
15
|
from datahub.utilities.urns.urn import Urn
|
|
@@ -63,7 +64,7 @@ def get_timeline(
|
|
|
63
64
|
diff: bool,
|
|
64
65
|
graph: Optional[DataHubGraph] = None,
|
|
65
66
|
) -> Any:
|
|
66
|
-
client = graph if graph else get_default_graph()
|
|
67
|
+
client = graph if graph else get_default_graph(ClientMode.CLI)
|
|
67
68
|
session = client._session
|
|
68
69
|
host = client.config.server
|
|
69
70
|
if urn.startswith("urn%3A"):
|
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -5,7 +5,6 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
7
7
|
import time
|
|
8
|
-
import warnings
|
|
9
8
|
from collections import defaultdict
|
|
10
9
|
from dataclasses import dataclass
|
|
11
10
|
from datetime import datetime, timedelta
|
|
@@ -50,13 +49,17 @@ from datahub.emitter.response_helper import (
|
|
|
50
49
|
extract_trace_data_from_mcps,
|
|
51
50
|
)
|
|
52
51
|
from datahub.emitter.serialization_helper import pre_json_transform
|
|
53
|
-
from datahub.errors import APITracingWarning
|
|
54
52
|
from datahub.ingestion.api.closeable import Closeable
|
|
53
|
+
from datahub.ingestion.graph.config import (
|
|
54
|
+
DATAHUB_COMPONENT_ENV,
|
|
55
|
+
ClientMode,
|
|
56
|
+
)
|
|
55
57
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
|
|
56
58
|
MetadataChangeEvent,
|
|
57
59
|
MetadataChangeProposal,
|
|
58
60
|
)
|
|
59
61
|
from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
|
|
62
|
+
from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature
|
|
60
63
|
|
|
61
64
|
if TYPE_CHECKING:
|
|
62
65
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
@@ -79,6 +82,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
|
|
|
79
82
|
|
|
80
83
|
_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
|
|
81
84
|
|
|
85
|
+
_DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
|
|
86
|
+
|
|
82
87
|
TRACE_PENDING_STATUS = "PENDING"
|
|
83
88
|
TRACE_INITIAL_BACKOFF = 1.0 # Start with 1 second
|
|
84
89
|
TRACE_MAX_BACKOFF = 300.0 # Cap at 5 minutes
|
|
@@ -133,12 +138,24 @@ class RequestsSessionConfig(ConfigModel):
|
|
|
133
138
|
ca_certificate_path: Optional[str] = None
|
|
134
139
|
client_certificate_path: Optional[str] = None
|
|
135
140
|
disable_ssl_verification: bool = False
|
|
141
|
+
client_mode: Optional[ClientMode] = _DEFAULT_CLIENT_MODE
|
|
142
|
+
datahub_component: Optional[str] = None
|
|
136
143
|
|
|
137
144
|
def build_session(self) -> requests.Session:
|
|
138
145
|
session = requests.Session()
|
|
139
146
|
|
|
140
|
-
|
|
141
|
-
|
|
147
|
+
user_agent = self._get_user_agent_string(session)
|
|
148
|
+
|
|
149
|
+
base_headers = {
|
|
150
|
+
"User-Agent": user_agent,
|
|
151
|
+
"X-DataHub-Client-Mode": self.client_mode.name
|
|
152
|
+
if self.client_mode
|
|
153
|
+
else _DEFAULT_CLIENT_MODE.name,
|
|
154
|
+
"X-DataHub-Py-Cli-Version": nice_version_name(),
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
headers = {**base_headers, **self.extra_headers}
|
|
158
|
+
session.headers.update(headers)
|
|
142
159
|
|
|
143
160
|
if self.client_certificate_path:
|
|
144
161
|
session.cert = self.client_certificate_path
|
|
@@ -186,6 +203,59 @@ class RequestsSessionConfig(ConfigModel):
|
|
|
186
203
|
|
|
187
204
|
return session
|
|
188
205
|
|
|
206
|
+
@classmethod
|
|
207
|
+
def get_client_mode_from_session(
|
|
208
|
+
cls, session: requests.Session
|
|
209
|
+
) -> Optional[ClientMode]:
|
|
210
|
+
"""
|
|
211
|
+
Extract the ClientMode enum from a requests Session by checking the headers.
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
session: The requests.Session object to check
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
The corresponding ClientMode enum value if found, None otherwise
|
|
218
|
+
"""
|
|
219
|
+
# Check if the session has the X-DataHub-Client-Mode header
|
|
220
|
+
mode_str = session.headers.get("X-DataHub-Client-Mode")
|
|
221
|
+
|
|
222
|
+
if not mode_str:
|
|
223
|
+
return None
|
|
224
|
+
|
|
225
|
+
# Try to convert the string value to enum
|
|
226
|
+
try:
|
|
227
|
+
# First ensure we're working with a str value
|
|
228
|
+
if isinstance(mode_str, bytes):
|
|
229
|
+
mode_str = mode_str.decode("utf-8")
|
|
230
|
+
|
|
231
|
+
# Then find the matching enum value
|
|
232
|
+
for mode in ClientMode:
|
|
233
|
+
if mode.name == mode_str:
|
|
234
|
+
return mode
|
|
235
|
+
|
|
236
|
+
# If we got here, no matching enum was found
|
|
237
|
+
return None
|
|
238
|
+
except Exception:
|
|
239
|
+
# Handle any other errors
|
|
240
|
+
return None
|
|
241
|
+
|
|
242
|
+
def _get_user_agent_string(self, session: requests.Session) -> str:
|
|
243
|
+
"""Generate appropriate user agent string based on client mode"""
|
|
244
|
+
version = nice_version_name()
|
|
245
|
+
client_mode = self.client_mode if self.client_mode else _DEFAULT_CLIENT_MODE
|
|
246
|
+
|
|
247
|
+
if "User-Agent" in session.headers:
|
|
248
|
+
user_agent = session.headers["User-Agent"]
|
|
249
|
+
if isinstance(user_agent, bytes):
|
|
250
|
+
requests_user_agent = " " + user_agent.decode("utf-8")
|
|
251
|
+
else:
|
|
252
|
+
requests_user_agent = " " + user_agent
|
|
253
|
+
else:
|
|
254
|
+
requests_user_agent = ""
|
|
255
|
+
|
|
256
|
+
# 1.0 refers to the user agent string version
|
|
257
|
+
return f"DataHub-Client/1.0 ({client_mode.name.lower()}; {self.datahub_component if self.datahub_component else DATAHUB_COMPONENT_ENV}; {version}){requests_user_agent}"
|
|
258
|
+
|
|
189
259
|
|
|
190
260
|
@dataclass
|
|
191
261
|
class _Chunk:
|
|
@@ -211,8 +281,9 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
211
281
|
_gms_server: str
|
|
212
282
|
_token: Optional[str]
|
|
213
283
|
_session: requests.Session
|
|
214
|
-
_openapi_ingestion: bool
|
|
284
|
+
_openapi_ingestion: Optional[bool]
|
|
215
285
|
_default_trace_mode: bool
|
|
286
|
+
server_config: RestServiceConfig
|
|
216
287
|
|
|
217
288
|
def __init__(
|
|
218
289
|
self,
|
|
@@ -228,10 +299,10 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
228
299
|
ca_certificate_path: Optional[str] = None,
|
|
229
300
|
client_certificate_path: Optional[str] = None,
|
|
230
301
|
disable_ssl_verification: bool = False,
|
|
231
|
-
openapi_ingestion: bool = (
|
|
232
|
-
DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
|
|
233
|
-
),
|
|
302
|
+
openapi_ingestion: Optional[bool] = None,
|
|
234
303
|
default_trace_mode: bool = False,
|
|
304
|
+
client_mode: Optional[ClientMode] = None,
|
|
305
|
+
datahub_component: Optional[str] = None,
|
|
235
306
|
):
|
|
236
307
|
if not gms_server:
|
|
237
308
|
raise ConfigurationError("gms server is required")
|
|
@@ -243,13 +314,10 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
243
314
|
|
|
244
315
|
self._gms_server = fixup_gms_url(gms_server)
|
|
245
316
|
self._token = token
|
|
246
|
-
self.server_config: Dict[str, Any] = {}
|
|
247
|
-
self._openapi_ingestion = openapi_ingestion
|
|
248
317
|
self._default_trace_mode = default_trace_mode
|
|
249
318
|
self._session = requests.Session()
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
|
|
319
|
+
self._openapi_ingestion = (
|
|
320
|
+
openapi_ingestion # Re-evaluated after test connection
|
|
253
321
|
)
|
|
254
322
|
|
|
255
323
|
if self._default_trace_mode:
|
|
@@ -257,7 +325,6 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
257
325
|
|
|
258
326
|
headers = {
|
|
259
327
|
"X-RestLi-Protocol-Version": "2.0.0",
|
|
260
|
-
"X-DataHub-Py-Cli-Version": nice_version_name(),
|
|
261
328
|
"Content-Type": "application/json",
|
|
262
329
|
}
|
|
263
330
|
if token:
|
|
@@ -303,37 +370,54 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
303
370
|
ca_certificate_path=ca_certificate_path,
|
|
304
371
|
client_certificate_path=client_certificate_path,
|
|
305
372
|
disable_ssl_verification=disable_ssl_verification,
|
|
373
|
+
client_mode=client_mode,
|
|
374
|
+
datahub_component=datahub_component,
|
|
306
375
|
)
|
|
307
376
|
|
|
308
377
|
self._session = self._session_config.build_session()
|
|
309
378
|
|
|
310
379
|
def test_connection(self) -> None:
|
|
311
380
|
url = f"{self._gms_server}/config"
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
config
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
381
|
+
try:
|
|
382
|
+
# Create a config instance with session and URL
|
|
383
|
+
config = RestServiceConfig(session=self._session, url=url)
|
|
384
|
+
# Attempt to load config, which will throw ConfigurationError if there's an issue
|
|
385
|
+
config.fetch_config()
|
|
386
|
+
self.server_config = config
|
|
387
|
+
|
|
388
|
+
# Determine OpenAPI mode
|
|
389
|
+
if self._openapi_ingestion is None:
|
|
390
|
+
# No constructor parameter
|
|
391
|
+
if (
|
|
392
|
+
not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
|
|
393
|
+
and self._session_config.client_mode == ClientMode.SDK
|
|
394
|
+
and self.server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
|
|
395
|
+
):
|
|
396
|
+
# Enable if SDK client and no environment variable specified
|
|
397
|
+
self._openapi_ingestion = True
|
|
398
|
+
else:
|
|
399
|
+
# The system env is specifying the value
|
|
400
|
+
self._openapi_ingestion = (
|
|
401
|
+
DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
|
|
402
|
+
)
|
|
318
403
|
|
|
319
|
-
else:
|
|
320
|
-
raise ConfigurationError(
|
|
321
|
-
"You seem to have connected to the frontend service instead of the GMS endpoint. "
|
|
322
|
-
"The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
|
|
323
|
-
"For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
|
|
324
|
-
)
|
|
325
|
-
else:
|
|
326
404
|
logger.debug(
|
|
327
|
-
f"
|
|
405
|
+
f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
|
|
328
406
|
)
|
|
329
|
-
if response.status_code == 401:
|
|
330
|
-
message = f"Unable to connect to {url} - got an authentication error: {response.text}."
|
|
331
|
-
else:
|
|
332
|
-
message = f"Unable to connect to {url} with status_code: {response.status_code}."
|
|
333
|
-
message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
|
|
334
|
-
raise ConfigurationError(message)
|
|
335
407
|
|
|
336
|
-
|
|
408
|
+
# Set default tracing for SDK
|
|
409
|
+
if (
|
|
410
|
+
self._session_config.client_mode == ClientMode.SDK
|
|
411
|
+
and self.server_config.supports_feature(ServiceFeature.API_TRACING)
|
|
412
|
+
):
|
|
413
|
+
# Enable tracing if using SDK & server supported
|
|
414
|
+
self._default_trace_mode = True
|
|
415
|
+
|
|
416
|
+
except ConfigurationError as e:
|
|
417
|
+
# Just re-raise the exception
|
|
418
|
+
raise e
|
|
419
|
+
|
|
420
|
+
def get_server_config(self) -> RestServiceConfig:
|
|
337
421
|
self.test_connection()
|
|
338
422
|
return self.server_config
|
|
339
423
|
|
|
@@ -485,7 +569,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
485
569
|
trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
486
570
|
) -> int:
|
|
487
571
|
"""
|
|
488
|
-
1. Grouping MCPs by their HTTP method and entity URL
|
|
572
|
+
1. Grouping MCPs by their HTTP method and entity URL and HTTP method
|
|
489
573
|
2. Breaking down large batches into smaller chunks based on both:
|
|
490
574
|
* Total byte size (INGEST_MAX_PAYLOAD_BYTES)
|
|
491
575
|
* Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
|
|
@@ -751,12 +835,6 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
751
835
|
trace_flag if trace_flag is not None else self._default_trace_mode
|
|
752
836
|
)
|
|
753
837
|
resolved_async_flag = async_flag if async_flag is not None else async_default
|
|
754
|
-
if resolved_trace_flag and not resolved_async_flag:
|
|
755
|
-
warnings.warn(
|
|
756
|
-
"API tracing is only available with async ingestion. For sync mode, API errors will be surfaced as exceptions.",
|
|
757
|
-
APITracingWarning,
|
|
758
|
-
stacklevel=3,
|
|
759
|
-
)
|
|
760
838
|
return resolved_trace_flag and resolved_async_flag
|
|
761
839
|
|
|
762
840
|
def __repr__(self) -> str:
|
datahub/entrypoints.py
CHANGED
|
@@ -37,6 +37,7 @@ from datahub.cli.telemetry import telemetry as telemetry_cli
|
|
|
37
37
|
from datahub.cli.timeline_cli import timeline
|
|
38
38
|
from datahub.configuration.common import should_show_stack_trace
|
|
39
39
|
from datahub.ingestion.graph.client import get_default_graph
|
|
40
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
40
41
|
from datahub.telemetry import telemetry
|
|
41
42
|
from datahub.utilities._custom_package_loader import model_version_name
|
|
42
43
|
from datahub.utilities.logging_manager import configure_logging
|
|
@@ -117,7 +118,7 @@ def version(include_server: bool = False) -> None:
|
|
|
117
118
|
click.echo(f"Models: {model_version_name()}")
|
|
118
119
|
click.echo(f"Python version: {sys.version}")
|
|
119
120
|
if include_server:
|
|
120
|
-
server_config = get_default_graph().get_config()
|
|
121
|
+
server_config = get_default_graph(ClientMode.CLI).get_config()
|
|
121
122
|
click.echo(f"Server config: {server_config}")
|
|
122
123
|
|
|
123
124
|
|
|
@@ -34,14 +34,13 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
|
|
|
34
34
|
from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
|
|
35
35
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
36
36
|
from datahub.emitter.rest_emitter import (
|
|
37
|
-
DEFAULT_REST_EMITTER_ENDPOINT,
|
|
38
37
|
DEFAULT_REST_TRACE_MODE,
|
|
39
38
|
DatahubRestEmitter,
|
|
40
|
-
RestSinkEndpoint,
|
|
41
39
|
RestTraceMode,
|
|
42
40
|
)
|
|
43
41
|
from datahub.emitter.serialization_helper import post_json_transform
|
|
44
42
|
from datahub.ingestion.graph.config import (
|
|
43
|
+
ClientMode,
|
|
45
44
|
DatahubClientConfig as DatahubClientConfig,
|
|
46
45
|
)
|
|
47
46
|
from datahub.ingestion.graph.connections import (
|
|
@@ -158,13 +157,12 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
158
157
|
ca_certificate_path=self.config.ca_certificate_path,
|
|
159
158
|
client_certificate_path=self.config.client_certificate_path,
|
|
160
159
|
disable_ssl_verification=self.config.disable_ssl_verification,
|
|
161
|
-
openapi_ingestion=self.config.openapi_ingestion
|
|
162
|
-
if self.config.openapi_ingestion is not None
|
|
163
|
-
else (DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI),
|
|
160
|
+
openapi_ingestion=self.config.openapi_ingestion,
|
|
164
161
|
default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
|
|
162
|
+
client_mode=config.client_mode,
|
|
163
|
+
datahub_component=config.datahub_component,
|
|
165
164
|
)
|
|
166
|
-
|
|
167
|
-
self.server_id = _MISSING_SERVER_ID
|
|
165
|
+
self.server_id: str = _MISSING_SERVER_ID
|
|
168
166
|
|
|
169
167
|
def test_connection(self) -> None:
|
|
170
168
|
super().test_connection()
|
|
@@ -195,7 +193,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
195
193
|
if not self.server_config:
|
|
196
194
|
self.test_connection()
|
|
197
195
|
|
|
198
|
-
base_url = self.server_config.get("baseUrl")
|
|
196
|
+
base_url = self.server_config.raw_config.get("baseUrl")
|
|
199
197
|
if not base_url:
|
|
200
198
|
raise ValueError("baseUrl not found in server config")
|
|
201
199
|
return base_url
|
|
@@ -203,6 +201,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
203
201
|
@classmethod
|
|
204
202
|
def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
|
|
205
203
|
session_config = emitter._session_config
|
|
204
|
+
|
|
206
205
|
if isinstance(session_config.timeout, tuple):
|
|
207
206
|
# TODO: This is slightly lossy. Eventually, we want to modify the emitter
|
|
208
207
|
# to accept a tuple for timeout_sec, and then we'll be able to remove this.
|
|
@@ -220,6 +219,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
220
219
|
disable_ssl_verification=session_config.disable_ssl_verification,
|
|
221
220
|
ca_certificate_path=session_config.ca_certificate_path,
|
|
222
221
|
client_certificate_path=session_config.client_certificate_path,
|
|
222
|
+
client_mode=session_config.client_mode,
|
|
223
|
+
datahub_component=session_config.datahub_component,
|
|
223
224
|
)
|
|
224
225
|
)
|
|
225
226
|
|
|
@@ -1954,8 +1955,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
1954
1955
|
super().close()
|
|
1955
1956
|
|
|
1956
1957
|
|
|
1957
|
-
|
|
1958
|
+
@functools.lru_cache(maxsize=None)
|
|
1959
|
+
def get_default_graph(
|
|
1960
|
+
client_mode: Optional[ClientMode] = None,
|
|
1961
|
+
datahub_component: Optional[str] = None,
|
|
1962
|
+
) -> DataHubGraph:
|
|
1958
1963
|
graph_config = config_utils.load_client_config()
|
|
1964
|
+
graph_config.client_mode = client_mode
|
|
1965
|
+
graph_config.datahub_component = datahub_component
|
|
1959
1966
|
graph = DataHubGraph(graph_config)
|
|
1960
1967
|
graph.test_connection()
|
|
1961
1968
|
telemetry_instance.set_context(server=graph)
|
|
@@ -1,8 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from enum import Enum, auto
|
|
1
3
|
from typing import Dict, List, Optional
|
|
2
4
|
|
|
3
5
|
from datahub.configuration.common import ConfigModel
|
|
4
6
|
|
|
5
7
|
|
|
8
|
+
class ClientMode(Enum):
|
|
9
|
+
INGESTION = auto()
|
|
10
|
+
CLI = auto()
|
|
11
|
+
SDK = auto()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
DATAHUB_COMPONENT_ENV: str = os.getenv("DATAHUB_COMPONENT", "datahub").lower()
|
|
15
|
+
|
|
16
|
+
|
|
6
17
|
class DatahubClientConfig(ConfigModel):
|
|
7
18
|
"""Configuration class for holding connectivity to datahub gms"""
|
|
8
19
|
|
|
@@ -18,3 +29,5 @@ class DatahubClientConfig(ConfigModel):
|
|
|
18
29
|
client_certificate_path: Optional[str] = None
|
|
19
30
|
disable_ssl_verification: bool = False
|
|
20
31
|
openapi_ingestion: Optional[bool] = None
|
|
32
|
+
client_mode: Optional[ClientMode] = None
|
|
33
|
+
datahub_component: Optional[str] = None
|
|
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
|
|
|
31
31
|
from datahub.ingestion.api.transform import Transformer
|
|
32
32
|
from datahub.ingestion.extractor.extractor_registry import extractor_registry
|
|
33
33
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
34
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
34
35
|
from datahub.ingestion.reporting.reporting_provider_registry import (
|
|
35
36
|
reporting_provider_registry,
|
|
36
37
|
)
|
|
@@ -136,9 +137,8 @@ class CliReport(Report):
|
|
|
136
137
|
|
|
137
138
|
|
|
138
139
|
def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
|
|
139
|
-
graph = get_default_graph()
|
|
140
|
+
graph = get_default_graph(ClientMode.INGESTION)
|
|
140
141
|
sink_config = graph._make_rest_sink_config()
|
|
141
|
-
|
|
142
142
|
return DatahubRestSink(ctx, sink_config)
|
|
143
143
|
|
|
144
144
|
|
|
@@ -175,6 +175,7 @@ class Pipeline:
|
|
|
175
175
|
self.graph: Optional[DataHubGraph] = None
|
|
176
176
|
with _add_init_error_context("connect to DataHub"):
|
|
177
177
|
if self.config.datahub_api:
|
|
178
|
+
self.config.datahub_api.client_mode = ClientMode.INGESTION
|
|
178
179
|
self.graph = exit_stack.enter_context(
|
|
179
180
|
DataHubGraph(self.config.datahub_api)
|
|
180
181
|
)
|
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
|
|
|
7
7
|
from pydantic import Field, validator
|
|
8
8
|
|
|
9
9
|
from datahub.configuration.common import ConfigModel, DynamicTypedConfig
|
|
10
|
-
from datahub.ingestion.graph.client import DatahubClientConfig
|
|
10
|
+
from datahub.ingestion.graph.config import DatahubClientConfig
|
|
11
11
|
from datahub.ingestion.sink.file import FileSinkConfig
|
|
12
12
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|