acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2524 -2471
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +87 -87
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +16 -7
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/run/pipeline.py +3 -2
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/ge_data_profiler.py +2 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
- datahub/metadata/_urns/urn_defs.py +1786 -1786
- datahub/metadata/schema.avsc +17364 -16988
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/main_client.py +2 -2
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +378 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -5,7 +5,6 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
7
7
|
import time
|
|
8
|
-
import warnings
|
|
9
8
|
from collections import defaultdict
|
|
10
9
|
from dataclasses import dataclass
|
|
11
10
|
from datetime import datetime, timedelta
|
|
@@ -41,23 +40,26 @@ from datahub.configuration.common import (
|
|
|
41
40
|
TraceTimeoutError,
|
|
42
41
|
TraceValidationError,
|
|
43
42
|
)
|
|
44
|
-
from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
|
|
45
43
|
from datahub.emitter.generic_emitter import Emitter
|
|
46
44
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
47
|
-
from datahub.emitter.request_helper import make_curl_command
|
|
45
|
+
from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
|
|
48
46
|
from datahub.emitter.response_helper import (
|
|
49
47
|
TraceData,
|
|
50
48
|
extract_trace_data,
|
|
51
49
|
extract_trace_data_from_mcps,
|
|
52
50
|
)
|
|
53
51
|
from datahub.emitter.serialization_helper import pre_json_transform
|
|
54
|
-
from datahub.errors import APITracingWarning
|
|
55
52
|
from datahub.ingestion.api.closeable import Closeable
|
|
53
|
+
from datahub.ingestion.graph.config import (
|
|
54
|
+
DATAHUB_COMPONENT_ENV,
|
|
55
|
+
ClientMode,
|
|
56
|
+
)
|
|
56
57
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
|
|
57
58
|
MetadataChangeEvent,
|
|
58
59
|
MetadataChangeProposal,
|
|
59
60
|
)
|
|
60
61
|
from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
|
|
62
|
+
from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature
|
|
61
63
|
|
|
62
64
|
if TYPE_CHECKING:
|
|
63
65
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
@@ -80,6 +82,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
|
|
|
80
82
|
|
|
81
83
|
_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
|
|
82
84
|
|
|
85
|
+
_DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
|
|
86
|
+
|
|
83
87
|
TRACE_PENDING_STATUS = "PENDING"
|
|
84
88
|
TRACE_INITIAL_BACKOFF = 1.0 # Start with 1 second
|
|
85
89
|
TRACE_MAX_BACKOFF = 300.0 # Cap at 5 minutes
|
|
@@ -134,12 +138,24 @@ class RequestsSessionConfig(ConfigModel):
|
|
|
134
138
|
ca_certificate_path: Optional[str] = None
|
|
135
139
|
client_certificate_path: Optional[str] = None
|
|
136
140
|
disable_ssl_verification: bool = False
|
|
141
|
+
client_mode: Optional[ClientMode] = _DEFAULT_CLIENT_MODE
|
|
142
|
+
datahub_component: Optional[str] = None
|
|
137
143
|
|
|
138
144
|
def build_session(self) -> requests.Session:
|
|
139
145
|
session = requests.Session()
|
|
140
146
|
|
|
141
|
-
|
|
142
|
-
|
|
147
|
+
user_agent = self._get_user_agent_string(session)
|
|
148
|
+
|
|
149
|
+
base_headers = {
|
|
150
|
+
"User-Agent": user_agent,
|
|
151
|
+
"X-DataHub-Client-Mode": self.client_mode.name
|
|
152
|
+
if self.client_mode
|
|
153
|
+
else _DEFAULT_CLIENT_MODE.name,
|
|
154
|
+
"X-DataHub-Py-Cli-Version": nice_version_name(),
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
headers = {**base_headers, **self.extra_headers}
|
|
158
|
+
session.headers.update(headers)
|
|
143
159
|
|
|
144
160
|
if self.client_certificate_path:
|
|
145
161
|
session.cert = self.client_certificate_path
|
|
@@ -187,6 +203,59 @@ class RequestsSessionConfig(ConfigModel):
|
|
|
187
203
|
|
|
188
204
|
return session
|
|
189
205
|
|
|
206
|
+
@classmethod
|
|
207
|
+
def get_client_mode_from_session(
|
|
208
|
+
cls, session: requests.Session
|
|
209
|
+
) -> Optional[ClientMode]:
|
|
210
|
+
"""
|
|
211
|
+
Extract the ClientMode enum from a requests Session by checking the headers.
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
session: The requests.Session object to check
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
The corresponding ClientMode enum value if found, None otherwise
|
|
218
|
+
"""
|
|
219
|
+
# Check if the session has the X-DataHub-Client-Mode header
|
|
220
|
+
mode_str = session.headers.get("X-DataHub-Client-Mode")
|
|
221
|
+
|
|
222
|
+
if not mode_str:
|
|
223
|
+
return None
|
|
224
|
+
|
|
225
|
+
# Try to convert the string value to enum
|
|
226
|
+
try:
|
|
227
|
+
# First ensure we're working with a str value
|
|
228
|
+
if isinstance(mode_str, bytes):
|
|
229
|
+
mode_str = mode_str.decode("utf-8")
|
|
230
|
+
|
|
231
|
+
# Then find the matching enum value
|
|
232
|
+
for mode in ClientMode:
|
|
233
|
+
if mode.name == mode_str:
|
|
234
|
+
return mode
|
|
235
|
+
|
|
236
|
+
# If we got here, no matching enum was found
|
|
237
|
+
return None
|
|
238
|
+
except Exception:
|
|
239
|
+
# Handle any other errors
|
|
240
|
+
return None
|
|
241
|
+
|
|
242
|
+
def _get_user_agent_string(self, session: requests.Session) -> str:
|
|
243
|
+
"""Generate appropriate user agent string based on client mode"""
|
|
244
|
+
version = nice_version_name()
|
|
245
|
+
client_mode = self.client_mode if self.client_mode else _DEFAULT_CLIENT_MODE
|
|
246
|
+
|
|
247
|
+
if "User-Agent" in session.headers:
|
|
248
|
+
user_agent = session.headers["User-Agent"]
|
|
249
|
+
if isinstance(user_agent, bytes):
|
|
250
|
+
requests_user_agent = " " + user_agent.decode("utf-8")
|
|
251
|
+
else:
|
|
252
|
+
requests_user_agent = " " + user_agent
|
|
253
|
+
else:
|
|
254
|
+
requests_user_agent = ""
|
|
255
|
+
|
|
256
|
+
# 1.0 refers to the user agent string version
|
|
257
|
+
return f"DataHub-Client/1.0 ({client_mode.name.lower()}; {self.datahub_component if self.datahub_component else DATAHUB_COMPONENT_ENV}; {version}){requests_user_agent}"
|
|
258
|
+
|
|
190
259
|
|
|
191
260
|
@dataclass
|
|
192
261
|
class _Chunk:
|
|
@@ -212,8 +281,9 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
212
281
|
_gms_server: str
|
|
213
282
|
_token: Optional[str]
|
|
214
283
|
_session: requests.Session
|
|
215
|
-
_openapi_ingestion: bool
|
|
284
|
+
_openapi_ingestion: Optional[bool]
|
|
216
285
|
_default_trace_mode: bool
|
|
286
|
+
server_config: RestServiceConfig
|
|
217
287
|
|
|
218
288
|
def __init__(
|
|
219
289
|
self,
|
|
@@ -229,10 +299,10 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
229
299
|
ca_certificate_path: Optional[str] = None,
|
|
230
300
|
client_certificate_path: Optional[str] = None,
|
|
231
301
|
disable_ssl_verification: bool = False,
|
|
232
|
-
openapi_ingestion: bool =
|
|
233
|
-
DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
|
|
234
|
-
),
|
|
302
|
+
openapi_ingestion: Optional[bool] = None,
|
|
235
303
|
default_trace_mode: bool = False,
|
|
304
|
+
client_mode: Optional[ClientMode] = None,
|
|
305
|
+
datahub_component: Optional[str] = None,
|
|
236
306
|
):
|
|
237
307
|
if not gms_server:
|
|
238
308
|
raise ConfigurationError("gms server is required")
|
|
@@ -244,13 +314,10 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
244
314
|
|
|
245
315
|
self._gms_server = fixup_gms_url(gms_server)
|
|
246
316
|
self._token = token
|
|
247
|
-
self.server_config: Dict[str, Any] = {}
|
|
248
|
-
self._openapi_ingestion = openapi_ingestion
|
|
249
317
|
self._default_trace_mode = default_trace_mode
|
|
250
318
|
self._session = requests.Session()
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
|
|
319
|
+
self._openapi_ingestion = (
|
|
320
|
+
openapi_ingestion # Re-evaluated after test connection
|
|
254
321
|
)
|
|
255
322
|
|
|
256
323
|
if self._default_trace_mode:
|
|
@@ -258,7 +325,6 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
258
325
|
|
|
259
326
|
headers = {
|
|
260
327
|
"X-RestLi-Protocol-Version": "2.0.0",
|
|
261
|
-
"X-DataHub-Py-Cli-Version": nice_version_name(),
|
|
262
328
|
"Content-Type": "application/json",
|
|
263
329
|
}
|
|
264
330
|
if token:
|
|
@@ -304,37 +370,54 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
304
370
|
ca_certificate_path=ca_certificate_path,
|
|
305
371
|
client_certificate_path=client_certificate_path,
|
|
306
372
|
disable_ssl_verification=disable_ssl_verification,
|
|
373
|
+
client_mode=client_mode,
|
|
374
|
+
datahub_component=datahub_component,
|
|
307
375
|
)
|
|
308
376
|
|
|
309
377
|
self._session = self._session_config.build_session()
|
|
310
378
|
|
|
311
379
|
def test_connection(self) -> None:
|
|
312
380
|
url = f"{self._gms_server}/config"
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
config
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
381
|
+
try:
|
|
382
|
+
# Create a config instance with session and URL
|
|
383
|
+
config = RestServiceConfig(session=self._session, url=url)
|
|
384
|
+
# Attempt to load config, which will throw ConfigurationError if there's an issue
|
|
385
|
+
config.fetch_config()
|
|
386
|
+
self.server_config = config
|
|
387
|
+
|
|
388
|
+
# Determine OpenAPI mode
|
|
389
|
+
if self._openapi_ingestion is None:
|
|
390
|
+
# No constructor parameter
|
|
391
|
+
if (
|
|
392
|
+
not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
|
|
393
|
+
and self._session_config.client_mode == ClientMode.SDK
|
|
394
|
+
and self.server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
|
|
395
|
+
):
|
|
396
|
+
# Enable if SDK client and no environment variable specified
|
|
397
|
+
self._openapi_ingestion = True
|
|
398
|
+
else:
|
|
399
|
+
# Otherwise (env variable set, non-SDK client, or server lacks OpenAPI SDK support) fall back to DEFAULT_REST_EMITTER_ENDPOINT
|
|
400
|
+
self._openapi_ingestion = (
|
|
401
|
+
DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
|
|
402
|
+
)
|
|
319
403
|
|
|
320
|
-
else:
|
|
321
|
-
raise ConfigurationError(
|
|
322
|
-
"You seem to have connected to the frontend service instead of the GMS endpoint. "
|
|
323
|
-
"The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
|
|
324
|
-
"For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
|
|
325
|
-
)
|
|
326
|
-
else:
|
|
327
404
|
logger.debug(
|
|
328
|
-
f"
|
|
405
|
+
f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
|
|
329
406
|
)
|
|
330
|
-
if response.status_code == 401:
|
|
331
|
-
message = f"Unable to connect to {url} - got an authentication error: {response.text}."
|
|
332
|
-
else:
|
|
333
|
-
message = f"Unable to connect to {url} with status_code: {response.status_code}."
|
|
334
|
-
message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
|
|
335
|
-
raise ConfigurationError(message)
|
|
336
407
|
|
|
337
|
-
|
|
408
|
+
# Set default tracing for SDK
|
|
409
|
+
if (
|
|
410
|
+
self._session_config.client_mode == ClientMode.SDK
|
|
411
|
+
and self.server_config.supports_feature(ServiceFeature.API_TRACING)
|
|
412
|
+
):
|
|
413
|
+
# Enable tracing if using SDK & server supported
|
|
414
|
+
self._default_trace_mode = True
|
|
415
|
+
|
|
416
|
+
except ConfigurationError as e:
|
|
417
|
+
# Just re-raise the exception
|
|
418
|
+
raise e
|
|
419
|
+
|
|
420
|
+
def get_server_config(self) -> RestServiceConfig:
|
|
338
421
|
self.test_connection()
|
|
339
422
|
return self.server_config
|
|
340
423
|
|
|
@@ -348,43 +431,24 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
348
431
|
mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
|
|
349
432
|
async_flag: Optional[bool] = None,
|
|
350
433
|
async_default: bool = False,
|
|
351
|
-
) -> Optional[
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
async_flag if async_flag is not None else async_default
|
|
355
|
-
)
|
|
356
|
-
url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
|
|
434
|
+
) -> Optional[OpenApiRequest]:
|
|
435
|
+
"""
|
|
436
|
+
Convert a MetadataChangeProposal to an OpenAPI request format.
|
|
357
437
|
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
aspect_value = pre_json_transform(obj)
|
|
373
|
-
return (
|
|
374
|
-
url,
|
|
375
|
-
[
|
|
376
|
-
{
|
|
377
|
-
"urn": mcp.entityUrn,
|
|
378
|
-
mcp.aspectName: {
|
|
379
|
-
"value": aspect_value,
|
|
380
|
-
"systemMetadata": mcp.systemMetadata.to_obj()
|
|
381
|
-
if mcp.systemMetadata
|
|
382
|
-
else None,
|
|
383
|
-
},
|
|
384
|
-
}
|
|
385
|
-
],
|
|
386
|
-
)
|
|
387
|
-
return None
|
|
438
|
+
Args:
|
|
439
|
+
mcp: The metadata change proposal
|
|
440
|
+
async_flag: Optional flag to override async behavior
|
|
441
|
+
async_default: Default async behavior if not specified
|
|
442
|
+
|
|
443
|
+
Returns:
|
|
444
|
+
An OpenApiRequest object or None if the MCP doesn't have required fields
|
|
445
|
+
"""
|
|
446
|
+
return OpenApiRequest.from_mcp(
|
|
447
|
+
mcp=mcp,
|
|
448
|
+
gms_server=self._gms_server,
|
|
449
|
+
async_flag=async_flag,
|
|
450
|
+
async_default=async_default,
|
|
451
|
+
)
|
|
388
452
|
|
|
389
453
|
def emit(
|
|
390
454
|
self,
|
|
@@ -448,7 +512,9 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
448
512
|
if self._openapi_ingestion:
|
|
449
513
|
request = self._to_openapi_request(mcp, async_flag, async_default=False)
|
|
450
514
|
if request:
|
|
451
|
-
response = self._emit_generic(
|
|
515
|
+
response = self._emit_generic(
|
|
516
|
+
request.url, payload=request.payload, method=request.method
|
|
517
|
+
)
|
|
452
518
|
|
|
453
519
|
if self._should_trace(async_flag, trace_flag):
|
|
454
520
|
trace_data = extract_trace_data(response) if response else None
|
|
@@ -503,31 +569,36 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
503
569
|
trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
504
570
|
) -> int:
|
|
505
571
|
"""
|
|
506
|
-
1. Grouping MCPs by their entity URL
|
|
572
|
+
1. Grouping MCPs by their HTTP method and entity URL
|
|
507
573
|
2. Breaking down large batches into smaller chunks based on both:
|
|
508
574
|
* Total byte size (INGEST_MAX_PAYLOAD_BYTES)
|
|
509
575
|
* Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
|
|
510
576
|
|
|
511
577
|
The Chunk class encapsulates both the items and their byte size tracking
|
|
512
|
-
Serializing the items only once with json.dumps(request
|
|
578
|
+
Serializing each item only once with json.dumps(request.payload[0]) and reusing that
|
|
513
579
|
The chunking logic handles edge cases (always accepting at least one item per chunk)
|
|
514
580
|
The joining logic is efficient with a simple string concatenation
|
|
515
581
|
|
|
516
582
|
:param mcps: metadata change proposals to transmit
|
|
517
583
|
:param async_flag: the mode
|
|
584
|
+
:param trace_flag: whether to trace the requests
|
|
585
|
+
:param trace_timeout: timeout for tracing
|
|
518
586
|
:return: number of requests
|
|
519
587
|
"""
|
|
520
|
-
#
|
|
521
|
-
batches: Dict[str, List[_Chunk]] = defaultdict(
|
|
588
|
+
# Group by entity URL and HTTP method
|
|
589
|
+
batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
|
|
522
590
|
lambda: [_Chunk(items=[])]
|
|
523
591
|
) # Initialize with one empty Chunk
|
|
524
592
|
|
|
525
593
|
for mcp in mcps:
|
|
526
594
|
request = self._to_openapi_request(mcp, async_flag, async_default=True)
|
|
527
595
|
if request:
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
596
|
+
# Create a composite key with both method and URL
|
|
597
|
+
key = (request.method, request.url)
|
|
598
|
+
current_chunk = batches[key][-1] # Get the last chunk
|
|
599
|
+
|
|
600
|
+
# Only serialize once - we're serializing a single payload item
|
|
601
|
+
serialized_item = json.dumps(request.payload[0])
|
|
531
602
|
item_bytes = len(serialized_item.encode())
|
|
532
603
|
|
|
533
604
|
# If adding this item would exceed max_bytes, create a new chunk
|
|
@@ -537,15 +608,17 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
537
608
|
or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
|
|
538
609
|
):
|
|
539
610
|
new_chunk = _Chunk(items=[])
|
|
540
|
-
batches[
|
|
611
|
+
batches[key].append(new_chunk)
|
|
541
612
|
current_chunk = new_chunk
|
|
542
613
|
|
|
543
614
|
current_chunk.add_item(serialized_item)
|
|
544
615
|
|
|
545
616
|
responses = []
|
|
546
|
-
for url, chunks in batches.items():
|
|
617
|
+
for (method, url), chunks in batches.items():
|
|
547
618
|
for chunk in chunks:
|
|
548
|
-
response = self._emit_generic(
|
|
619
|
+
response = self._emit_generic(
|
|
620
|
+
url, payload=_Chunk.join(chunk), method=method
|
|
621
|
+
)
|
|
549
622
|
responses.append(response)
|
|
550
623
|
|
|
551
624
|
if self._should_trace(async_flag, trace_flag, async_default=True):
|
|
@@ -618,11 +691,13 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
618
691
|
payload = json.dumps(snapshot)
|
|
619
692
|
self._emit_generic(url, payload)
|
|
620
693
|
|
|
621
|
-
def _emit_generic(
|
|
694
|
+
def _emit_generic(
|
|
695
|
+
self, url: str, payload: Union[str, Any], method: str = "POST"
|
|
696
|
+
) -> requests.Response:
|
|
622
697
|
if not isinstance(payload, str):
|
|
623
698
|
payload = json.dumps(payload)
|
|
624
699
|
|
|
625
|
-
curl_command = make_curl_command(self._session,
|
|
700
|
+
curl_command = make_curl_command(self._session, method, url, payload)
|
|
626
701
|
payload_size = len(payload)
|
|
627
702
|
if payload_size > INGEST_MAX_PAYLOAD_BYTES:
|
|
628
703
|
# since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
|
|
@@ -635,7 +710,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
635
710
|
curl_command,
|
|
636
711
|
)
|
|
637
712
|
try:
|
|
638
|
-
|
|
713
|
+
method_func = getattr(self._session, method.lower())
|
|
714
|
+
response = method_func(url, data=payload) if payload else method_func(url)
|
|
639
715
|
response.raise_for_status()
|
|
640
716
|
return response
|
|
641
717
|
except HTTPError as e:
|
|
@@ -759,12 +835,6 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
759
835
|
trace_flag if trace_flag is not None else self._default_trace_mode
|
|
760
836
|
)
|
|
761
837
|
resolved_async_flag = async_flag if async_flag is not None else async_default
|
|
762
|
-
if resolved_trace_flag and not resolved_async_flag:
|
|
763
|
-
warnings.warn(
|
|
764
|
-
"API tracing is only available with async ingestion. For sync mode, API errors will be surfaced as exceptions.",
|
|
765
|
-
APITracingWarning,
|
|
766
|
-
stacklevel=3,
|
|
767
|
-
)
|
|
768
838
|
return resolved_trace_flag and resolved_async_flag
|
|
769
839
|
|
|
770
840
|
def __repr__(self) -> str:
|
datahub/entrypoints.py
CHANGED
|
@@ -37,6 +37,7 @@ from datahub.cli.telemetry import telemetry as telemetry_cli
|
|
|
37
37
|
from datahub.cli.timeline_cli import timeline
|
|
38
38
|
from datahub.configuration.common import should_show_stack_trace
|
|
39
39
|
from datahub.ingestion.graph.client import get_default_graph
|
|
40
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
40
41
|
from datahub.telemetry import telemetry
|
|
41
42
|
from datahub.utilities._custom_package_loader import model_version_name
|
|
42
43
|
from datahub.utilities.logging_manager import configure_logging
|
|
@@ -117,7 +118,7 @@ def version(include_server: bool = False) -> None:
|
|
|
117
118
|
click.echo(f"Models: {model_version_name()}")
|
|
118
119
|
click.echo(f"Python version: {sys.version}")
|
|
119
120
|
if include_server:
|
|
120
|
-
server_config = get_default_graph().get_config()
|
|
121
|
+
server_config = get_default_graph(ClientMode.CLI).get_config()
|
|
121
122
|
click.echo(f"Server config: {server_config}")
|
|
122
123
|
|
|
123
124
|
|
datahub/ingestion/api/source.py
CHANGED
|
@@ -420,12 +420,9 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
420
420
|
Run in order, first in list is applied first. Be careful with order when overriding.
|
|
421
421
|
"""
|
|
422
422
|
browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
|
|
423
|
-
if
|
|
424
|
-
self.ctx.pipeline_config
|
|
425
|
-
and self.ctx.pipeline_config.flags.generate_browse_path_v2
|
|
426
|
-
):
|
|
423
|
+
if self.ctx.flags.generate_browse_path_v2:
|
|
427
424
|
browse_path_processor = self._get_browse_path_processor(
|
|
428
|
-
self.ctx.
|
|
425
|
+
self.ctx.flags.generate_browse_path_v2_dry_run
|
|
429
426
|
)
|
|
430
427
|
|
|
431
428
|
auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
|
|
@@ -319,8 +319,10 @@ def classification_workunit_processor(
|
|
|
319
319
|
partial(
|
|
320
320
|
data_reader.get_sample_data_for_table,
|
|
321
321
|
table_id,
|
|
322
|
-
|
|
323
|
-
|
|
322
|
+
int(
|
|
323
|
+
classification_handler.config.classification.sample_size
|
|
324
|
+
* SAMPLE_SIZE_MULTIPLIER
|
|
325
|
+
),
|
|
324
326
|
**(data_reader_kwargs or {}),
|
|
325
327
|
)
|
|
326
328
|
if data_reader
|
|
@@ -34,14 +34,13 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
|
|
|
34
34
|
from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
|
|
35
35
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
36
36
|
from datahub.emitter.rest_emitter import (
|
|
37
|
-
DEFAULT_REST_EMITTER_ENDPOINT,
|
|
38
37
|
DEFAULT_REST_TRACE_MODE,
|
|
39
38
|
DatahubRestEmitter,
|
|
40
|
-
RestSinkEndpoint,
|
|
41
39
|
RestTraceMode,
|
|
42
40
|
)
|
|
43
41
|
from datahub.emitter.serialization_helper import post_json_transform
|
|
44
42
|
from datahub.ingestion.graph.config import (
|
|
43
|
+
ClientMode,
|
|
45
44
|
DatahubClientConfig as DatahubClientConfig,
|
|
46
45
|
)
|
|
47
46
|
from datahub.ingestion.graph.connections import (
|
|
@@ -158,11 +157,12 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
158
157
|
ca_certificate_path=self.config.ca_certificate_path,
|
|
159
158
|
client_certificate_path=self.config.client_certificate_path,
|
|
160
159
|
disable_ssl_verification=self.config.disable_ssl_verification,
|
|
161
|
-
openapi_ingestion=
|
|
160
|
+
openapi_ingestion=self.config.openapi_ingestion,
|
|
162
161
|
default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
|
|
162
|
+
client_mode=config.client_mode,
|
|
163
|
+
datahub_component=config.datahub_component,
|
|
163
164
|
)
|
|
164
|
-
|
|
165
|
-
self.server_id = _MISSING_SERVER_ID
|
|
165
|
+
self.server_id: str = _MISSING_SERVER_ID
|
|
166
166
|
|
|
167
167
|
def test_connection(self) -> None:
|
|
168
168
|
super().test_connection()
|
|
@@ -193,7 +193,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
193
193
|
if not self.server_config:
|
|
194
194
|
self.test_connection()
|
|
195
195
|
|
|
196
|
-
base_url = self.server_config.get("baseUrl")
|
|
196
|
+
base_url = self.server_config.raw_config.get("baseUrl")
|
|
197
197
|
if not base_url:
|
|
198
198
|
raise ValueError("baseUrl not found in server config")
|
|
199
199
|
return base_url
|
|
@@ -201,6 +201,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
201
201
|
@classmethod
|
|
202
202
|
def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
|
|
203
203
|
session_config = emitter._session_config
|
|
204
|
+
|
|
204
205
|
if isinstance(session_config.timeout, tuple):
|
|
205
206
|
# TODO: This is slightly lossy. Eventually, we want to modify the emitter
|
|
206
207
|
# to accept a tuple for timeout_sec, and then we'll be able to remove this.
|
|
@@ -218,6 +219,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
218
219
|
disable_ssl_verification=session_config.disable_ssl_verification,
|
|
219
220
|
ca_certificate_path=session_config.ca_certificate_path,
|
|
220
221
|
client_certificate_path=session_config.client_certificate_path,
|
|
222
|
+
client_mode=session_config.client_mode,
|
|
223
|
+
datahub_component=session_config.datahub_component,
|
|
221
224
|
)
|
|
222
225
|
)
|
|
223
226
|
|
|
@@ -1952,8 +1955,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
1952
1955
|
super().close()
|
|
1953
1956
|
|
|
1954
1957
|
|
|
1955
|
-
|
|
1958
|
+
@functools.lru_cache(maxsize=None)
|
|
1959
|
+
def get_default_graph(
|
|
1960
|
+
client_mode: Optional[ClientMode] = None,
|
|
1961
|
+
datahub_component: Optional[str] = None,
|
|
1962
|
+
) -> DataHubGraph:
|
|
1956
1963
|
graph_config = config_utils.load_client_config()
|
|
1964
|
+
graph_config.client_mode = client_mode
|
|
1965
|
+
graph_config.datahub_component = datahub_component
|
|
1957
1966
|
graph = DataHubGraph(graph_config)
|
|
1958
1967
|
graph.test_connection()
|
|
1959
1968
|
telemetry_instance.set_context(server=graph)
|
|
@@ -1,8 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from enum import Enum, auto
|
|
1
3
|
from typing import Dict, List, Optional
|
|
2
4
|
|
|
3
5
|
from datahub.configuration.common import ConfigModel
|
|
4
6
|
|
|
5
7
|
|
|
8
|
+
class ClientMode(Enum):
|
|
9
|
+
INGESTION = auto()
|
|
10
|
+
CLI = auto()
|
|
11
|
+
SDK = auto()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
DATAHUB_COMPONENT_ENV: str = os.getenv("DATAHUB_COMPONENT", "datahub").lower()
|
|
15
|
+
|
|
16
|
+
|
|
6
17
|
class DatahubClientConfig(ConfigModel):
|
|
7
18
|
"""Configuration class for holding connectivity to datahub gms"""
|
|
8
19
|
|
|
@@ -17,3 +28,6 @@ class DatahubClientConfig(ConfigModel):
|
|
|
17
28
|
ca_certificate_path: Optional[str] = None
|
|
18
29
|
client_certificate_path: Optional[str] = None
|
|
19
30
|
disable_ssl_verification: bool = False
|
|
31
|
+
openapi_ingestion: Optional[bool] = None
|
|
32
|
+
client_mode: Optional[ClientMode] = None
|
|
33
|
+
datahub_component: Optional[str] = None
|
|
@@ -18,7 +18,7 @@ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
|
|
|
18
18
|
# This can be put directly into the orFilters parameter in GraphQL.
|
|
19
19
|
RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]
|
|
20
20
|
|
|
21
|
-
# Mirrors our GraphQL enum: https://
|
|
21
|
+
# Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
|
|
22
22
|
FilterOperator: TypeAlias = Literal[
|
|
23
23
|
"CONTAIN",
|
|
24
24
|
"EQUAL",
|
|
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
|
|
|
31
31
|
from datahub.ingestion.api.transform import Transformer
|
|
32
32
|
from datahub.ingestion.extractor.extractor_registry import extractor_registry
|
|
33
33
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
34
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
34
35
|
from datahub.ingestion.reporting.reporting_provider_registry import (
|
|
35
36
|
reporting_provider_registry,
|
|
36
37
|
)
|
|
@@ -136,9 +137,8 @@ class CliReport(Report):
|
|
|
136
137
|
|
|
137
138
|
|
|
138
139
|
def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
|
|
139
|
-
graph = get_default_graph()
|
|
140
|
+
graph = get_default_graph(ClientMode.INGESTION)
|
|
140
141
|
sink_config = graph._make_rest_sink_config()
|
|
141
|
-
|
|
142
142
|
return DatahubRestSink(ctx, sink_config)
|
|
143
143
|
|
|
144
144
|
|
|
@@ -175,6 +175,7 @@ class Pipeline:
|
|
|
175
175
|
self.graph: Optional[DataHubGraph] = None
|
|
176
176
|
with _add_init_error_context("connect to DataHub"):
|
|
177
177
|
if self.config.datahub_api:
|
|
178
|
+
self.config.datahub_api.client_mode = ClientMode.INGESTION
|
|
178
179
|
self.graph = exit_stack.enter_context(
|
|
179
180
|
DataHubGraph(self.config.datahub_api)
|
|
180
181
|
)
|
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
|
|
|
7
7
|
from pydantic import Field, validator
|
|
8
8
|
|
|
9
9
|
from datahub.configuration.common import ConfigModel, DynamicTypedConfig
|
|
10
|
-
from datahub.ingestion.graph.
|
|
10
|
+
from datahub.ingestion.graph.config import DatahubClientConfig
|
|
11
11
|
from datahub.ingestion.sink.file import FileSinkConfig
|
|
12
12
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
@@ -34,7 +34,7 @@ from datahub.ingestion.api.sink import (
|
|
|
34
34
|
WriteCallback,
|
|
35
35
|
)
|
|
36
36
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
37
|
-
from datahub.ingestion.graph.
|
|
37
|
+
from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
|
|
38
38
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
|
|
39
39
|
MetadataChangeEvent,
|
|
40
40
|
MetadataChangeProposal,
|
|
@@ -140,11 +140,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
140
140
|
f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
|
|
141
141
|
) from exc
|
|
142
142
|
|
|
143
|
-
self.report.gms_version =
|
|
144
|
-
gms_config.get("versions", {})
|
|
145
|
-
.get("acryldata/datahub", {})
|
|
146
|
-
.get("version", None)
|
|
147
|
-
)
|
|
143
|
+
self.report.gms_version = gms_config.service_version
|
|
148
144
|
self.report.mode = self.config.mode
|
|
149
145
|
self.report.max_threads = self.config.max_threads
|
|
150
146
|
logger.debug("Setting env variables to override config")
|
|
@@ -180,6 +176,8 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
180
176
|
disable_ssl_verification=config.disable_ssl_verification,
|
|
181
177
|
openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
|
|
182
178
|
default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
|
|
179
|
+
client_mode=config.client_mode,
|
|
180
|
+
datahub_component=config.datahub_component,
|
|
183
181
|
)
|
|
184
182
|
|
|
185
183
|
@property
|
|
@@ -190,6 +188,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|
|
190
188
|
# https://github.com/psf/requests/issues/1871#issuecomment-32751346
|
|
191
189
|
thread_local = self._emitter_thread_local
|
|
192
190
|
if not hasattr(thread_local, "emitter"):
|
|
191
|
+
self.config.client_mode = ClientMode.INGESTION
|
|
193
192
|
thread_local.emitter = DatahubRestSink._make_emitter(self.config)
|
|
194
193
|
return thread_local.emitter
|
|
195
194
|
|