acryl-datahub 1.0.0.3rc12__py3-none-any.whl → 1.0.0.4rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/METADATA +2509 -2512
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/RECORD +36 -33
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/emitter/request_helper.py +10 -5
- datahub/emitter/rest_emitter.py +183 -106
- datahub/ingestion/extractor/schema_util.py +17 -1
- datahub/ingestion/graph/client.py +17 -4
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/sink/datahub_rest.py +11 -10
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/ge_data_profiler.py +25 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +1 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +5 -17
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/metadata/_internal_schema_classes.py +503 -490
- datahub/metadata/_urns/urn_defs.py +1528 -1528
- datahub/metadata/schema.avsc +15431 -15414
- datahub/metadata/schemas/Operation.avsc +17 -0
- datahub/sdk/main_client.py +15 -0
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -2
- datahub/utilities/server_config_util.py +14 -75
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc1.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -20,6 +20,7 @@ from typing import (
|
|
|
20
20
|
Sequence,
|
|
21
21
|
Tuple,
|
|
22
22
|
Union,
|
|
23
|
+
overload,
|
|
23
24
|
)
|
|
24
25
|
|
|
25
26
|
import pydantic
|
|
@@ -103,9 +104,28 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
|
|
|
103
104
|
)
|
|
104
105
|
|
|
105
106
|
|
|
106
|
-
class
|
|
107
|
-
|
|
108
|
-
|
|
107
|
+
class EmitMode(ConfigEnum):
|
|
108
|
+
# Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
|
|
109
|
+
# Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
|
|
110
|
+
# searchability and consistent reads are required.
|
|
111
|
+
SYNC_WAIT = auto()
|
|
112
|
+
# Synchronously updates the primary storage (SQL) but asynchronously updates search storage (Elasticsearch). Provides
|
|
113
|
+
# a balance between consistency and performance. Suitable for updates that need to be immediately reflected in direct
|
|
114
|
+
# entity retrievals but where search index consistency can be slightly delayed.
|
|
115
|
+
SYNC_PRIMARY = auto()
|
|
116
|
+
# Queues the metadata change for asynchronous processing and returns immediately. The client continues execution without
|
|
117
|
+
# waiting for the change to be fully processed. Best for high-throughput scenarios where eventual consistency is acceptable.
|
|
118
|
+
ASYNC = auto()
|
|
119
|
+
# Queues the metadata change asynchronously but blocks until confirmation that the write has been fully persisted.
|
|
120
|
+
# More efficient than fully synchronous operations due to backend parallelization and batching while still providing
|
|
121
|
+
# strong consistency guarantees. Useful when you need confirmation of successful persistence without sacrificing performance.
|
|
122
|
+
ASYNC_WAIT = auto()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
_DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
|
|
126
|
+
EmitMode,
|
|
127
|
+
os.getenv("DATAHUB_EMIT_MODE", EmitMode.SYNC_PRIMARY),
|
|
128
|
+
)
|
|
109
129
|
|
|
110
130
|
|
|
111
131
|
class RestSinkEndpoint(ConfigEnum):
|
|
@@ -119,13 +139,6 @@ DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
|
|
|
119
139
|
)
|
|
120
140
|
|
|
121
141
|
|
|
122
|
-
# Supported with v1.0
|
|
123
|
-
DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
|
|
124
|
-
RestTraceMode,
|
|
125
|
-
os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
|
|
126
|
-
)
|
|
127
|
-
|
|
128
|
-
|
|
129
142
|
class RequestsSessionConfig(ConfigModel):
|
|
130
143
|
timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
|
|
131
144
|
|
|
@@ -282,8 +295,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
282
295
|
_token: Optional[str]
|
|
283
296
|
_session: requests.Session
|
|
284
297
|
_openapi_ingestion: Optional[bool]
|
|
285
|
-
|
|
286
|
-
server_config: RestServiceConfig
|
|
298
|
+
_server_config: RestServiceConfig
|
|
287
299
|
|
|
288
300
|
def __init__(
|
|
289
301
|
self,
|
|
@@ -300,7 +312,6 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
300
312
|
client_certificate_path: Optional[str] = None,
|
|
301
313
|
disable_ssl_verification: bool = False,
|
|
302
314
|
openapi_ingestion: Optional[bool] = None,
|
|
303
|
-
default_trace_mode: bool = False,
|
|
304
315
|
client_mode: Optional[ClientMode] = None,
|
|
305
316
|
datahub_component: Optional[str] = None,
|
|
306
317
|
):
|
|
@@ -314,15 +325,11 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
314
325
|
|
|
315
326
|
self._gms_server = fixup_gms_url(gms_server)
|
|
316
327
|
self._token = token
|
|
317
|
-
self._default_trace_mode = default_trace_mode
|
|
318
328
|
self._session = requests.Session()
|
|
319
329
|
self._openapi_ingestion = (
|
|
320
330
|
openapi_ingestion # Re-evaluated after test connection
|
|
321
331
|
)
|
|
322
332
|
|
|
323
|
-
if self._default_trace_mode:
|
|
324
|
-
logger.debug("Using API Tracing for ingestion.")
|
|
325
|
-
|
|
326
333
|
headers = {
|
|
327
334
|
"X-RestLi-Protocol-Version": "2.0.0",
|
|
328
335
|
"Content-Type": "application/json",
|
|
@@ -376,50 +383,88 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
376
383
|
|
|
377
384
|
self._session = self._session_config.build_session()
|
|
378
385
|
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
386
|
+
@property
|
|
387
|
+
def server_config(self) -> RestServiceConfig:
|
|
388
|
+
return self.fetch_server_config()
|
|
389
|
+
|
|
390
|
+
# TODO: This should move to DataHubGraph once it no longer inherits from DataHubRestEmitter
|
|
391
|
+
def fetch_server_config(self) -> RestServiceConfig:
|
|
392
|
+
"""
|
|
393
|
+
Fetch configuration from the server if not already loaded.
|
|
394
|
+
|
|
395
|
+
Returns:
|
|
396
|
+
The configuration dictionary
|
|
397
|
+
|
|
398
|
+
Raises:
|
|
399
|
+
ConfigurationError: If there's an error fetching or validating the configuration
|
|
400
|
+
"""
|
|
401
|
+
if not hasattr(self, "_server_config") or not self._server_config:
|
|
402
|
+
if self._session is None or self._gms_server is None:
|
|
403
|
+
raise ConfigurationError(
|
|
404
|
+
"Session and URL are required to load configuration"
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
url = f"{self._gms_server}/config"
|
|
408
|
+
response = self._session.get(url)
|
|
409
|
+
|
|
410
|
+
if response.status_code == 200:
|
|
411
|
+
raw_config = response.json()
|
|
412
|
+
|
|
413
|
+
# Validate that we're connected to the correct service
|
|
414
|
+
if not raw_config.get("noCode") == "true":
|
|
415
|
+
raise ConfigurationError(
|
|
416
|
+
"You seem to have connected to the frontend service instead of the GMS endpoint. "
|
|
417
|
+
"The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
|
|
418
|
+
"For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
|
|
402
419
|
)
|
|
403
420
|
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
)
|
|
421
|
+
self._server_config = RestServiceConfig(raw_config=raw_config)
|
|
422
|
+
self._post_fetch_server_config()
|
|
407
423
|
|
|
408
|
-
|
|
424
|
+
else:
|
|
425
|
+
logger.debug(
|
|
426
|
+
f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
if response.status_code == 401:
|
|
430
|
+
message = f"Unable to connect to {url} - got an authentication error: {response.text}."
|
|
431
|
+
else:
|
|
432
|
+
message = f"Unable to connect to {url} with status_code: {response.status_code}."
|
|
433
|
+
|
|
434
|
+
message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
|
|
435
|
+
raise ConfigurationError(message)
|
|
436
|
+
|
|
437
|
+
return self._server_config
|
|
438
|
+
|
|
439
|
+
def _post_fetch_server_config(self) -> None:
|
|
440
|
+
# Determine OpenAPI mode
|
|
441
|
+
if self._openapi_ingestion is None:
|
|
442
|
+
# No constructor parameter
|
|
409
443
|
if (
|
|
410
|
-
|
|
411
|
-
and self.
|
|
444
|
+
not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
|
|
445
|
+
and self._session_config.client_mode == ClientMode.SDK
|
|
446
|
+
and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
|
|
412
447
|
):
|
|
413
|
-
# Enable
|
|
414
|
-
self.
|
|
448
|
+
# Enable if SDK client and no environment variable specified
|
|
449
|
+
self._openapi_ingestion = True
|
|
450
|
+
else:
|
|
451
|
+
# The system env is specifying the value
|
|
452
|
+
self._openapi_ingestion = (
|
|
453
|
+
DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
logger.debug(
|
|
457
|
+
f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
|
|
458
|
+
)
|
|
459
|
+
logger.debug(
|
|
460
|
+
f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
|
|
461
|
+
)
|
|
415
462
|
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
raise e
|
|
463
|
+
def test_connection(self) -> None:
|
|
464
|
+
self.fetch_server_config()
|
|
419
465
|
|
|
420
|
-
def get_server_config(self) ->
|
|
421
|
-
self.
|
|
422
|
-
return self.server_config
|
|
466
|
+
def get_server_config(self) -> dict:
|
|
467
|
+
return self.server_config.raw_config
|
|
423
468
|
|
|
424
469
|
def to_graph(self) -> "DataHubGraph":
|
|
425
470
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
@@ -429,16 +474,14 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
429
474
|
def _to_openapi_request(
|
|
430
475
|
self,
|
|
431
476
|
mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
|
|
432
|
-
|
|
433
|
-
async_default: bool = False,
|
|
477
|
+
emit_mode: EmitMode,
|
|
434
478
|
) -> Optional[OpenApiRequest]:
|
|
435
479
|
"""
|
|
436
480
|
Convert a MetadataChangeProposal to an OpenAPI request format.
|
|
437
481
|
|
|
438
482
|
Args:
|
|
439
483
|
mcp: The metadata change proposal
|
|
440
|
-
|
|
441
|
-
async_default: Default async behavior if not specified
|
|
484
|
+
emit_mode: Client emit mode
|
|
442
485
|
|
|
443
486
|
Returns:
|
|
444
487
|
An OpenApiRequest object or None if the MCP doesn't have required fields
|
|
@@ -446,8 +489,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
446
489
|
return OpenApiRequest.from_mcp(
|
|
447
490
|
mcp=mcp,
|
|
448
491
|
gms_server=self._gms_server,
|
|
449
|
-
async_flag=
|
|
450
|
-
|
|
492
|
+
async_flag=emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT),
|
|
493
|
+
search_sync_flag=emit_mode == EmitMode.SYNC_WAIT,
|
|
451
494
|
)
|
|
452
495
|
|
|
453
496
|
def emit(
|
|
@@ -459,7 +502,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
459
502
|
UsageAggregation,
|
|
460
503
|
],
|
|
461
504
|
callback: Optional[Callable[[Exception, str], None]] = None,
|
|
462
|
-
|
|
505
|
+
emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
|
|
463
506
|
) -> None:
|
|
464
507
|
try:
|
|
465
508
|
if isinstance(item, UsageAggregation):
|
|
@@ -467,7 +510,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
467
510
|
elif isinstance(
|
|
468
511
|
item, (MetadataChangeProposal, MetadataChangeProposalWrapper)
|
|
469
512
|
):
|
|
470
|
-
self.emit_mcp(item,
|
|
513
|
+
self.emit_mcp(item, emit_mode=emit_mode)
|
|
471
514
|
else:
|
|
472
515
|
self.emit_mce(item)
|
|
473
516
|
except Exception as e:
|
|
@@ -498,41 +541,64 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
498
541
|
|
|
499
542
|
self._emit_generic(url, payload)
|
|
500
543
|
|
|
544
|
+
@overload
|
|
545
|
+
@deprecated("Use emit_mode instead of async_flag")
|
|
501
546
|
def emit_mcp(
|
|
502
547
|
self,
|
|
503
548
|
mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
|
|
549
|
+
*,
|
|
504
550
|
async_flag: Optional[bool] = None,
|
|
505
|
-
|
|
506
|
-
|
|
551
|
+
) -> None: ...
|
|
552
|
+
|
|
553
|
+
@overload
|
|
554
|
+
def emit_mcp(
|
|
555
|
+
self,
|
|
556
|
+
mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
|
|
557
|
+
*,
|
|
558
|
+
emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
|
|
559
|
+
wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
560
|
+
) -> None: ...
|
|
561
|
+
|
|
562
|
+
def emit_mcp(
|
|
563
|
+
self,
|
|
564
|
+
mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
|
|
565
|
+
async_flag: Optional[bool] = None,
|
|
566
|
+
emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
|
|
567
|
+
wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
507
568
|
) -> None:
|
|
569
|
+
if async_flag is True:
|
|
570
|
+
emit_mode = EmitMode.ASYNC
|
|
571
|
+
|
|
508
572
|
ensure_has_system_metadata(mcp)
|
|
509
573
|
|
|
510
574
|
trace_data = None
|
|
511
575
|
|
|
512
576
|
if self._openapi_ingestion:
|
|
513
|
-
request = self._to_openapi_request(mcp,
|
|
577
|
+
request = self._to_openapi_request(mcp, emit_mode)
|
|
514
578
|
if request:
|
|
515
579
|
response = self._emit_generic(
|
|
516
580
|
request.url, payload=request.payload, method=request.method
|
|
517
581
|
)
|
|
518
582
|
|
|
519
|
-
if self._should_trace(
|
|
583
|
+
if self._should_trace(emit_mode):
|
|
520
584
|
trace_data = extract_trace_data(response) if response else None
|
|
521
585
|
|
|
522
586
|
else:
|
|
523
587
|
url = f"{self._gms_server}/aspects?action=ingestProposal"
|
|
524
588
|
|
|
525
589
|
mcp_obj = pre_json_transform(mcp.to_obj())
|
|
526
|
-
payload_dict = {
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
590
|
+
payload_dict = {
|
|
591
|
+
"proposal": mcp_obj,
|
|
592
|
+
"async": "true"
|
|
593
|
+
if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
|
|
594
|
+
else "false",
|
|
595
|
+
}
|
|
530
596
|
|
|
531
597
|
payload = json.dumps(payload_dict)
|
|
532
598
|
|
|
533
599
|
response = self._emit_generic(url, payload)
|
|
534
600
|
|
|
535
|
-
if self._should_trace(
|
|
601
|
+
if self._should_trace(emit_mode):
|
|
536
602
|
trace_data = (
|
|
537
603
|
extract_trace_data_from_mcps(response, [mcp]) if response else None
|
|
538
604
|
)
|
|
@@ -540,15 +606,14 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
540
606
|
if trace_data:
|
|
541
607
|
self._await_status(
|
|
542
608
|
[trace_data],
|
|
543
|
-
|
|
609
|
+
wait_timeout,
|
|
544
610
|
)
|
|
545
611
|
|
|
546
612
|
def emit_mcps(
|
|
547
613
|
self,
|
|
548
614
|
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
615
|
+
emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
|
|
616
|
+
wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
552
617
|
) -> int:
|
|
553
618
|
if _DATAHUB_EMITTER_TRACE:
|
|
554
619
|
logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
|
|
@@ -557,16 +622,15 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
557
622
|
ensure_has_system_metadata(mcp)
|
|
558
623
|
|
|
559
624
|
if self._openapi_ingestion:
|
|
560
|
-
return self._emit_openapi_mcps(mcps,
|
|
625
|
+
return self._emit_openapi_mcps(mcps, emit_mode, wait_timeout)
|
|
561
626
|
else:
|
|
562
|
-
return self._emit_restli_mcps(mcps,
|
|
627
|
+
return self._emit_restli_mcps(mcps, emit_mode)
|
|
563
628
|
|
|
564
629
|
def _emit_openapi_mcps(
|
|
565
630
|
self,
|
|
566
631
|
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
632
|
+
emit_mode: EmitMode,
|
|
633
|
+
wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
570
634
|
) -> int:
|
|
571
635
|
"""
|
|
572
636
|
1. Grouping MCPs by their HTTP method and entity URL and HTTP method
|
|
@@ -580,9 +644,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
580
644
|
The joining logic is efficient with a simple string concatenation
|
|
581
645
|
|
|
582
646
|
:param mcps: metadata change proposals to transmit
|
|
583
|
-
:param
|
|
584
|
-
:param
|
|
585
|
-
:param trace_timeout: timeout for tracing
|
|
647
|
+
:param emit_mode: the mode to emit the MCPs
|
|
648
|
+
:param wait_timeout: timeout for blocking queue
|
|
586
649
|
:return: number of requests
|
|
587
650
|
"""
|
|
588
651
|
# Group by entity URL and HTTP method
|
|
@@ -591,7 +654,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
591
654
|
) # Initialize with one empty Chunk
|
|
592
655
|
|
|
593
656
|
for mcp in mcps:
|
|
594
|
-
request = self._to_openapi_request(mcp,
|
|
657
|
+
request = self._to_openapi_request(mcp, emit_mode)
|
|
595
658
|
if request:
|
|
596
659
|
# Create a composite key with both method and URL
|
|
597
660
|
key = (request.method, request.url)
|
|
@@ -621,7 +684,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
621
684
|
)
|
|
622
685
|
responses.append(response)
|
|
623
686
|
|
|
624
|
-
if self._should_trace(
|
|
687
|
+
if self._should_trace(emit_mode):
|
|
625
688
|
trace_data = []
|
|
626
689
|
for response in responses:
|
|
627
690
|
data = extract_trace_data(response) if response else None
|
|
@@ -629,14 +692,14 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
629
692
|
trace_data.append(data)
|
|
630
693
|
|
|
631
694
|
if trace_data:
|
|
632
|
-
self._await_status(trace_data,
|
|
695
|
+
self._await_status(trace_data, wait_timeout)
|
|
633
696
|
|
|
634
697
|
return len(responses)
|
|
635
698
|
|
|
636
699
|
def _emit_restli_mcps(
|
|
637
700
|
self,
|
|
638
701
|
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
|
|
639
|
-
|
|
702
|
+
emit_mode: EmitMode,
|
|
640
703
|
) -> int:
|
|
641
704
|
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
|
|
642
705
|
|
|
@@ -671,9 +734,12 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
671
734
|
for mcp_obj_chunk in mcp_obj_chunks:
|
|
672
735
|
# TODO: We're calling json.dumps on each MCP object twice, once to estimate
|
|
673
736
|
# the size when chunking, and again for the actual request.
|
|
674
|
-
payload_dict: dict = {
|
|
675
|
-
|
|
676
|
-
|
|
737
|
+
payload_dict: dict = {
|
|
738
|
+
"proposals": mcp_obj_chunk,
|
|
739
|
+
"async": "true"
|
|
740
|
+
if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
|
|
741
|
+
else "false",
|
|
742
|
+
}
|
|
677
743
|
|
|
678
744
|
payload = json.dumps(payload_dict)
|
|
679
745
|
self._emit_generic(url, payload)
|
|
@@ -747,7 +813,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
747
813
|
def _await_status(
|
|
748
814
|
self,
|
|
749
815
|
trace_data: List[TraceData],
|
|
750
|
-
|
|
816
|
+
wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
|
|
751
817
|
) -> None:
|
|
752
818
|
"""Verify the status of asynchronous write operations.
|
|
753
819
|
Args:
|
|
@@ -757,8 +823,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
757
823
|
TraceTimeoutError: If verification fails or times out
|
|
758
824
|
TraceValidationError: Expected write was not completed successfully
|
|
759
825
|
"""
|
|
760
|
-
if
|
|
761
|
-
raise ValueError("
|
|
826
|
+
if wait_timeout is None:
|
|
827
|
+
raise ValueError("wait_timeout cannot be None")
|
|
762
828
|
|
|
763
829
|
try:
|
|
764
830
|
if not trace_data:
|
|
@@ -771,9 +837,9 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
771
837
|
current_backoff = TRACE_INITIAL_BACKOFF
|
|
772
838
|
|
|
773
839
|
while trace.data:
|
|
774
|
-
if datetime.now() - start_time >
|
|
840
|
+
if datetime.now() - start_time > wait_timeout:
|
|
775
841
|
raise TraceTimeoutError(
|
|
776
|
-
f"Timeout waiting for async write completion after {
|
|
842
|
+
f"Timeout waiting for async write completion after {wait_timeout.total_seconds()} seconds"
|
|
777
843
|
)
|
|
778
844
|
|
|
779
845
|
base_url = f"{self._gms_server}/openapi/v1/trace/write"
|
|
@@ -825,17 +891,28 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
825
891
|
logger.error(f"Error during status verification: {str(e)}")
|
|
826
892
|
raise
|
|
827
893
|
|
|
828
|
-
def _should_trace(
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
894
|
+
def _should_trace(self, emit_mode: EmitMode, warn: bool = True) -> bool:
|
|
895
|
+
if emit_mode == EmitMode.ASYNC_WAIT:
|
|
896
|
+
if not bool(self._openapi_ingestion):
|
|
897
|
+
if warn:
|
|
898
|
+
logger.warning(
|
|
899
|
+
f"{emit_mode} requested but is only available when using OpenAPI."
|
|
900
|
+
)
|
|
901
|
+
return False
|
|
902
|
+
elif getattr(
|
|
903
|
+
self, "server_config", None
|
|
904
|
+
) is None or not self.server_config.supports_feature(
|
|
905
|
+
ServiceFeature.API_TRACING
|
|
906
|
+
):
|
|
907
|
+
if warn:
|
|
908
|
+
logger.warning(
|
|
909
|
+
f"{emit_mode} requested but is only available with a newer GMS version."
|
|
910
|
+
)
|
|
911
|
+
return False
|
|
912
|
+
else:
|
|
913
|
+
return True
|
|
914
|
+
else:
|
|
915
|
+
return False
|
|
839
916
|
|
|
840
917
|
def __repr__(self) -> str:
|
|
841
918
|
token_str = (
|
|
@@ -290,6 +290,12 @@ class AvroToMceSchemaConverter:
|
|
|
290
290
|
This way we can use the type/description of the non-null type if needed.
|
|
291
291
|
"""
|
|
292
292
|
|
|
293
|
+
# props to skip when building jsonProps
|
|
294
|
+
json_props_to_skip = [
|
|
295
|
+
"_nullable",
|
|
296
|
+
"native_data_type",
|
|
297
|
+
]
|
|
298
|
+
|
|
293
299
|
def __init__(
|
|
294
300
|
self,
|
|
295
301
|
schema: SchemaOrField,
|
|
@@ -407,6 +413,16 @@ class AvroToMceSchemaConverter:
|
|
|
407
413
|
or self._actual_schema.props.get("logicalType"),
|
|
408
414
|
)
|
|
409
415
|
|
|
416
|
+
json_props: Optional[Dict[str, Any]] = (
|
|
417
|
+
{
|
|
418
|
+
k: v
|
|
419
|
+
for k, v in merged_props.items()
|
|
420
|
+
if k not in self.json_props_to_skip
|
|
421
|
+
}
|
|
422
|
+
if merged_props
|
|
423
|
+
else None
|
|
424
|
+
)
|
|
425
|
+
|
|
410
426
|
field = SchemaField(
|
|
411
427
|
fieldPath=field_path,
|
|
412
428
|
# Populate it with the simple native type for now.
|
|
@@ -421,7 +437,7 @@ class AvroToMceSchemaConverter:
|
|
|
421
437
|
isPartOfKey=self._converter._is_key_schema,
|
|
422
438
|
globalTags=tags_aspect,
|
|
423
439
|
glossaryTerms=meta_terms_aspect,
|
|
424
|
-
jsonProps=json.dumps(
|
|
440
|
+
jsonProps=json.dumps(json_props) if json_props else None,
|
|
425
441
|
)
|
|
426
442
|
yield field
|
|
427
443
|
|
|
@@ -34,9 +34,7 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
|
|
|
34
34
|
from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
|
|
35
35
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
36
36
|
from datahub.emitter.rest_emitter import (
|
|
37
|
-
DEFAULT_REST_TRACE_MODE,
|
|
38
37
|
DatahubRestEmitter,
|
|
39
|
-
RestTraceMode,
|
|
40
38
|
)
|
|
41
39
|
from datahub.emitter.serialization_helper import post_json_transform
|
|
42
40
|
from datahub.ingestion.graph.config import (
|
|
@@ -54,6 +52,7 @@ from datahub.ingestion.graph.filters import (
|
|
|
54
52
|
RemovedStatusFilter,
|
|
55
53
|
generate_filter,
|
|
56
54
|
)
|
|
55
|
+
from datahub.ingestion.graph.links import make_url_for_urn
|
|
57
56
|
from datahub.ingestion.source.state.checkpoint import Checkpoint
|
|
58
57
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
|
|
59
58
|
MetadataChangeEvent,
|
|
@@ -158,7 +157,6 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
158
157
|
client_certificate_path=self.config.client_certificate_path,
|
|
159
158
|
disable_ssl_verification=self.config.disable_ssl_verification,
|
|
160
159
|
openapi_ingestion=self.config.openapi_ingestion,
|
|
161
|
-
default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
|
|
162
160
|
client_mode=config.client_mode,
|
|
163
161
|
datahub_component=config.datahub_component,
|
|
164
162
|
)
|
|
@@ -187,6 +185,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
187
185
|
"""Get the public-facing base url of the frontend
|
|
188
186
|
|
|
189
187
|
This url can be used to construct links to the frontend. The url will not include a trailing slash.
|
|
188
|
+
|
|
190
189
|
Note: Only supported with DataHub Cloud.
|
|
191
190
|
"""
|
|
192
191
|
|
|
@@ -198,6 +197,20 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
198
197
|
raise ValueError("baseUrl not found in server config")
|
|
199
198
|
return base_url
|
|
200
199
|
|
|
200
|
+
def url_for(self, entity_urn: Union[str, Urn]) -> str:
|
|
201
|
+
"""Get the UI url for an entity.
|
|
202
|
+
|
|
203
|
+
Note: Only supported with DataHub Cloud.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
entity_urn: The urn of the entity to get the url for.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
The public-facing url for the entity.
|
|
210
|
+
"""
|
|
211
|
+
|
|
212
|
+
return make_url_for_urn(self.frontend_base_url, str(entity_urn))
|
|
213
|
+
|
|
201
214
|
@classmethod
|
|
202
215
|
def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
|
|
203
216
|
session_config = emitter._session_config
|
|
@@ -361,7 +374,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
361
374
|
)
|
|
362
375
|
|
|
363
376
|
def get_config(self) -> Dict[str, Any]:
|
|
364
|
-
return self.
|
|
377
|
+
return self.server_config.raw_config
|
|
365
378
|
|
|
366
379
|
def get_ownership(self, entity_urn: str) -> Optional[OwnershipClass]:
|
|
367
380
|
return self.get_aspect(entity_urn=entity_urn, aspect_type=OwnershipClass)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
import datahub.metadata.urns as urns
|
|
4
|
+
from datahub.utilities.urns.urn import guess_entity_type
|
|
5
|
+
|
|
6
|
+
_url_prefixes = {
|
|
7
|
+
# Atypical mappings.
|
|
8
|
+
urns.DataJobUrn.ENTITY_TYPE: "tasks",
|
|
9
|
+
urns.DataFlowUrn.ENTITY_TYPE: "pipelines",
|
|
10
|
+
urns.CorpUserUrn.ENTITY_TYPE: "user",
|
|
11
|
+
urns.CorpGroupUrn.ENTITY_TYPE: "group",
|
|
12
|
+
# Normal mappings - matches the entity type.
|
|
13
|
+
urns.ChartUrn.ENTITY_TYPE: "chart",
|
|
14
|
+
urns.ContainerUrn.ENTITY_TYPE: "container",
|
|
15
|
+
urns.DataProductUrn.ENTITY_TYPE: "dataProduct",
|
|
16
|
+
urns.DatasetUrn.ENTITY_TYPE: "dataset",
|
|
17
|
+
urns.DashboardUrn.ENTITY_TYPE: "dashboard",
|
|
18
|
+
urns.DomainUrn.ENTITY_TYPE: "domain",
|
|
19
|
+
urns.GlossaryNodeUrn.ENTITY_TYPE: "glossaryNode",
|
|
20
|
+
urns.GlossaryTermUrn.ENTITY_TYPE: "glossaryTerm",
|
|
21
|
+
urns.TagUrn.ENTITY_TYPE: "tag",
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def make_url_for_urn(
|
|
26
|
+
frontend_base_url: str,
|
|
27
|
+
entity_urn: str,
|
|
28
|
+
*,
|
|
29
|
+
tab: Optional[str] = None,
|
|
30
|
+
) -> str:
|
|
31
|
+
"""Build the public-facing URL for an entity urn.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
frontend_url: The public-facing base url of the frontend.
|
|
35
|
+
entity_urn: The urn of the entity to get the url for.
|
|
36
|
+
tab: The tab to deep link into. If not provided, the default tab for the entity will be shown.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
The public-facing url for the entity.
|
|
40
|
+
|
|
41
|
+
Examples:
|
|
42
|
+
>>> make_url_for_urn("https://demo.datahub.com", "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992", tab="Contents")
|
|
43
|
+
'https://demo.datahub.com/container/urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992/Contents'
|
|
44
|
+
>>> make_url_for_urn("https://demo.datahub.com", "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)")
|
|
45
|
+
'https://demo.datahub.com/dataset/urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)/'
|
|
46
|
+
"""
|
|
47
|
+
entity_type = guess_entity_type(entity_urn)
|
|
48
|
+
|
|
49
|
+
url_prefix = _url_prefixes.get(entity_type, entity_type)
|
|
50
|
+
url = f"{frontend_base_url}/{url_prefix}/{entity_urn}/"
|
|
51
|
+
if tab:
|
|
52
|
+
url += f"{tab}"
|
|
53
|
+
return url
|