acryl-datahub 1.0.0.3rc12__py3-none-any.whl → 1.0.0.4rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.

Files changed (37)
  1. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/METADATA +2529 -2527
  2. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/RECORD +37 -34
  3. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/emitter/request_helper.py +10 -5
  6. datahub/emitter/rest_emitter.py +183 -106
  7. datahub/ingestion/extractor/schema_util.py +17 -1
  8. datahub/ingestion/graph/client.py +17 -4
  9. datahub/ingestion/graph/links.py +53 -0
  10. datahub/ingestion/sink/datahub_rest.py +11 -10
  11. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  12. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  13. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  14. datahub/ingestion/source/fivetran/config.py +1 -1
  15. datahub/ingestion/source/ge_data_profiler.py +25 -0
  16. datahub/ingestion/source/snowflake/snowflake_config.py +1 -12
  17. datahub/ingestion/source/snowflake/snowflake_connection.py +5 -17
  18. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  19. datahub/ingestion/source/sql/athena.py +2 -1
  20. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  21. datahub/ingestion/source/sql/mssql/source.py +1 -1
  22. datahub/ingestion/source/sql/sql_config.py +1 -34
  23. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  24. datahub/ingestion/source/sql/stored_procedures/lineage.py +1 -0
  25. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  26. datahub/ingestion/source/tableau/tableau.py +4 -2
  27. datahub/ingestion/source/unity/config.py +2 -1
  28. datahub/metadata/_internal_schema_classes.py +13 -0
  29. datahub/metadata/schema.avsc +17 -0
  30. datahub/metadata/schemas/Operation.avsc +17 -0
  31. datahub/sdk/main_client.py +15 -0
  32. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  33. datahub/sql_parsing/sql_parsing_aggregator.py +3 -2
  34. datahub/utilities/server_config_util.py +14 -75
  35. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/entry_points.txt +0 -0
  36. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/licenses/LICENSE +0 -0
  37. {acryl_datahub-1.0.0.3rc12.dist-info → acryl_datahub-1.0.0.4rc2.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py

@@ -20,6 +20,7 @@ from typing import (
     Sequence,
     Tuple,
     Union,
+    overload,
 )
 
 import pydantic
@@ -103,9 +104,28 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
-class RestTraceMode(ConfigEnum):
-    ENABLED = auto()
-    DISABLED = auto()
+class EmitMode(ConfigEnum):
+    # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
+    # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
+    # searchability and consistent reads are required.
+    SYNC_WAIT = auto()
+    # Synchronously updates the primary storage (SQL) but asynchronously updates search storage (Elasticsearch). Provides
+    # a balance between consistency and performance. Suitable for updates that need to be immediately reflected in direct
+    # entity retrievals but where search index consistency can be slightly delayed.
+    SYNC_PRIMARY = auto()
+    # Queues the metadata change for asynchronous processing and returns immediately. The client continues execution without
+    # waiting for the change to be fully processed. Best for high-throughput scenarios where eventual consistency is acceptable.
+    ASYNC = auto()
+    # Queues the metadata change asynchronously but blocks until confirmation that the write has been fully persisted.
+    # More efficient than fully synchronous operations due to backend parallelization and batching while still providing
+    # strong consistency guarantees. Useful when you need confirmation of successful persistence without sacrificing performance.
+    ASYNC_WAIT = auto()
+
+
+_DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
+    EmitMode,
+    os.getenv("DATAHUB_EMIT_MODE", EmitMode.SYNC_PRIMARY),
+)
 
 
 class RestSinkEndpoint(ConfigEnum):
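
The new EmitMode enum replaces the boolean trace/async flags with a single consistency setting, resolved from the DATAHUB_EMIT_MODE environment variable when not passed explicitly. A minimal sketch of choosing a mode per call, assuming a reachable GMS (the endpoint and URN below are hypothetical):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DataHubRestEmitter, EmitMode
from datahub.metadata.schema_classes import StatusClass

# Hypothetical GMS endpoint and dataset URN, for illustration only.
emitter = DataHubRestEmitter(gms_server="http://localhost:8080")
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
    aspect=StatusClass(removed=False),
)

# SYNC_PRIMARY is the default unless DATAHUB_EMIT_MODE overrides it.
emitter.emit_mcp(mcp)

# Fire-and-forget, for high-throughput ingestion.
emitter.emit_mcp(mcp, emit_mode=EmitMode.ASYNC)

# Queue asynchronously, but block until the write is confirmed persisted.
emitter.emit_mcp(mcp, emit_mode=EmitMode.ASYNC_WAIT)
```
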
@@ -119,13 +139,6 @@ DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
 )
 
 
-# Supported with v1.0
-DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
-    RestTraceMode,
-    os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
-)
-
-
 class RequestsSessionConfig(ConfigModel):
     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
 
@@ -282,8 +295,7 @@ class DataHubRestEmitter(Closeable, Emitter):
     _token: Optional[str]
     _session: requests.Session
     _openapi_ingestion: Optional[bool]
-    _default_trace_mode: bool
-    server_config: RestServiceConfig
+    _server_config: RestServiceConfig
 
     def __init__(
         self,
@@ -300,7 +312,6 @@ class DataHubRestEmitter(Closeable, Emitter):
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
         openapi_ingestion: Optional[bool] = None,
-        default_trace_mode: bool = False,
         client_mode: Optional[ClientMode] = None,
         datahub_component: Optional[str] = None,
     ):
@@ -314,15 +325,11 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         self._gms_server = fixup_gms_url(gms_server)
         self._token = token
-        self._default_trace_mode = default_trace_mode
         self._session = requests.Session()
         self._openapi_ingestion = (
             openapi_ingestion  # Re-evaluated after test connection
         )
 
-        if self._default_trace_mode:
-            logger.debug("Using API Tracing for ingestion.")
-
         headers = {
             "X-RestLi-Protocol-Version": "2.0.0",
             "Content-Type": "application/json",
@@ -376,50 +383,88 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         self._session = self._session_config.build_session()
 
-    def test_connection(self) -> None:
-        url = f"{self._gms_server}/config"
-        try:
-            # Create a config instance with session and URL
-            config = RestServiceConfig(session=self._session, url=url)
-            # Attempt to load config, which will throw ConfigurationError if there's an issue
-            config.fetch_config()
-            self.server_config = config
-
-            # Determine OpenAPI mode
-            if self._openapi_ingestion is None:
-                # No constructor parameter
-                if (
-                    not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
-                    and self._session_config.client_mode == ClientMode.SDK
-                    and self.server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
-                ):
-                    # Enable if SDK client and no environment variable specified
-                    self._openapi_ingestion = True
-                else:
-                    # The system env is specifying the value
-                    self._openapi_ingestion = (
-                        DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+    @property
+    def server_config(self) -> RestServiceConfig:
+        return self.fetch_server_config()
+
+    # TODO: This should move to DataHubGraph once it no longer inherits from DataHubRestEmitter
+    def fetch_server_config(self) -> RestServiceConfig:
+        """
+        Fetch configuration from the server if not already loaded.
+
+        Returns:
+            The configuration dictionary
+
+        Raises:
+            ConfigurationError: If there's an error fetching or validating the configuration
+        """
+        if not hasattr(self, "_server_config") or not self._server_config:
+            if self._session is None or self._gms_server is None:
+                raise ConfigurationError(
+                    "Session and URL are required to load configuration"
+                )
+
+            url = f"{self._gms_server}/config"
+            response = self._session.get(url)
+
+            if response.status_code == 200:
+                raw_config = response.json()
+
+                # Validate that we're connected to the correct service
+                if not raw_config.get("noCode") == "true":
+                    raise ConfigurationError(
+                        "You seem to have connected to the frontend service instead of the GMS endpoint. "
+                        "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
+                        "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
                     )
 
-            logger.debug(
-                f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
-            )
+                self._server_config = RestServiceConfig(raw_config=raw_config)
+                self._post_fetch_server_config()
 
-            # Set default tracing for SDK
+            else:
+                logger.debug(
+                    f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
+                )
+
+                if response.status_code == 401:
+                    message = f"Unable to connect to {url} - got an authentication error: {response.text}."
+                else:
+                    message = f"Unable to connect to {url} with status_code: {response.status_code}."
+
+                message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
+                raise ConfigurationError(message)
+
+        return self._server_config
+
+    def _post_fetch_server_config(self) -> None:
+        # Determine OpenAPI mode
+        if self._openapi_ingestion is None:
+            # No constructor parameter
             if (
-                self._session_config.client_mode == ClientMode.SDK
-                and self.server_config.supports_feature(ServiceFeature.API_TRACING)
+                not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
+                and self._session_config.client_mode == ClientMode.SDK
+                and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
             ):
-                # Enable tracing if using SDK & server supported
-                self._default_trace_mode = True
+                # Enable if SDK client and no environment variable specified
+                self._openapi_ingestion = True
+            else:
+                # The system env is specifying the value
+                self._openapi_ingestion = (
+                    DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+                )
+
+        logger.debug(
+            f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
+        )
+        logger.debug(
+            f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
+        )
 
-        except ConfigurationError as e:
-            # Just re-raise the exception
-            raise e
+    def test_connection(self) -> None:
+        self.fetch_server_config()
 
-    def get_server_config(self) -> RestServiceConfig:
-        self.test_connection()
-        return self.server_config
+    def get_server_config(self) -> dict:
+        return self.server_config.raw_config
 
     def to_graph(self) -> "DataHubGraph":
         from datahub.ingestion.graph.client import DataHubGraph
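
With this refactor, the server config is fetched lazily on first access and cached, instead of eagerly inside test_connection(). A rough sketch of the resulting call patterns (the endpoint is hypothetical):

```python
from datahub.emitter.rest_emitter import DataHubRestEmitter

emitter = DataHubRestEmitter(gms_server="http://localhost:8080")  # hypothetical

# First access performs GET {gms_server}/config, checks the "noCode" marker to
# confirm it is talking to GMS (not the frontend), and caches the result.
config = emitter.server_config          # property; fetches lazily
config = emitter.fetch_server_config()  # explicit equivalent

# test_connection() is now a thin wrapper around the same fetch.
emitter.test_connection()

# get_server_config() now returns the raw config dict, not the wrapper object.
raw: dict = emitter.get_server_config()
```
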
@@ -429,16 +474,14 @@ class DataHubRestEmitter(Closeable, Emitter):
     def _to_openapi_request(
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
-        async_flag: Optional[bool] = None,
-        async_default: bool = False,
+        emit_mode: EmitMode,
     ) -> Optional[OpenApiRequest]:
         """
         Convert a MetadataChangeProposal to an OpenAPI request format.
 
         Args:
             mcp: The metadata change proposal
-            async_flag: Optional flag to override async behavior
-            async_default: Default async behavior if not specified
+            emit_mode: Client emit mode
 
         Returns:
             An OpenApiRequest object or None if the MCP doesn't have required fields
@@ -446,8 +489,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         return OpenApiRequest.from_mcp(
             mcp=mcp,
             gms_server=self._gms_server,
-            async_flag=async_flag,
-            async_default=async_default,
+            async_flag=emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT),
+            search_sync_flag=emit_mode == EmitMode.SYNC_WAIT,
         )
 
     def emit(
@@ -459,7 +502,7 @@ class DataHubRestEmitter(Closeable, Emitter):
             UsageAggregation,
         ],
         callback: Optional[Callable[[Exception, str], None]] = None,
-        async_flag: Optional[bool] = None,
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
     ) -> None:
         try:
             if isinstance(item, UsageAggregation):
@@ -467,7 +510,7 @@ class DataHubRestEmitter(Closeable, Emitter):
             elif isinstance(
                 item, (MetadataChangeProposal, MetadataChangeProposalWrapper)
             ):
-                self.emit_mcp(item, async_flag=async_flag)
+                self.emit_mcp(item, emit_mode=emit_mode)
             else:
                 self.emit_mce(item)
         except Exception as e:
@@ -498,41 +541,64 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         self._emit_generic(url, payload)
 
+    @overload
+    @deprecated("Use emit_mode instead of async_flag")
     def emit_mcp(
         self,
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        *,
         async_flag: Optional[bool] = None,
-        trace_flag: Optional[bool] = None,
-        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None: ...
+
+    @overload
+    def emit_mcp(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        *,
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
+    ) -> None: ...
+
+    def emit_mcp(
+        self,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        async_flag: Optional[bool] = None,
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
+        if async_flag is True:
+            emit_mode = EmitMode.ASYNC
+
         ensure_has_system_metadata(mcp)
 
         trace_data = None
 
         if self._openapi_ingestion:
-            request = self._to_openapi_request(mcp, async_flag, async_default=False)
+            request = self._to_openapi_request(mcp, emit_mode)
             if request:
                 response = self._emit_generic(
                     request.url, payload=request.payload, method=request.method
                 )
 
-                if self._should_trace(async_flag, trace_flag):
+                if self._should_trace(emit_mode):
                     trace_data = extract_trace_data(response) if response else None
 
         else:
             url = f"{self._gms_server}/aspects?action=ingestProposal"
 
             mcp_obj = pre_json_transform(mcp.to_obj())
-            payload_dict = {"proposal": mcp_obj}
-
-            if async_flag is not None:
-                payload_dict["async"] = "true" if async_flag else "false"
+            payload_dict = {
+                "proposal": mcp_obj,
+                "async": "true"
+                if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
+                else "false",
+            }
 
             payload = json.dumps(payload_dict)
 
             response = self._emit_generic(url, payload)
 
-            if self._should_trace(async_flag, trace_flag):
+            if self._should_trace(emit_mode):
                 trace_data = (
                     extract_trace_data_from_mcps(response, [mcp]) if response else None
                 )
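
The @overload pair keeps the deprecated async_flag spelling working while steering callers toward emit_mode; at runtime, async_flag=True is simply coerced to EmitMode.ASYNC. A short migration sketch (endpoint and URN hypothetical):

```python
from datetime import timedelta

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DataHubRestEmitter, EmitMode
from datahub.metadata.schema_classes import StatusClass

emitter = DataHubRestEmitter(gms_server="http://localhost:8080")  # hypothetical
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
    aspect=StatusClass(removed=False),
)

# Deprecated spelling: coerced to EmitMode.ASYNC internally.
emitter.emit_mcp(mcp, async_flag=True)

# Preferred spelling, with an explicit bound on how long ASYNC_WAIT may block.
emitter.emit_mcp(
    mcp, emit_mode=EmitMode.ASYNC_WAIT, wait_timeout=timedelta(minutes=5)
)
```
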
@@ -540,15 +606,14 @@ class DataHubRestEmitter(Closeable, Emitter):
         if trace_data:
             self._await_status(
                 [trace_data],
-                trace_timeout,
+                wait_timeout,
             )
 
     def emit_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
-        async_flag: Optional[bool] = None,
-        trace_flag: Optional[bool] = None,
-        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+        emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         if _DATAHUB_EMITTER_TRACE:
             logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
@@ -557,16 +622,15 @@ class DataHubRestEmitter(Closeable, Emitter):
             ensure_has_system_metadata(mcp)
 
         if self._openapi_ingestion:
-            return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
+            return self._emit_openapi_mcps(mcps, emit_mode, wait_timeout)
         else:
-            return self._emit_restli_mcps(mcps, async_flag)
+            return self._emit_restli_mcps(mcps, emit_mode)
 
     def _emit_openapi_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
-        async_flag: Optional[bool] = None,
-        trace_flag: Optional[bool] = None,
-        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+        emit_mode: EmitMode,
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
         1. Grouping MCPs by their HTTP method and entity URL and HTTP method
@@ -580,9 +644,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         The joining logic is efficient with a simple string concatenation
 
         :param mcps: metadata change proposals to transmit
-        :param async_flag: the mode
-        :param trace_flag: whether to trace the requests
-        :param trace_timeout: timeout for tracing
+        :param emit_mode: the mode to emit the MCPs
+        :param wait_timeout: timeout for blocking queue
         :return: number of requests
         """
         # Group by entity URL and HTTP method
@@ -591,7 +654,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         )  # Initialize with one empty Chunk
 
         for mcp in mcps:
-            request = self._to_openapi_request(mcp, async_flag, async_default=True)
+            request = self._to_openapi_request(mcp, emit_mode)
             if request:
                 # Create a composite key with both method and URL
                 key = (request.method, request.url)
@@ -621,7 +684,7 @@ class DataHubRestEmitter(Closeable, Emitter):
                 )
                 responses.append(response)
 
-        if self._should_trace(async_flag, trace_flag, async_default=True):
+        if self._should_trace(emit_mode):
            trace_data = []
            for response in responses:
                data = extract_trace_data(response) if response else None
@@ -629,14 +692,14 @@ class DataHubRestEmitter(Closeable, Emitter):
                     trace_data.append(data)
 
             if trace_data:
-                self._await_status(trace_data, trace_timeout)
+                self._await_status(trace_data, wait_timeout)
 
         return len(responses)
 
     def _emit_restli_mcps(
         self,
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
-        async_flag: Optional[bool] = None,
+        emit_mode: EmitMode,
     ) -> int:
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
 
@@ -671,9 +734,12 @@ class DataHubRestEmitter(Closeable, Emitter):
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
             # the size when chunking, and again for the actual request.
-            payload_dict: dict = {"proposals": mcp_obj_chunk}
-            if async_flag is not None:
-                payload_dict["async"] = "true" if async_flag else "false"
+            payload_dict: dict = {
+                "proposals": mcp_obj_chunk,
+                "async": "true"
+                if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
+                else "false",
+            }
 
             payload = json.dumps(payload_dict)
             self._emit_generic(url, payload)
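
On the rest.li path, the "async" key is now always present in the batch payload, derived from the emit mode instead of being omitted when async_flag was None. A sketch of the wire format under that assumption (the chunk contents are placeholders):

```python
import json

mcp_obj_chunk = [{"entityType": "dataset", "aspectName": "status"}]  # placeholder
emit_mode_is_async = True  # EmitMode.ASYNC or EmitMode.ASYNC_WAIT

# POSTed to {gms_server}/aspects?action=ingestProposalBatch
payload = json.dumps(
    {
        "proposals": mcp_obj_chunk,
        "async": "true" if emit_mode_is_async else "false",
    }
)
```
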
@@ -747,7 +813,7 @@ class DataHubRestEmitter(Closeable, Emitter):
     def _await_status(
         self,
         trace_data: List[TraceData],
-        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
+        wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> None:
         """Verify the status of asynchronous write operations.
         Args:
@@ -757,8 +823,8 @@ class DataHubRestEmitter(Closeable, Emitter):
             TraceTimeoutError: If verification fails or times out
             TraceValidationError: Expected write was not completed successfully
         """
-        if trace_timeout is None:
-            raise ValueError("trace_timeout cannot be None")
+        if wait_timeout is None:
+            raise ValueError("wait_timeout cannot be None")
 
         try:
             if not trace_data:
771
837
  current_backoff = TRACE_INITIAL_BACKOFF
772
838
 
773
839
  while trace.data:
774
- if datetime.now() - start_time > trace_timeout:
840
+ if datetime.now() - start_time > wait_timeout:
775
841
  raise TraceTimeoutError(
776
- f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
842
+ f"Timeout waiting for async write completion after {wait_timeout.total_seconds()} seconds"
777
843
  )
778
844
 
779
845
  base_url = f"{self._gms_server}/openapi/v1/trace/write"
@@ -825,17 +891,28 @@ class DataHubRestEmitter(Closeable, Emitter):
825
891
  logger.error(f"Error during status verification: {str(e)}")
826
892
  raise
827
893
 
828
- def _should_trace(
829
- self,
830
- async_flag: Optional[bool] = None,
831
- trace_flag: Optional[bool] = None,
832
- async_default: bool = False,
833
- ) -> bool:
834
- resolved_trace_flag = (
835
- trace_flag if trace_flag is not None else self._default_trace_mode
836
- )
837
- resolved_async_flag = async_flag if async_flag is not None else async_default
838
- return resolved_trace_flag and resolved_async_flag
894
+ def _should_trace(self, emit_mode: EmitMode, warn: bool = True) -> bool:
895
+ if emit_mode == EmitMode.ASYNC_WAIT:
896
+ if not bool(self._openapi_ingestion):
897
+ if warn:
898
+ logger.warning(
899
+ f"{emit_mode} requested but is only available when using OpenAPI."
900
+ )
901
+ return False
902
+ elif getattr(
903
+ self, "server_config", None
904
+ ) is None or not self.server_config.supports_feature(
905
+ ServiceFeature.API_TRACING
906
+ ):
907
+ if warn:
908
+ logger.warning(
909
+ f"{emit_mode} requested but is only available with a newer GMS version."
910
+ )
911
+ return False
912
+ else:
913
+ return True
914
+ else:
915
+ return False
839
916
 
840
917
  def __repr__(self) -> str:
841
918
  token_str = (
datahub/ingestion/extractor/schema_util.py

@@ -290,6 +290,12 @@ class AvroToMceSchemaConverter:
         This way we can use the type/description of the non-null type if needed.
         """
 
+        # props to skip when building jsonProps
+        json_props_to_skip = [
+            "_nullable",
+            "native_data_type",
+        ]
+
         def __init__(
             self,
             schema: SchemaOrField,
407
413
  or self._actual_schema.props.get("logicalType"),
408
414
  )
409
415
 
416
+ json_props: Optional[Dict[str, Any]] = (
417
+ {
418
+ k: v
419
+ for k, v in merged_props.items()
420
+ if k not in self.json_props_to_skip
421
+ }
422
+ if merged_props
423
+ else None
424
+ )
425
+
410
426
  field = SchemaField(
411
427
  fieldPath=field_path,
412
428
  # Populate it with the simple native type for now.
@@ -421,7 +437,7 @@ class AvroToMceSchemaConverter:
                 isPartOfKey=self._converter._is_key_schema,
                 globalTags=tags_aspect,
                 glossaryTerms=meta_terms_aspect,
-                jsonProps=json.dumps(merged_props) if merged_props else None,
+                jsonProps=json.dumps(json_props) if json_props else None,
             )
             yield field
 
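A standalone sketch of the filtering this introduces, with illustrative values: internal markers such as "_nullable" and "native_data_type" no longer leak into the serialized jsonProps.

```python
from typing import Any, Dict, Optional

json_props_to_skip = ["_nullable", "native_data_type"]
merged_props: Dict[str, Any] = {
    "_nullable": True,                   # internal marker, dropped
    "native_data_type": "varchar(100)",  # internal marker, dropped
    "description": "customer email",     # ordinary prop, kept
}

json_props: Optional[Dict[str, Any]] = (
    {k: v for k, v in merged_props.items() if k not in json_props_to_skip}
    if merged_props
    else None
)
assert json_props == {"description": "customer email"}
```
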
datahub/ingestion/graph/client.py

@@ -34,9 +34,7 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
 from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.rest_emitter import (
-    DEFAULT_REST_TRACE_MODE,
     DatahubRestEmitter,
-    RestTraceMode,
 )
 from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.graph.config import (
@@ -54,6 +52,7 @@ from datahub.ingestion.graph.filters import (
     RemovedStatusFilter,
     generate_filter,
 )
+from datahub.ingestion.graph.links import make_url_for_urn
 from datahub.ingestion.source.state.checkpoint import Checkpoint
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
@@ -158,7 +157,6 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
             openapi_ingestion=self.config.openapi_ingestion,
-            default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
             client_mode=config.client_mode,
             datahub_component=config.datahub_component,
         )
@@ -187,6 +185,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         """Get the public-facing base url of the frontend
 
         This url can be used to construct links to the frontend. The url will not include a trailing slash.
+
         Note: Only supported with DataHub Cloud.
         """
 
@@ -198,6 +197,20 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             raise ValueError("baseUrl not found in server config")
         return base_url
 
+    def url_for(self, entity_urn: Union[str, Urn]) -> str:
+        """Get the UI url for an entity.
+
+        Note: Only supported with DataHub Cloud.
+
+        Args:
+            entity_urn: The urn of the entity to get the url for.
+
+        Returns:
+            The public-facing url for the entity.
+        """
+
+        return make_url_for_urn(self.frontend_base_url, str(entity_urn))
+
     @classmethod
     def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
         session_config = emitter._session_config
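
A minimal sketch of the new url_for helper in use. The instance below is hypothetical, and since url_for builds on frontend_base_url, it only works against DataHub Cloud:

```python
from datahub.ingestion.graph.client import DataHubGraph
from datahub.ingestion.graph.config import DatahubClientConfig

# Hypothetical DataHub Cloud instance.
graph = DataHubGraph(DatahubClientConfig(server="https://example.acryl.io/gms"))

url = graph.url_for(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"
)
# e.g. https://example.acryl.io/dataset/urn:li:dataset:(...)/
```
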
@@ -361,7 +374,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         )
 
     def get_config(self) -> Dict[str, Any]:
-        return self.get_server_config().config
+        return self.server_config.raw_config
 
     def get_ownership(self, entity_urn: str) -> Optional[OwnershipClass]:
         return self.get_aspect(entity_urn=entity_urn, aspect_type=OwnershipClass)
datahub/ingestion/graph/links.py (new file)

@@ -0,0 +1,53 @@
+from typing import Optional
+
+import datahub.metadata.urns as urns
+from datahub.utilities.urns.urn import guess_entity_type
+
+_url_prefixes = {
+    # Atypical mappings.
+    urns.DataJobUrn.ENTITY_TYPE: "tasks",
+    urns.DataFlowUrn.ENTITY_TYPE: "pipelines",
+    urns.CorpUserUrn.ENTITY_TYPE: "user",
+    urns.CorpGroupUrn.ENTITY_TYPE: "group",
+    # Normal mappings - matches the entity type.
+    urns.ChartUrn.ENTITY_TYPE: "chart",
+    urns.ContainerUrn.ENTITY_TYPE: "container",
+    urns.DataProductUrn.ENTITY_TYPE: "dataProduct",
+    urns.DatasetUrn.ENTITY_TYPE: "dataset",
+    urns.DashboardUrn.ENTITY_TYPE: "dashboard",
+    urns.DomainUrn.ENTITY_TYPE: "domain",
+    urns.GlossaryNodeUrn.ENTITY_TYPE: "glossaryNode",
+    urns.GlossaryTermUrn.ENTITY_TYPE: "glossaryTerm",
+    urns.TagUrn.ENTITY_TYPE: "tag",
+}
+
+
+def make_url_for_urn(
+    frontend_base_url: str,
+    entity_urn: str,
+    *,
+    tab: Optional[str] = None,
+) -> str:
+    """Build the public-facing URL for an entity urn.
+
+    Args:
+        frontend_base_url: The public-facing base url of the frontend.
+        entity_urn: The urn of the entity to get the url for.
+        tab: The tab to deep link into. If not provided, the default tab for the entity will be shown.
+
+    Returns:
+        The public-facing url for the entity.
+
+    Examples:
+        >>> make_url_for_urn("https://demo.datahub.com", "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992", tab="Contents")
+        'https://demo.datahub.com/container/urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992/Contents'
+        >>> make_url_for_urn("https://demo.datahub.com", "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)")
+        'https://demo.datahub.com/dataset/urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)/'
+    """
+    entity_type = guess_entity_type(entity_urn)
+
+    url_prefix = _url_prefixes.get(entity_type, entity_type)
+    url = f"{frontend_base_url}/{url_prefix}/{entity_urn}/"
+    if tab:
+        url += f"{tab}"
+    return url