acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic by the registry.

Files changed (106)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2391 -2392
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +105 -88
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  59. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  62. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  63. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  64. datahub/ingestion/source/sql/mssql/source.py +8 -4
  65. datahub/ingestion/source/sql/oracle.py +51 -4
  66. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  67. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  68. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  69. datahub/ingestion/source/superset.py +291 -35
  70. datahub/ingestion/source/usage/usage_common.py +0 -65
  71. datahub/ingestion/source/vertexai/__init__.py +0 -0
  72. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  73. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  74. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  75. datahub/metadata/_schema_classes.py +472 -1
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  80. datahub/metadata/schema.avsc +313 -2
  81. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  82. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  83. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  84. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  85. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  86. datahub/metadata/schemas/Deprecation.avsc +2 -0
  87. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  89. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  90. datahub/metadata/schemas/Siblings.avsc +2 -0
  91. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  92. datahub/sdk/__init__.py +1 -0
  93. datahub/sdk/dataset.py +122 -0
  94. datahub/sdk/entity.py +99 -3
  95. datahub/sdk/entity_client.py +27 -3
  96. datahub/sdk/main_client.py +24 -1
  97. datahub/sdk/search_client.py +81 -8
  98. datahub/sdk/search_filters.py +94 -37
  99. datahub/sql_parsing/split_statements.py +17 -3
  100. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  101. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  102. datahub/testing/mcp_diff.py +1 -18
  103. datahub/utilities/threaded_iterator_executor.py +16 -3
  104. datahub/ingestion/source/vertexai.py +0 -697
  105. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  106. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py CHANGED
@@ -506,7 +506,7 @@ class Dataset(StrictModel):
  # We don't check references for tags
  return list(set(references))

- def generate_mcp(  # noqa: C901
+ def generate_mcp(
  self,
  ) -> Iterable[Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]]:
  mcp = MetadataChangeProposalWrapper(
@@ -643,33 +643,6 @@ class Dataset(StrictModel):
  )
  assert field_urn.startswith("urn:li:schemaField:")

- if field.globalTags:
- mcp = MetadataChangeProposalWrapper(
- entityUrn=field_urn,
- aspect=GlobalTagsClass(
- tags=[
- TagAssociationClass(tag=make_tag_urn(tag))
- for tag in field.globalTags
- ]
- ),
- )
- yield mcp
-
- if field.glossaryTerms:
- mcp = MetadataChangeProposalWrapper(
- entityUrn=field_urn,
- aspect=GlossaryTermsClass(
- terms=[
- GlossaryTermAssociationClass(
- urn=make_term_urn(term)
- )
- for term in field.glossaryTerms
- ],
- auditStamp=self._mint_auditstamp("yaml"),
- ),
- )
- yield mcp
-
  if field.structured_properties:
  urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
  mcp = MetadataChangeProposalWrapper(
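
For orientation, a minimal sketch of how generate_mcp is consumed; this mirrors the --to-datahub path of the CLI sync command shown below. The YAML path is a placeholder and a configured, reachable DataHub instance is assumed:

    from datahub.api.entities.dataset.dataset import Dataset
    from datahub.ingestion.graph.client import get_default_graph

    # Mirrors what `datahub dataset sync --to-datahub` does under the hood.
    with get_default_graph() as graph:
        for dataset in Dataset.from_yaml("datasets.yaml"):
            for mcp in dataset.generate_mcp():
                graph.emit(mcp)
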
datahub/cli/specific/dataset_cli.py CHANGED
@@ -29,13 +29,16 @@ def dataset() -> None:
  name="upsert",
  )
  @click.option("-f", "--file", required=True, type=click.Path(exists=True))
+ @click.option(
+ "-n", "--dry-run", type=bool, is_flag=True, default=False, help="Perform a dry run"
+ )
  @upgrade.check_upgrade
  @telemetry.with_telemetry()
- def upsert(file: Path) -> None:
+ def upsert(file: Path, dry_run: bool) -> None:
  """Upsert attributes to a Dataset in DataHub."""
  # Call the sync command with to_datahub=True to perform the upsert operation
  ctx = click.get_current_context()
- ctx.invoke(sync, file=str(file), to_datahub=True)
+ ctx.invoke(sync, file=str(file), dry_run=dry_run, to_datahub=True)


  @dataset.command(
@@ -167,11 +170,16 @@ def file(lintcheck: bool, lintfix: bool, file: str) -> None:
  )
  @click.option("-f", "--file", required=True, type=click.Path(exists=True))
  @click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+ @click.option(
+ "-n", "--dry-run", type=bool, is_flag=True, default=False, help="Perform a dry run"
+ )
  @upgrade.check_upgrade
  @telemetry.with_telemetry()
- def sync(file: str, to_datahub: bool) -> None:
+ def sync(file: str, to_datahub: bool, dry_run: bool) -> None:
  """Sync a Dataset file to/from DataHub"""

+ dry_run_prefix = "[dry-run]: " if dry_run else ""  # prefix to use in messages
+
  failures: List[str] = []
  with get_default_graph() as graph:
  datasets = Dataset.from_yaml(file)
@@ -189,7 +197,7 @@ def sync(file: str, to_datahub: bool) -> None:
  click.secho(
  "\n\t- ".join(
  [
- f"Skipping Dataset {dataset.urn} due to missing entity references: "
+ f"{dry_run_prefix}Skipping Dataset {dataset.urn} due to missing entity references: "
  ]
  + missing_entity_references
  ),
@@ -199,13 +207,18 @@ def sync(file: str, to_datahub: bool) -> None:
  continue
  try:
  for mcp in dataset.generate_mcp():
- graph.emit(mcp)
- click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+ if not dry_run:
+ graph.emit(mcp)
+ click.secho(
+ f"{dry_run_prefix}Update succeeded for urn {dataset.urn}.",
+ fg="green",
+ )
  except Exception as e:
  click.secho(
- f"Update failed for id {id}. due to {e}",
+ f"{dry_run_prefix}Update failed for id {id}. due to {e}",
  fg="red",
  )
+ failures.append(dataset.urn)
  else:
  # Sync from DataHub
  if graph.exists(dataset.urn):
@@ -215,13 +228,16 @@ def sync(file: str, to_datahub: bool) -> None:
  existing_dataset: Dataset = Dataset.from_datahub(
  graph=graph, urn=dataset.urn, config=dataset_get_config
  )
- existing_dataset.to_yaml(Path(file))
+ if not dry_run:
+ existing_dataset.to_yaml(Path(file))
+ else:
+ click.secho(f"{dry_run_prefix}Will update file {file}")
  else:
- click.secho(f"Dataset {dataset.urn} does not exist")
+ click.secho(f"{dry_run_prefix}Dataset {dataset.urn} does not exist")
  failures.append(dataset.urn)
  if failures:
  click.secho(
- f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+ f"\n{dry_run_prefix}Failed to sync the following Datasets: {', '.join(failures)}",
  fg="red",
  )
  raise click.Abort()
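
The new -n/--dry-run flag is forwarded from upsert into sync and only controls whether emits and file writes actually happen; reference validation still runs, so a configured and reachable DataHub instance is assumed. A hedged sketch using click's test runner (the file path is a placeholder):

    from click.testing import CliRunner

    from datahub.cli.specific.dataset_cli import dataset

    runner = CliRunner()
    result = runner.invoke(
        dataset, ["sync", "-f", "datasets.yaml", "--to-datahub", "--dry-run"]
    )
    print(result.output)  # dry-run messages carry the "[dry-run]: " prefix
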
datahub/emitter/mce_builder.py CHANGED
@@ -125,9 +125,7 @@ def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:


  def make_data_platform_urn(platform: str) -> str:
- if platform.startswith("urn:li:dataPlatform:"):
- return platform
- return DataPlatformUrn.create_from_id(platform).urn()
+ return DataPlatformUrn(platform).urn()


  def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
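
Behavior for a plain platform name is unchanged by the simplification; urn construction is simply delegated to DataPlatformUrn:

    from datahub.emitter.mce_builder import make_data_platform_urn

    assert make_data_platform_urn("snowflake") == "urn:li:dataPlatform:snowflake"
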
datahub/emitter/mcp_builder.py CHANGED
@@ -117,6 +117,14 @@ class ContainerKey(DatahubKey):
  PlatformKey = ContainerKey


+ class NamespaceKey(ContainerKey):
+ """
+ For Iceberg namespaces (databases/schemas)
+ """
+
+ namespace: str
+
+
  class DatabaseKey(ContainerKey):
  database: str

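NamespaceKey gives Iceberg namespaces the same container-key treatment as databases and schemas. A hedged sketch of minting a container URN from it (field values are illustrative; instance/env handling is assumed to follow the existing ContainerKey behavior):

    from datahub.emitter.mcp_builder import NamespaceKey

    key = NamespaceKey(platform="iceberg", namespace="analytics", env="PROD")
    container_urn = key.as_urn()  # stable guid-based urn, like the other ContainerKey subclasses
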
datahub/emitter/request_helper.py CHANGED
@@ -1,8 +1,8 @@
- import itertools
  import shlex
- from typing import List, Union
+ from typing import List, Optional, Union

  import requests
+ from requests.auth import HTTPBasicAuth


  def _format_header(name: str, value: Union[str, bytes]) -> str:
@@ -12,17 +12,22 @@ def _format_header(name: str, value: Union[str, bytes]) -> str:


  def make_curl_command(
- session: requests.Session, method: str, url: str, payload: str
+ session: requests.Session, method: str, url: str, payload: Optional[str] = None
  ) -> str:
- fragments: List[str] = [
- "curl",
- *itertools.chain(
- *[
- ("-X", method),
- *[("-H", _format_header(k, v)) for (k, v) in session.headers.items()],
- ("--data", payload),
- ]
- ),
- url,
- ]
+ fragments: List[str] = ["curl", "-X", method]
+
+ for header_name, header_value in session.headers.items():
+ fragments.extend(["-H", _format_header(header_name, header_value)])
+
+ if session.auth:
+ if isinstance(session.auth, HTTPBasicAuth):
+ fragments.extend(["-u", f"{session.auth.username}:<redacted>"])
+ else:
+ # For other auth types, they should be handled via headers
+ fragments.extend(["-H", "<unknown auth type>"])
+
+ if payload:
+ fragments.extend(["--data", payload])
+
+ fragments.append(url)
  return shlex.join(fragments)
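
The rewritten helper makes the payload optional and surfaces session auth as a redacted -u flag (previously auth was not represented at all). A small example; the URL and credentials are illustrative and the real output also includes the session's default headers:

    import requests
    from requests.auth import HTTPBasicAuth

    from datahub.emitter.request_helper import make_curl_command

    session = requests.Session()
    session.auth = HTTPBasicAuth("datahub", "secret")

    # Contains -u 'datahub:<redacted>' rather than the real password; no --data since payload is omitted.
    print(make_curl_command(session, "GET", "http://localhost:8080/health"))
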
datahub/emitter/response_helper.py CHANGED
@@ -1,17 +1,21 @@
  import json
  import logging
+ import warnings
  from dataclasses import dataclass
  from typing import Dict, List, Optional, Sequence, Union

  from requests import Response

  from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.errors import APITracingWarning
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
  MetadataChangeProposal,
  )

  logger = logging.getLogger(__name__)

+ _TRACE_HEADER_NAME = "traceparent"
+

  @dataclass
  class TraceData:
@@ -25,14 +29,11 @@ class TraceData:
  raise TypeError("data must be a dictionary")


- def _extract_trace_id(
- response: Response, trace_header: str = "traceparent"
- ) -> Optional[str]:
+ def _extract_trace_id(response: Response) -> Optional[str]:
  """
  Extract trace ID from response headers.
  Args:
  response: HTTP response object
- trace_header: Name of the trace header to use
  Returns:
  Trace ID if found and response is valid, None otherwise
  """
@@ -40,9 +41,17 @@ def _extract_trace_id(
  logger.debug(f"Invalid status code: {response.status_code}")
  return None

- trace_id = response.headers.get(trace_header)
+ trace_id = response.headers.get(_TRACE_HEADER_NAME)
  if not trace_id:
- logger.debug(f"Missing trace header: {trace_header}")
+ # This will only be printed if
+ # 1. we're in async mode (checked by the caller)
+ # 2. the server did not return a trace ID
+ logger.debug(f"Missing trace header: {_TRACE_HEADER_NAME}")
+ warnings.warn(
+ "No trace ID found in response headers. API tracing is not active - likely due to an outdated server version.",
+ APITracingWarning,
+ stacklevel=3,
+ )
  return None

  return trace_id
@@ -51,20 +60,19 @@ def _extract_trace_id(
  def extract_trace_data(
  response: Response,
  aspects_to_trace: Optional[List[str]] = None,
- trace_header: str = "traceparent",
  ) -> Optional[TraceData]:
- """
- Extract trace data from a response object.
+ """Extract trace data from a response object.
+
+ If we run into a JSONDecodeError, we'll log an error and return None.
+
  Args:
  response: HTTP response object
  aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
- trace_header: Name of the trace header to use (default: "traceparent")
+
  Returns:
  TraceData object if successful, None otherwise
- Raises:
- JSONDecodeError: If response body cannot be decoded as JSON
  """
- trace_id = _extract_trace_id(response, trace_header)
+ trace_id = _extract_trace_id(response)
  if not trace_id:
  return None

@@ -104,19 +112,18 @@ def extract_trace_data_from_mcps(
  response: Response,
  mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
  aspects_to_trace: Optional[List[str]] = None,
- trace_header: str = "traceparent",
  ) -> Optional[TraceData]:
- """
- Extract trace data from a response object and populate data from provided MCPs.
+ """Extract trace data from a response object and populate data from provided MCPs.
+
  Args:
  response: HTTP response object used only for trace_id extraction
  mcps: List of MCP URN and aspect data
  aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
- trace_header: Name of the trace header to use (default: "traceparent")
+
  Returns:
  TraceData object if successful, None otherwise
  """
- trace_id = _extract_trace_id(response, trace_header)
+ trace_id = _extract_trace_id(response)
  if not trace_id:
  return None

datahub/emitter/rest_emitter.py CHANGED
@@ -5,6 +5,7 @@ import json
  import logging
  import os
  import time
+ import warnings
  from collections import defaultdict
  from dataclasses import dataclass
  from datetime import datetime, timedelta
@@ -24,9 +25,9 @@ from typing import (
  )

  import pydantic
  import requests
- from deprecated import deprecated
  from requests.adapters import HTTPAdapter, Retry
  from requests.exceptions import HTTPError, RequestException
+ from typing_extensions import deprecated

  from datahub._version import nice_version_name
  from datahub.cli import config_utils
@@ -40,7 +41,7 @@ from datahub.configuration.common import (
  TraceTimeoutError,
  TraceValidationError,
  )
- from datahub.emitter.aspect import JSON_CONTENT_TYPE
+ from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
  from datahub.emitter.generic_emitter import Emitter
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.request_helper import make_curl_command
@@ -50,6 +51,7 @@ from datahub.emitter.response_helper import (
  extract_trace_data_from_mcps,
  )
  from datahub.emitter.serialization_helper import pre_json_transform
+ from datahub.errors import APITracingWarning
  from datahub.ingestion.api.closeable import Closeable
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
  MetadataChangeEvent,
@@ -107,9 +109,9 @@ class RestSinkEndpoint(ConfigEnum):
  OPENAPI = auto()


- DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
+ DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
  RestSinkEndpoint,
- os.getenv("DATAHUB_REST_SINK_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
+ os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
  )


@@ -227,7 +229,7 @@ class DataHubRestEmitter(Closeable, Emitter):
  ca_certificate_path: Optional[str] = None,
  client_certificate_path: Optional[str] = None,
  disable_ssl_verification: bool = False,
- openapi_ingestion: bool = False,
+ openapi_ingestion: bool = (
+ DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+ ),
  default_trace_mode: bool = False,
  ):
  if not gms_server:
@@ -357,8 +361,14 @@ class DataHubRestEmitter(Closeable, Emitter):
  )["aspect"]["json"]
  else:
  obj = mcp.aspect.to_obj()
- if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
+ content_type = obj.get("contentType")
+ if obj.get("value") and content_type == JSON_CONTENT_TYPE:
+ # Undo double serialization.
  obj = json.loads(obj["value"])
+ elif content_type == JSON_PATCH_CONTENT_TYPE:
+ raise NotImplementedError(
+ "Patches are not supported for OpenAPI ingestion. Set the endpoint to RESTLI."
+ )
  aspect_value = pre_json_transform(obj)
  return (
  url,
@@ -597,7 +607,7 @@ class DataHubRestEmitter(Closeable, Emitter):

  return len(mcp_obj_chunks)

- @deprecated
+ @deprecated("Use emit with a datasetUsageStatistics aspect instead")
  def emit_usage(self, usageStats: UsageAggregation) -> None:
  url = f"{self._gms_server}/usageStats?action=batchIngest"

@@ -749,6 +759,12 @@ class DataHubRestEmitter(Closeable, Emitter):
  trace_flag if trace_flag is not None else self._default_trace_mode
  )
  resolved_async_flag = async_flag if async_flag is not None else async_default
+ if resolved_trace_flag and not resolved_async_flag:
+ warnings.warn(
+ "API tracing is only available with async ingestion. For sync mode, API errors will be surfaced as exceptions.",
+ APITracingWarning,
+ stacklevel=3,
+ )
  return resolved_trace_flag and resolved_async_flag

  def __repr__(self) -> str:
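
Both the endpoint constant and its controlling environment variable move from the SINK naming to EMITTER naming, and the constructor now respects that default. A hedged sketch of forcing OpenAPI ingestion explicitly instead of relying on DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT (the server URL is a placeholder):

    from datahub.emitter.rest_emitter import DataHubRestEmitter

    emitter = DataHubRestEmitter(
        gms_server="http://localhost:8080",
        openapi_ingestion=True,  # overrides the environment-derived default
    )
    emitter.test_connection()
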
datahub/errors.py CHANGED
@@ -31,5 +31,13 @@ class MultipleSubtypesWarning(Warning):
  pass


+ class SearchFilterWarning(Warning):
+ pass
+
+
  class ExperimentalWarning(Warning):
  pass
+
+
+ class APITracingWarning(Warning):
+ pass
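
The new warning categories plug into the standard warnings machinery, so callers can tune the new diagnostics per category, for example:

    import warnings

    from datahub.errors import APITracingWarning, SearchFilterWarning

    warnings.filterwarnings("ignore", category=APITracingWarning)  # silence tracing notices
    warnings.filterwarnings("error", category=SearchFilterWarning)  # escalate filter issues instead
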
datahub/ingestion/api/source.py CHANGED
@@ -27,6 +27,7 @@ from typing_extensions import LiteralString, Self

  from datahub.configuration.common import ConfigModel
  from datahub.configuration.source_common import PlatformInstanceConfigMixin
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.mcp_builder import mcps_from_mce
  from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
  auto_patch_last_modified,
@@ -44,11 +45,13 @@ from datahub.ingestion.api.source_helpers import (
  auto_lowercase_urns,
  auto_materialize_referenced_tags_terms,
  auto_status_aspect,
+ auto_workunit,
  auto_workunit_reporter,
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
  from datahub.metadata.schema_classes import UpstreamLineageClass
+ from datahub.sdk.entity import Entity
  from datahub.utilities.lossy_collections import LossyDict, LossyList
  from datahub.utilities.type_annotations import get_class_from_annotation

@@ -473,10 +476,12 @@ class Source(Closeable, metaclass=ABCMeta):

  def get_workunits(self) -> Iterable[MetadataWorkUnit]:
  return self._apply_workunit_processors(
- self.get_workunit_processors(), self.get_workunits_internal()
+ self.get_workunit_processors(), auto_workunit(self.get_workunits_internal())
  )

- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ def get_workunits_internal(
+ self,
+ ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
  raise NotImplementedError(
  "get_workunits_internal must be implemented if get_workunits is not overriden."
  )
datahub/ingestion/api/source_helpers.py CHANGED
@@ -35,6 +35,7 @@ from datahub.metadata.schema_classes import (
  TimeWindowSizeClass,
  )
  from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
+ from datahub.sdk.entity import Entity
  from datahub.specific.dataset import DatasetPatchBuilder
  from datahub.telemetry import telemetry
  from datahub.utilities.urns.error import InvalidUrnError
@@ -48,7 +49,14 @@ logger = logging.getLogger(__name__)


  def auto_workunit(
- stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]],
+ stream: Iterable[
+ Union[
+ MetadataChangeEventClass,
+ MetadataChangeProposalWrapper,
+ MetadataWorkUnit,
+ Entity,
+ ]
+ ],
  ) -> Iterable[MetadataWorkUnit]:
  """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""

@@ -58,8 +66,12 @@
  id=MetadataWorkUnit.generate_workunit_id(item),
  mce=item,
  )
- else:
+ elif isinstance(item, MetadataChangeProposalWrapper):
  yield item.as_workunit()
+ elif isinstance(item, Entity):
+ yield from item.as_workunits()
+ else:
+ yield item


  def create_dataset_props_patch_builder(
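
auto_workunit now passes pre-built work units through unchanged and expands SDK Entity objects, in addition to wrapping MCEs and MCPs. A small sketch with an MCP (the dataset urn is illustrative):

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.source_helpers import auto_workunit
    from datahub.metadata.schema_classes import StatusClass

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
        aspect=StatusClass(removed=False),
    )
    workunits = list(auto_workunit([mcp]))  # one MetadataWorkUnit wrapping the MCP
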
datahub/ingestion/extractor/schema_util.py CHANGED
@@ -362,6 +362,7 @@ class AvroToMceSchemaConverter:
  merged_props: Dict[str, Any] = {}
  merged_props.update(self._schema.other_props)
  merged_props.update(schema.other_props)
+ merged_props.update(actual_schema.other_props)

  # Parse meta_mapping
  meta_aspects: Dict[str, Any] = {}
datahub/ingestion/graph/client.py CHANGED
@@ -23,9 +23,9 @@ from typing import (
  )

  from avro.schema import RecordSchema
- from deprecated import deprecated
  from pydantic import BaseModel
  from requests.models import HTTPError
+ from typing_extensions import deprecated

  from datahub.cli import config_utils
  from datahub.configuration.common import ConfigModel, GraphError, OperationalError
@@ -33,7 +33,7 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
  from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.rest_emitter import (
- DEFAULT_REST_SINK_ENDPOINT,
+ DEFAULT_REST_EMITTER_ENDPOINT,
  DEFAULT_REST_TRACE_MODE,
  DatahubRestEmitter,
  RestSinkEndpoint,
@@ -49,6 +49,7 @@ from datahub.ingestion.graph.connections import (
  )
  from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
  from datahub.ingestion.graph.filters import (
+ RawSearchFilter,
  RawSearchFilterRule,
  RemovedStatusFilter,
  generate_filter,
@@ -75,10 +76,11 @@ from datahub.metadata.schema_classes import (
  SystemMetadataClass,
  TelemetryClientIdClass,
  )
+ from datahub.metadata.urns import CorpUserUrn, Urn
  from datahub.telemetry.telemetry import telemetry_instance
  from datahub.utilities.perf_timer import PerfTimer
  from datahub.utilities.str_enum import StrEnum
- from datahub.utilities.urns.urn import Urn, guess_entity_type
+ from datahub.utilities.urns.urn import guess_entity_type

  if TYPE_CHECKING:
  from datahub.ingestion.sink.datahub_rest import (
@@ -116,7 +118,7 @@ def entity_type_to_graphql(entity_type: str) -> str:
  """Convert the entity types into GraphQL "EntityType" enum values."""

  # Hard-coded special cases.
- if entity_type == "corpuser":
+ if entity_type == CorpUserUrn.ENTITY_TYPE:
  return "CORP_USER"

  # Convert camelCase to UPPER_UNDERSCORE.
@@ -133,6 +135,14 @@ def entity_type_to_graphql(entity_type: str) -> str:
  return entity_type


+ def flexible_entity_type_to_graphql(entity_type: str) -> str:
+ if entity_type.upper() == entity_type:
+ # Assume that we were passed a graphql EntityType enum value,
+ # so no conversion is needed.
+ return entity_type
+ return entity_type_to_graphql(entity_type)
+
+
  class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  def __init__(self, config: DatahubClientConfig) -> None:
  self.config = config
@@ -147,7 +157,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  ca_certificate_path=self.config.ca_certificate_path,
  client_certificate_path=self.config.client_certificate_path,
  disable_ssl_verification=self.config.disable_ssl_verification,
- openapi_ingestion=DEFAULT_REST_SINK_ENDPOINT == RestSinkEndpoint.OPENAPI,
+ openapi_ingestion=DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI,
  default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
  )

@@ -330,7 +340,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  f"Failed to find {aspect_type_name} in response {response_json}"
  )

- @deprecated(reason="Use get_aspect instead which makes aspect string name optional")
+ @deprecated("Use get_aspect instead which makes aspect string name optional")
  def get_aspect_v2(
  self,
  entity_urn: str,
@@ -355,7 +365,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  def get_schema_metadata(self, entity_urn: str) -> Optional[SchemaMetadataClass]:
  return self.get_aspect(entity_urn=entity_urn, aspect_type=SchemaMetadataClass)

- @deprecated(reason="Use get_aspect directly.")
+ @deprecated("Use get_aspect directly.")
  def get_domain_properties(self, entity_urn: str) -> Optional[DomainPropertiesClass]:
  return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainPropertiesClass)

@@ -376,7 +386,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  def get_domain(self, entity_urn: str) -> Optional[DomainsClass]:
  return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass)

- @deprecated(reason="Use get_aspect directly.")
+ @deprecated("Use get_aspect directly.")
  def get_browse_path(self, entity_urn: str) -> Optional[BrowsePathsClass]:
  return self.get_aspect(entity_urn=entity_urn, aspect_type=BrowsePathsClass)

@@ -505,7 +515,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  return response.json()

  @deprecated(
- reason="Use get_aspect for a single aspect or get_entity_semityped for a full entity."
+ "Use get_aspect for a single aspect or get_entity_semityped for a full entity."
  )
  def get_aspects_for_entity(
  self,
@@ -635,9 +645,6 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  def _aspect_count_endpoint(self):
  return f"{self.config.server}/aspects?action=getCount"

- # def _session(self) -> Session:
- # return super()._session
-
  def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
  """Retrieve a domain urn based on its name. Returns None if there is no match found"""

@@ -749,9 +756,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

  assert res["upsertConnection"]["urn"] == urn

- @deprecated(
- reason='Use get_urns_by_filter(entity_types=["container"], ...) instead'
- )
+ @deprecated('Use get_urns_by_filter(entity_types=["container"], ...) instead')
  def get_container_urns_by_filter(
  self,
  env: Optional[str] = None,
@@ -810,7 +815,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

  :return: An iterable of (urn, schema info) tuple that match the filters.
  """
- types = [entity_type_to_graphql("dataset")]
+ types = self._get_types(["dataset"])

  # Add the query default of * if no query is specified.
  query = query or "*"
@@ -878,10 +883,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  env: Optional[str] = None,
  query: Optional[str] = None,
  container: Optional[str] = None,
- status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
+ status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
  batch_size: int = 10000,
  extraFilters: Optional[List[RawSearchFilterRule]] = None,
- extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
+ extra_or_filters: Optional[RawSearchFilter] = None,
  ) -> Iterable[str]:
  """Fetch all urns that match all of the given filters.

@@ -973,7 +978,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
  batch_size: int = 10000,
  extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
- extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
+ extra_or_filters: Optional[RawSearchFilter] = None,
  extra_source_fields: Optional[List[str]] = None,
  skip_cache: bool = False,
  ) -> Iterable[dict]:
@@ -1126,7 +1131,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
  )

  types = [
- entity_type_to_graphql(entity_type) for entity_type in entity_types
+ flexible_entity_type_to_graphql(entity_type)
+ for entity_type in entity_types
  ]
  return types
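
With the type lookup routed through flexible_entity_type_to_graphql, entity types can be passed either as lowercase urn entity types or as already-uppercased GraphQL enum values. A hedged search sketch against a local server (URL, query, and types are illustrative):

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
    urns = list(
        graph.get_urns_by_filter(
            entity_types=["dataset", "DASHBOARD"],  # mixed styles are normalized
            query="orders",
            batch_size=1000,
        )
    )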