acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py
CHANGED

@@ -506,7 +506,7 @@ class Dataset(StrictModel):
         # We don't check references for tags
         return list(set(references))
 
-    def generate_mcp(
+    def generate_mcp(
         self,
     ) -> Iterable[Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]]:
         mcp = MetadataChangeProposalWrapper(
@@ -643,33 +643,6 @@ class Dataset(StrictModel):
             )
             assert field_urn.startswith("urn:li:schemaField:")
 
-            if field.globalTags:
-                mcp = MetadataChangeProposalWrapper(
-                    entityUrn=field_urn,
-                    aspect=GlobalTagsClass(
-                        tags=[
-                            TagAssociationClass(tag=make_tag_urn(tag))
-                            for tag in field.globalTags
-                        ]
-                    ),
-                )
-                yield mcp
-
-            if field.glossaryTerms:
-                mcp = MetadataChangeProposalWrapper(
-                    entityUrn=field_urn,
-                    aspect=GlossaryTermsClass(
-                        terms=[
-                            GlossaryTermAssociationClass(
-                                urn=make_term_urn(term)
-                            )
-                            for term in field.glossaryTerms
-                        ],
-                        auditStamp=self._mint_auditstamp("yaml"),
-                    ),
-                )
-                yield mcp
-
             if field.structured_properties:
                 urn_prefix = f"{StructuredPropertyUrn.URN_PREFIX}:{StructuredPropertyUrn.LI_DOMAIN}:{StructuredPropertyUrn.ENTITY_TYPE}"
                 mcp = MetadataChangeProposalWrapper(
datahub/cli/specific/dataset_cli.py
CHANGED

@@ -29,13 +29,16 @@ def dataset() -> None:
     name="upsert",
 )
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
+@click.option(
+    "-n", "--dry-run", type=bool, is_flag=True, default=False, help="Perform a dry run"
+)
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
-def upsert(file: Path) -> None:
+def upsert(file: Path, dry_run: bool) -> None:
     """Upsert attributes to a Dataset in DataHub."""
     # Call the sync command with to_datahub=True to perform the upsert operation
     ctx = click.get_current_context()
-    ctx.invoke(sync, file=str(file), to_datahub=True)
+    ctx.invoke(sync, file=str(file), dry_run=dry_run, to_datahub=True)
@@ -167,11 +170,16 @@ def file(lintcheck: bool, lintfix: bool, file: str) -> None:
 )
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
 @click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+@click.option(
+    "-n", "--dry-run", type=bool, is_flag=True, default=False, help="Perform a dry run"
+)
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
-def sync(file: str, to_datahub: bool) -> None:
+def sync(file: str, to_datahub: bool, dry_run: bool) -> None:
     """Sync a Dataset file to/from DataHub"""
 
+    dry_run_prefix = "[dry-run]: " if dry_run else ""  # prefix to use in messages
+
     failures: List[str] = []
     with get_default_graph() as graph:
         datasets = Dataset.from_yaml(file)
@@ -189,7 +197,7 @@ def sync(file: str, to_datahub: bool) -> None:
                 click.secho(
                     "\n\t- ".join(
                         [
-                            f"Skipping Dataset {dataset.urn} due to missing entity references: "
+                            f"{dry_run_prefix}Skipping Dataset {dataset.urn} due to missing entity references: "
                         ]
                         + missing_entity_references
                     ),
@@ -199,13 +207,18 @@ def sync(file: str, to_datahub: bool) -> None:
                 continue
             try:
                 for mcp in dataset.generate_mcp():
-                    graph.emit(mcp)
-                click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+                    if not dry_run:
+                        graph.emit(mcp)
+                click.secho(
+                    f"{dry_run_prefix}Update succeeded for urn {dataset.urn}.",
+                    fg="green",
+                )
             except Exception as e:
                 click.secho(
-                    f"Update failed for id {id}. due to {e}",
+                    f"{dry_run_prefix}Update failed for id {id}. due to {e}",
                     fg="red",
                 )
+                failures.append(dataset.urn)
         else:
             # Sync from DataHub
             if graph.exists(dataset.urn):
@@ -215,13 +228,16 @@ def sync(file: str, to_datahub: bool) -> None:
                 existing_dataset: Dataset = Dataset.from_datahub(
                     graph=graph, urn=dataset.urn, config=dataset_get_config
                 )
-                existing_dataset.to_yaml(Path(file))
+                if not dry_run:
+                    existing_dataset.to_yaml(Path(file))
+                else:
+                    click.secho(f"{dry_run_prefix}Will update file {file}")
            else:
-                click.secho(f"Dataset {dataset.urn} does not exist")
+                click.secho(f"{dry_run_prefix}Dataset {dataset.urn} does not exist")
                 failures.append(dataset.urn)
     if failures:
         click.secho(
-            f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+            f"\n{dry_run_prefix}Failed to sync the following Datasets: {', '.join(failures)}",
             fg="red",
         )
         raise click.Abort()
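
Note: both dataset commands now accept -n/--dry-run, which skips graph.emit and the YAML write while still printing "[dry-run]: "-prefixed messages. A minimal sketch of exercising the flag through click's test runner (the local file name is hypothetical):

    # Sketch: invoking the new --dry-run flag via click's CliRunner.
    from click.testing import CliRunner

    from datahub.cli.specific.dataset_cli import dataset

    runner = CliRunner()
    # "dataset.yaml" is a hypothetical local file; nothing is emitted with -n.
    result = runner.invoke(dataset, ["sync", "-f", "dataset.yaml", "--to-datahub", "-n"])
    print(result.output)  # messages carry the "[dry-run]: " prefix
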
datahub/emitter/mce_builder.py
CHANGED

@@ -125,9 +125,7 @@ def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
 
 
 def make_data_platform_urn(platform: str) -> str:
-    if platform.startswith("urn:li:dataPlatform:"):
-        return platform
-    return DataPlatformUrn.create_from_id(platform).urn()
+    return DataPlatformUrn(platform).urn()
 
 
 def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
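
Note: make_data_platform_urn now defers entirely to DataPlatformUrn, presumably because the urn class itself normalizes already-formed platform urns, making the explicit startswith branch redundant. A quick check of the common case:

    # Sketch: the simplified helper; DataPlatformUrn renders the full urn.
    from datahub.emitter.mce_builder import make_data_platform_urn

    assert make_data_platform_urn("snowflake") == "urn:li:dataPlatform:snowflake"
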
datahub/emitter/mcp_builder.py
CHANGED

@@ -117,6 +117,14 @@ class ContainerKey(DatahubKey):
 PlatformKey = ContainerKey
 
 
+class NamespaceKey(ContainerKey):
+    """
+    For Iceberg namespaces (databases/schemas)
+    """
+
+    namespace: str
+
+
 class DatabaseKey(ContainerKey):
     database: str
 
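
Note: NamespaceKey inherits the guid-based container urn machinery from ContainerKey. A sketch (field values are hypothetical; ContainerKey's platform/instance fields and as_urn() helper are assumed from the existing class):

    # Sketch: a container key for an Iceberg namespace; values are hypothetical.
    from datahub.emitter.mcp_builder import NamespaceKey

    key = NamespaceKey(
        platform="urn:li:dataPlatform:iceberg",
        instance="my_catalog",
        namespace="analytics",
    )
    print(key.as_urn())  # stable urn:li:container:<guid> derived from the fields
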
datahub/emitter/request_helper.py
CHANGED

@@ -1,8 +1,8 @@
-import itertools
 import shlex
-from typing import List, Union
+from typing import List, Optional, Union
 
 import requests
+from requests.auth import HTTPBasicAuth
 
 
 def _format_header(name: str, value: Union[str, bytes]) -> str:
@@ -12,17 +12,22 @@ def _format_header(name: str, value: Union[str, bytes]) -> str:
 
 
 def make_curl_command(
-    session: requests.Session, method: str, url: str, payload: str
+    session: requests.Session, method: str, url: str, payload: Optional[str] = None
 ) -> str:
-    fragments: List[str] = [
-
-
-
-
-
-
-    ]
-
-
-
+    fragments: List[str] = ["curl", "-X", method]
+
+    for header_name, header_value in session.headers.items():
+        fragments.extend(["-H", _format_header(header_name, header_value)])
+
+    if session.auth:
+        if isinstance(session.auth, HTTPBasicAuth):
+            fragments.extend(["-u", f"{session.auth.username}:<redacted>"])
+        else:
+            # For other auth types, they should be handled via headers
+            fragments.extend(["-H", "<unknown auth type>"])
+
+    if payload:
+        fragments.extend(["--data", payload])
+
+    fragments.append(url)
     return shlex.join(fragments)
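
Note: the rewrite makes the payload optional and redacts basic-auth credentials instead of dropping them. A sketch of the resulting command string (header name and credentials are hypothetical):

    # Sketch: curl command generation with a custom header and basic auth.
    import requests
    from requests.auth import HTTPBasicAuth

    from datahub.emitter.request_helper import make_curl_command

    session = requests.Session()
    session.headers["X-RestLi-Protocol-Version"] = "2.0.0"
    session.auth = HTTPBasicAuth("user", "secret")

    # No payload: "--data" is omitted; the password renders as "user:<redacted>".
    print(make_curl_command(session, "GET", "http://localhost:8080/config"))
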
datahub/emitter/response_helper.py
CHANGED

@@ -1,17 +1,21 @@
 import json
 import logging
+import warnings
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Sequence, Union
 
 from requests import Response
 
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.errors import APITracingWarning
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeProposal,
 )
 
 logger = logging.getLogger(__name__)
 
+_TRACE_HEADER_NAME = "traceparent"
+
 
 @dataclass
 class TraceData:
@@ -25,14 +29,11 @@ class TraceData:
         raise TypeError("data must be a dictionary")
 
 
-def _extract_trace_id(
-    response: Response, trace_header: str = "traceparent"
-) -> Optional[str]:
+def _extract_trace_id(response: Response) -> Optional[str]:
     """
     Extract trace ID from response headers.
     Args:
         response: HTTP response object
-        trace_header: Name of the trace header to use
     Returns:
         Trace ID if found and response is valid, None otherwise
     """
@@ -40,9 +41,17 @@
         logger.debug(f"Invalid status code: {response.status_code}")
         return None
 
-    trace_id = response.headers.get(
+    trace_id = response.headers.get(_TRACE_HEADER_NAME)
     if not trace_id:
-
+        # This will only be printed if
+        # 1. we're in async mode (checked by the caller)
+        # 2. the server did not return a trace ID
+        logger.debug(f"Missing trace header: {_TRACE_HEADER_NAME}")
+        warnings.warn(
+            "No trace ID found in response headers. API tracing is not active - likely due to an outdated server version.",
+            APITracingWarning,
+            stacklevel=3,
+        )
         return None
 
     return trace_id
@@ -51,20 +60,19 @@
 def extract_trace_data(
     response: Response,
     aspects_to_trace: Optional[List[str]] = None,
-    trace_header: str = "traceparent",
 ) -> Optional[TraceData]:
-    """
-
+    """Extract trace data from a response object.
+
+    If we run into a JSONDecodeError, we'll log an error and return None.
+
     Args:
         response: HTTP response object
         aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
-
+
     Returns:
         TraceData object if successful, None otherwise
-    Raises:
-        JSONDecodeError: If response body cannot be decoded as JSON
     """
-    trace_id = _extract_trace_id(response
+    trace_id = _extract_trace_id(response)
     if not trace_id:
         return None
 
@@ -104,19 +112,18 @@ def extract_trace_data_from_mcps(
     response: Response,
     mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
     aspects_to_trace: Optional[List[str]] = None,
-    trace_header: str = "traceparent",
 ) -> Optional[TraceData]:
-    """
-
+    """Extract trace data from a response object and populate data from provided MCPs.
+
     Args:
         response: HTTP response object used only for trace_id extraction
        mcps: List of MCP URN and aspect data
         aspects_to_trace: Optional list of aspect names to extract. If None, extracts all aspects.
-
+
     Returns:
         TraceData object if successful, None otherwise
     """
-    trace_id = _extract_trace_id(response
+    trace_id = _extract_trace_id(response)
     if not trace_id:
         return None
 
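
Note: a missing traceparent header now surfaces as a standard Python warning instead of silently returning None. Callers that knowingly target older servers can filter it; a minimal sketch:

    # Sketch: silencing APITracingWarning when the server is known to
    # predate API tracing support.
    import warnings

    from datahub.errors import APITracingWarning

    warnings.filterwarnings("ignore", category=APITracingWarning)
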
datahub/emitter/rest_emitter.py
CHANGED

@@ -5,6 +5,7 @@ import json
 import logging
 import os
 import time
+import warnings
 from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timedelta
@@ -24,9 +25,9 @@ from typing import (
 
 import pydantic
 import requests
-from deprecated import deprecated
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import HTTPError, RequestException
+from typing_extensions import deprecated
 
 from datahub._version import nice_version_name
 from datahub.cli import config_utils
@@ -40,7 +41,7 @@ from datahub.configuration.common import (
     TraceTimeoutError,
     TraceValidationError,
 )
-from datahub.emitter.aspect import JSON_CONTENT_TYPE
+from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
@@ -50,6 +51,7 @@ from datahub.emitter.response_helper import (
     extract_trace_data_from_mcps,
 )
 from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.errors import APITracingWarning
 from datahub.ingestion.api.closeable import Closeable
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
@@ -107,9 +109,9 @@ class RestSinkEndpoint(ConfigEnum):
     OPENAPI = auto()
 
 
-
+DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
     RestSinkEndpoint,
-    os.getenv("
+    os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
 )
 
 
@@ -227,7 +229,9 @@ class DataHubRestEmitter(Closeable, Emitter):
         ca_certificate_path: Optional[str] = None,
         client_certificate_path: Optional[str] = None,
         disable_ssl_verification: bool = False,
-        openapi_ingestion: bool =
+        openapi_ingestion: bool = (
+            DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
+        ),
         default_trace_mode: bool = False,
     ):
         if not gms_server:
@@ -357,8 +361,14 @@ class DataHubRestEmitter(Closeable, Emitter):
             )["aspect"]["json"]
         else:
             obj = mcp.aspect.to_obj()
-
+            content_type = obj.get("contentType")
+            if obj.get("value") and content_type == JSON_CONTENT_TYPE:
+                # Undo double serialization.
                 obj = json.loads(obj["value"])
+            elif content_type == JSON_PATCH_CONTENT_TYPE:
+                raise NotImplementedError(
+                    "Patches are not supported for OpenAPI ingestion. Set the endpoint to RESTLI."
+                )
         aspect_value = pre_json_transform(obj)
         return (
             url,
@@ -597,7 +607,7 @@ class DataHubRestEmitter(Closeable, Emitter):
 
         return len(mcp_obj_chunks)
 
-    @deprecated
+    @deprecated("Use emit with a datasetUsageStatistics aspect instead")
     def emit_usage(self, usageStats: UsageAggregation) -> None:
         url = f"{self._gms_server}/usageStats?action=batchIngest"
 
@@ -749,6 +759,12 @@ class DataHubRestEmitter(Closeable, Emitter):
             trace_flag if trace_flag is not None else self._default_trace_mode
         )
         resolved_async_flag = async_flag if async_flag is not None else async_default
+        if resolved_trace_flag and not resolved_async_flag:
+            warnings.warn(
+                "API tracing is only available with async ingestion. For sync mode, API errors will be surfaced as exceptions.",
+                APITracingWarning,
+                stacklevel=3,
+            )
         return resolved_trace_flag and resolved_async_flag
 
     def __repr__(self) -> str:
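
Note: the emitter's default endpoint is now computed at import time from DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT. A sketch of opting into OpenAPI ingestion (the value is assumed to match the RestSinkEndpoint.OPENAPI name; the server URL is hypothetical):

    # Sketch: select the OpenAPI endpoint before datahub modules are imported,
    # since DEFAULT_REST_EMITTER_ENDPOINT is evaluated at import time.
    import os

    os.environ["DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT"] = "OPENAPI"

    from datahub.emitter.rest_emitter import DatahubRestEmitter

    emitter = DatahubRestEmitter("http://localhost:8080")
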
datahub/errors.py
CHANGED
datahub/ingestion/api/source.py
CHANGED

@@ -27,6 +27,7 @@ from typing_extensions import LiteralString, Self
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
@@ -44,11 +45,13 @@ from datahub.ingestion.api.source_helpers import (
     auto_lowercase_urns,
     auto_materialize_referenced_tags_terms,
     auto_status_aspect,
+    auto_workunit,
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import UpstreamLineageClass
+from datahub.sdk.entity import Entity
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation
 
@@ -473,10 +476,12 @@ class Source(Closeable, metaclass=ABCMeta):
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         return self._apply_workunit_processors(
-            self.get_workunit_processors(), self.get_workunits_internal()
+            self.get_workunit_processors(), auto_workunit(self.get_workunits_internal())
         )
 
-    def get_workunits_internal(
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
         raise NotImplementedError(
             "get_workunits_internal must be implemented if get_workunits is not overriden."
         )
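
Note: because get_workunits now wraps get_workunits_internal in auto_workunit, a source may yield MCP wrappers or SDK Entity objects directly. A hypothetical minimal implementation:

    # Sketch: a source yielding a raw MCP wrapper; auto_workunit converts it
    # into a MetadataWorkUnit downstream. Urn and aspect are illustrative.
    from typing import Iterable

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass

    class MySource:  # hypothetical; a real source subclasses datahub's Source
        def get_workunits_internal(self) -> Iterable[MetadataChangeProposalWrapper]:
            yield MetadataChangeProposalWrapper(
                entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
                aspect=StatusClass(removed=False),
            )
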
datahub/ingestion/api/source_helpers.py
CHANGED

@@ -35,6 +35,7 @@ from datahub.metadata.schema_classes import (
     TimeWindowSizeClass,
 )
 from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
+from datahub.sdk.entity import Entity
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.telemetry import telemetry
 from datahub.utilities.urns.error import InvalidUrnError
@@ -48,7 +49,14 @@ logger = logging.getLogger(__name__)
 
 
 def auto_workunit(
-    stream: Iterable[
+    stream: Iterable[
+        Union[
+            MetadataChangeEventClass,
+            MetadataChangeProposalWrapper,
+            MetadataWorkUnit,
+            Entity,
+        ]
+    ],
 ) -> Iterable[MetadataWorkUnit]:
     """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
 
@@ -58,8 +66,12 @@ def auto_workunit(
             id=MetadataWorkUnit.generate_workunit_id(item),
             mce=item,
         )
-
+        elif isinstance(item, MetadataChangeProposalWrapper):
             yield item.as_workunit()
+        elif isinstance(item, Entity):
+            yield from item.as_workunits()
+        else:
+            yield item
 
 
 def create_dataset_props_patch_builder(
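
Note: auto_workunit now fans Entity objects out through their as_workunits() method and passes pre-built MetadataWorkUnits through unchanged. It can also be called directly; a sketch:

    # Sketch: converting a mixed stream by hand with the extended auto_workunit.
    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.source_helpers import auto_workunit
    from datahub.metadata.schema_classes import StatusClass

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
        aspect=StatusClass(removed=False),
    )
    for wu in auto_workunit([mcp]):
        print(wu.id)  # MCP wrappers become single workunits
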
datahub/ingestion/extractor/schema_util.py
CHANGED

@@ -362,6 +362,7 @@ class AvroToMceSchemaConverter:
         merged_props: Dict[str, Any] = {}
         merged_props.update(self._schema.other_props)
         merged_props.update(schema.other_props)
+        merged_props.update(actual_schema.other_props)
 
         # Parse meta_mapping
         meta_aspects: Dict[str, Any] = {}
datahub/ingestion/graph/client.py
CHANGED

@@ -23,9 +23,9 @@ from typing import (
 )
 
 from avro.schema import RecordSchema
-from deprecated import deprecated
 from pydantic import BaseModel
 from requests.models import HTTPError
+from typing_extensions import deprecated
 
 from datahub.cli import config_utils
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
@@ -33,7 +33,7 @@ from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
 from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.rest_emitter import (
-
+    DEFAULT_REST_EMITTER_ENDPOINT,
     DEFAULT_REST_TRACE_MODE,
     DatahubRestEmitter,
     RestSinkEndpoint,
@@ -49,6 +49,7 @@ from datahub.ingestion.graph.connections import (
 )
 from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
 from datahub.ingestion.graph.filters import (
+    RawSearchFilter,
     RawSearchFilterRule,
     RemovedStatusFilter,
     generate_filter,
@@ -75,10 +76,11 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
+from datahub.metadata.urns import CorpUserUrn, Urn
 from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
-from datahub.utilities.urns.urn import
+from datahub.utilities.urns.urn import guess_entity_type
 
 if TYPE_CHECKING:
     from datahub.ingestion.sink.datahub_rest import (
@@ -116,7 +118,7 @@ def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""
 
     # Hard-coded special cases.
-    if entity_type ==
+    if entity_type == CorpUserUrn.ENTITY_TYPE:
         return "CORP_USER"
 
     # Convert camelCase to UPPER_UNDERSCORE.
@@ -133,6 +135,14 @@ def entity_type_to_graphql(entity_type: str) -> str:
     return entity_type
 
 
+def flexible_entity_type_to_graphql(entity_type: str) -> str:
+    if entity_type.upper() == entity_type:
+        # Assume that we were passed a graphql EntityType enum value,
+        # so no conversion is needed.
+        return entity_type
+    return entity_type_to_graphql(entity_type)
+
+
 class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def __init__(self, config: DatahubClientConfig) -> None:
         self.config = config
@@ -147,7 +157,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             ca_certificate_path=self.config.ca_certificate_path,
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
-            openapi_ingestion=
+            openapi_ingestion=DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI,
             default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
         )
 
@@ -330,7 +340,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             f"Failed to find {aspect_type_name} in response {response_json}"
         )
 
-    @deprecated(
+    @deprecated("Use get_aspect instead which makes aspect string name optional")
     def get_aspect_v2(
         self,
         entity_urn: str,
@@ -355,7 +365,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_schema_metadata(self, entity_urn: str) -> Optional[SchemaMetadataClass]:
         return self.get_aspect(entity_urn=entity_urn, aspect_type=SchemaMetadataClass)
 
-    @deprecated(
+    @deprecated("Use get_aspect directly.")
     def get_domain_properties(self, entity_urn: str) -> Optional[DomainPropertiesClass]:
         return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainPropertiesClass)
 
@@ -376,7 +386,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_domain(self, entity_urn: str) -> Optional[DomainsClass]:
         return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass)
 
-    @deprecated(
+    @deprecated("Use get_aspect directly.")
     def get_browse_path(self, entity_urn: str) -> Optional[BrowsePathsClass]:
         return self.get_aspect(entity_urn=entity_urn, aspect_type=BrowsePathsClass)
 
@@ -505,7 +515,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         return response.json()
 
     @deprecated(
-
+        "Use get_aspect for a single aspect or get_entity_semityped for a full entity."
     )
     def get_aspects_for_entity(
         self,
@@ -635,9 +645,6 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def _aspect_count_endpoint(self):
         return f"{self.config.server}/aspects?action=getCount"
 
-    # def _session(self) -> Session:
-    #     return super()._session
-
     def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
         """Retrieve a domain urn based on its name. Returns None if there is no match found"""
 
@@ -749,9 +756,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
         assert res["upsertConnection"]["urn"] == urn
 
-    @deprecated(
-        reason='Use get_urns_by_filter(entity_types=["container"], ...) instead'
-    )
+    @deprecated('Use get_urns_by_filter(entity_types=["container"], ...) instead')
     def get_container_urns_by_filter(
         self,
         env: Optional[str] = None,
@@ -810,7 +815,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
         :return: An iterable of (urn, schema info) tuple that match the filters.
         """
-        types = [
+        types = self._get_types(["dataset"])
 
         # Add the query default of * if no query is specified.
         query = query or "*"
@@ -878,10 +883,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: Optional[str] = None,
         query: Optional[str] = None,
         container: Optional[str] = None,
-        status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
+        status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
-        extra_or_filters: Optional[
+        extra_or_filters: Optional[RawSearchFilter] = None,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.
 
@@ -973,7 +978,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
         extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
-        extra_or_filters: Optional[
+        extra_or_filters: Optional[RawSearchFilter] = None,
         extra_source_fields: Optional[List[str]] = None,
         skip_cache: bool = False,
     ) -> Iterable[dict]:
@@ -1126,7 +1131,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         )
 
         types = [
-
+            flexible_entity_type_to_graphql(entity_type)
+            for entity_type in entity_types
         ]
         return types
 
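
Note: flexible_entity_type_to_graphql lets callers pass either urn entity types or already-converted GraphQL enum values. A sketch of the conversion rules from the code above:

    # Sketch: conversion behavior of the new flexible helper.
    from datahub.ingestion.graph.client import (
        entity_type_to_graphql,
        flexible_entity_type_to_graphql,
    )

    assert entity_type_to_graphql("corpuser") == "CORP_USER"  # special case
    assert entity_type_to_graphql("dataFlow") == "DATA_FLOW"  # camelCase -> UPPER_UNDERSCORE
    assert flexible_entity_type_to_graphql("DATASET") == "DATASET"  # passthrough
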