acryl-datahub 1.0.0.3rc8__py3-none-any.whl → 1.0.0.3rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic; see the registry's advisory for details.
- {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/METADATA +2466 -2466
- {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/RECORD +60 -60
- {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -35
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/delete_cli.py +1 -1
- datahub/cli/docker_cli.py +2 -2
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +44 -52
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +3 -1
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_connection.py +19 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
- datahub/metadata/_urns/urn_defs.py +1786 -1786
- datahub/metadata/schema.avsc +17364 -16988
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/specific/dataset.py +12 -0
- datahub/testing/check_imports.py +1 -1
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py
CHANGED
@@ -48,7 +48,7 @@ def delete() -> None:
 
     See `datahub delete by-filter` for the list of available filters.
 
-    See https://
+    See https://docs.datahub.com/docs/how/delete-metadata for more detailed docs.
     """
     pass
 
datahub/cli/docker_cli.py
CHANGED
@@ -811,7 +811,7 @@ def quickstart(
         raise status.to_exception(
             header="Unable to run quickstart - the following issues were detected:",
             footer="If you think something went wrong, please file an issue at https://github.com/datahub-project/datahub/issues\n"
-            "or send a message in our Slack https://slack
+            "or send a message in our Slack https://datahub.com/slack/\n"
             f"Be sure to attach the logs from {log_file.name}",
         )

@@ -824,7 +824,7 @@ def quickstart(
         fg="green",
     )
     click.secho(
-        "Need support? Get in touch on Slack: https://
+        "Need support? Get in touch on Slack: https://datahub.com/slack/",
         fg="magenta",
     )
 
datahub/configuration/common.py
CHANGED
@@ -16,7 +16,7 @@ class PlatformInstanceConfigMixin(ConfigModel):
         default=None,
         description="The instance of the platform that all assets produced by this recipe belong to. "
         "This should be unique within the platform. "
-        "See https://
+        "See https://docs.datahub.com/docs/platform-instances/ for more details.",
     )
 
 
datahub/emitter/request_helper.py
CHANGED
@@ -1,14 +1,31 @@
+import json
 import shlex
-from
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
 
 import requests
 from requests.auth import HTTPBasicAuth
 
+from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+    MetadataChangeProposal,
+)
+from datahub.metadata.schema_classes import ChangeTypeClass
+
+
+def _decode_bytes(value: Union[str, bytes]) -> str:
+    """Decode bytes to string, if necessary."""
+    if isinstance(value, bytes):
+        return value.decode()
+    return value
+
 
 def _format_header(name: str, value: Union[str, bytes]) -> str:
     if name == "Authorization":
         return f"{name!s}: <redacted>"
-    return f"{name!s}: {value
+    return f"{name!s}: {_decode_bytes(value)}"
 
 
 def make_curl_command(

@@ -21,7 +38,9 @@ def make_curl_command(
 
     if session.auth:
         if isinstance(session.auth, HTTPBasicAuth):
-            fragments.extend(
+            fragments.extend(
+                ["-u", f"{_decode_bytes(session.auth.username)}:<redacted>"]
+            )
         else:
             # For other auth types, they should be handled via headers
             fragments.extend(["-H", "<unknown auth type>"])

@@ -31,3 +50,97 @@ def make_curl_command(
 
     fragments.append(url)
     return shlex.join(fragments)
+
+
+@dataclass
+class OpenApiRequest:
+    """Represents an OpenAPI request for entity operations."""
+
+    method: str
+    url: str
+    payload: List[Dict[str, Any]]
+
+    @classmethod
+    def from_mcp(
+        cls,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        gms_server: str,
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional["OpenApiRequest"]:
+        """Factory method to create an OpenApiRequest from a MetadataChangeProposal."""
+        if not mcp.aspectName or (
+            mcp.changeType != ChangeTypeClass.DELETE and not mcp.aspect
+        ):
+            return None
+
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+
+        method = "post"
+        url = f"{gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+        payload = []
+
+        if mcp.changeType == ChangeTypeClass.DELETE:
+            method = "delete"
+            url = f"{gms_server}/openapi/v3/entity/{mcp.entityType}/{mcp.entityUrn}"
+        else:
+            if mcp.aspect:
+                if mcp.changeType == ChangeTypeClass.PATCH:
+                    method = "patch"
+                    obj = mcp.aspect.to_obj()
+                    content_type = obj.get("contentType")
+                    if obj.get("value") and content_type == JSON_PATCH_CONTENT_TYPE:
+                        # Undo double serialization.
+                        obj = json.loads(obj["value"])
+                        patch_value = obj
+                    else:
+                        raise NotImplementedError(
+                            f"ChangeType {mcp.changeType} only supports context type {JSON_PATCH_CONTENT_TYPE}, found {content_type}."
+                        )
+
+                    if isinstance(patch_value, list):
+                        patch_value = {"patch": patch_value}
+
+                    payload = [
+                        {
+                            "urn": mcp.entityUrn,
+                            mcp.aspectName: {
+                                "value": patch_value,
+                                "systemMetadata": mcp.systemMetadata.to_obj()
+                                if mcp.systemMetadata
+                                else None,
+                            },
+                        }
+                    ]
+                else:
+                    if isinstance(mcp, MetadataChangeProposalWrapper):
+                        aspect_value = pre_json_transform(
+                            mcp.to_obj(simplified_structure=True)
+                        )["aspect"]["json"]
+                    else:
+                        obj = mcp.aspect.to_obj()
+                        content_type = obj.get("contentType")
+                        if obj.get("value") and content_type == JSON_CONTENT_TYPE:
+                            # Undo double serialization.
+                            obj = json.loads(obj["value"])
+                        elif content_type == JSON_PATCH_CONTENT_TYPE:
+                            raise NotImplementedError(
+                                f"ChangeType {mcp.changeType} does not support patch."
+                            )
+                        aspect_value = pre_json_transform(obj)
+
+                    payload = [
+                        {
+                            "urn": mcp.entityUrn,
+                            mcp.aspectName: {
+                                "value": aspect_value,
+                                "systemMetadata": mcp.systemMetadata.to_obj()
+                                if mcp.systemMetadata
+                                else None,
+                            },
+                        }
+                    ]
+            else:
+                raise ValueError(f"ChangeType {mcp.changeType} requires a value.")
+
+        return cls(method=method, url=url, payload=payload)
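For orientation, a minimal usage sketch of the new helper (not part of the diff; the server address, URN, and aspect below are placeholders chosen for illustration):

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.request_helper import OpenApiRequest
from datahub.metadata.schema_classes import StatusClass

# Wrap an example aspect in an MCP and convert it into an OpenAPI v3 request.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
    aspect=StatusClass(removed=False),
)
request = OpenApiRequest.from_mcp(
    mcp, gms_server="http://localhost:8080", async_default=True
)
if request:
    # For an upsert this is a POST to
    # http://localhost:8080/openapi/v3/entity/dataset?async=true
    # with a single-item payload keyed by the aspect name ("status").
    print(request.method, request.url, request.payload)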
datahub/emitter/rest_emitter.py
CHANGED
@@ -41,10 +41,9 @@ from datahub.configuration.common import (
     TraceTimeoutError,
     TraceValidationError,
 )
-from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
 from datahub.emitter.response_helper import (
     TraceData,
     extract_trace_data,

@@ -348,43 +347,24 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
         async_default: bool = False,
-    ) -> Optional[
-            async_flag if async_flag is not None else async_default
-        )
-        url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+    ) -> Optional[OpenApiRequest]:
+        """
+        Convert a MetadataChangeProposal to an OpenAPI request format.
 
-            aspect_value = pre_json_transform(obj)
-            return (
-                url,
-                [
-                    {
-                        "urn": mcp.entityUrn,
-                        mcp.aspectName: {
-                            "value": aspect_value,
-                            "systemMetadata": mcp.systemMetadata.to_obj()
-                            if mcp.systemMetadata
-                            else None,
-                        },
-                    }
-                ],
-            )
-        return None
+        Args:
+            mcp: The metadata change proposal
+            async_flag: Optional flag to override async behavior
+            async_default: Default async behavior if not specified
+
+        Returns:
+            An OpenApiRequest object or None if the MCP doesn't have required fields
+        """
+        return OpenApiRequest.from_mcp(
+            mcp=mcp,
+            gms_server=self._gms_server,
+            async_flag=async_flag,
+            async_default=async_default,
+        )
 
     def emit(
         self,

@@ -448,7 +428,9 @@ class DataHubRestEmitter(Closeable, Emitter):
         if self._openapi_ingestion:
             request = self._to_openapi_request(mcp, async_flag, async_default=False)
             if request:
-                response = self._emit_generic(
+                response = self._emit_generic(
+                    request.url, payload=request.payload, method=request.method
+                )
 
                 if self._should_trace(async_flag, trace_flag):
                     trace_data = extract_trace_data(response) if response else None

@@ -503,31 +485,36 @@ class DataHubRestEmitter(Closeable, Emitter):
         trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
-        1. Grouping MCPs by their entity URL
+        1. Grouping MCPs by their HTTP method and entity URL
         2. Breaking down large batches into smaller chunks based on both:
            * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
            * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
 
         The Chunk class encapsulates both the items and their byte size tracking
-        Serializing the items only once with json.dumps(request
+        Serializing the items only once with json.dumps(request.payload) and reusing that
         The chunking logic handles edge cases (always accepting at least one item per chunk)
         The joining logic is efficient with a simple string concatenation
 
         :param mcps: metadata change proposals to transmit
         :param async_flag: the mode
+        :param trace_flag: whether to trace the requests
+        :param trace_timeout: timeout for tracing
         :return: number of requests
         """
-        #
-        batches: Dict[str, List[_Chunk]] = defaultdict(
+        # Group by entity URL and HTTP method
+        batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
            lambda: [_Chunk(items=[])]
        )  # Initialize with one empty Chunk
 
         for mcp in mcps:
             request = self._to_openapi_request(mcp, async_flag, async_default=True)
             if request:
+                # Create a composite key with both method and URL
+                key = (request.method, request.url)
+                current_chunk = batches[key][-1]  # Get the last chunk
+
+                # Only serialize once - we're serializing a single payload item
+                serialized_item = json.dumps(request.payload[0])
                 item_bytes = len(serialized_item.encode())
 
                 # If adding this item would exceed max_bytes, create a new chunk

@@ -537,15 +524,17 @@ class DataHubRestEmitter(Closeable, Emitter):
                     or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
                 ):
                     new_chunk = _Chunk(items=[])
-                    batches[
+                    batches[key].append(new_chunk)
                     current_chunk = new_chunk
 
                 current_chunk.add_item(serialized_item)
 
         responses = []
-        for url, chunks in batches.items():
+        for (method, url), chunks in batches.items():
             for chunk in chunks:
-                response = self._emit_generic(
+                response = self._emit_generic(
+                    url, payload=_Chunk.join(chunk), method=method
+                )
                 responses.append(response)
 
         if self._should_trace(async_flag, trace_flag, async_default=True):

@@ -618,11 +607,13 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(
+    def _emit_generic(
+        self, url: str, payload: Union[str, Any], method: str = "POST"
+    ) -> requests.Response:
         if not isinstance(payload, str):
             payload = json.dumps(payload)
 
-        curl_command = make_curl_command(self._session,
+        curl_command = make_curl_command(self._session, method, url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
             # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail

@@ -635,7 +626,8 @@ class DataHubRestEmitter(Closeable, Emitter):
             curl_command,
         )
         try:
+            method_func = getattr(self._session, method.lower())
+            response = method_func(url, data=payload) if payload else method_func(url)
             response.raise_for_status()
             return response
         except HTTPError as e:
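The batch-emit docstring above describes grouping serialized payload items by (HTTP method, URL) and splitting each group into chunks bounded by both byte size and item count. A simplified, standalone sketch of that idea (not the emitter's actual _Chunk class; the limits are stand-in constants):

import json
from collections import defaultdict
from typing import Dict, List, Tuple

MAX_BYTES = 15 * 1024 * 1024  # stand-in for INGEST_MAX_PAYLOAD_BYTES
MAX_ITEMS = 200  # stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH

def chunk_requests(open_api_requests) -> Dict[Tuple[str, str], List[List[str]]]:
    """Group serialized payload items by (method, url), splitting each group
    into chunks that respect both a byte budget and an item-count budget."""
    batches: Dict[Tuple[str, str], List[List[str]]] = defaultdict(lambda: [[]])
    chunk_bytes: Dict[Tuple[str, str], int] = defaultdict(int)
    for request in open_api_requests:  # each request has .method, .url, .payload
        key = (request.method, request.url)
        item = json.dumps(request.payload[0])  # serialize once, reuse below
        size = len(item.encode())
        chunk = batches[key][-1]
        # Always accept at least one item per chunk, even an oversized one.
        if chunk and (chunk_bytes[key] + size > MAX_BYTES or len(chunk) >= MAX_ITEMS):
            chunk = []
            batches[key].append(chunk)
            chunk_bytes[key] = 0
        chunk.append(item)
        chunk_bytes[key] += size
    return batches

# Each chunk is then joined into one JSON array ("[" + ",".join(items) + "]")
# and sent with the HTTP method stored in its key, mirroring the
# `for (method, url), chunks in batches.items()` loop in the diff.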
datahub/ingestion/api/source.py
CHANGED
@@ -420,12 +420,9 @@ class Source(Closeable, metaclass=ABCMeta):
         Run in order, first in list is applied first. Be careful with order when overriding.
         """
         browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
-        if (
-            self.ctx.pipeline_config
-            and self.ctx.pipeline_config.flags.generate_browse_path_v2
-        ):
+        if self.ctx.flags.generate_browse_path_v2:
             browse_path_processor = self._get_browse_path_processor(
-                self.ctx.
+                self.ctx.flags.generate_browse_path_v2_dry_run
             )
 
         auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
datahub/ingestion/api/source_helpers.py
CHANGED
@@ -92,6 +92,7 @@ def create_dataset_props_patch_builder(
     patch_builder.set_last_modified(dataset_properties.lastModified)
     patch_builder.set_qualified_name(dataset_properties.qualifiedName)
     patch_builder.add_custom_properties(dataset_properties.customProperties)
+    patch_builder.set_external_url(dataset_properties.externalUrl)
 
     return patch_builder
 
datahub/ingestion/glossary/classification_mixin.py
CHANGED
@@ -319,8 +319,10 @@ def classification_workunit_processor(
         partial(
             data_reader.get_sample_data_for_table,
             table_id,
+            int(
+                classification_handler.config.classification.sample_size
+                * SAMPLE_SIZE_MULTIPLIER
+            ),
             **(data_reader_kwargs or {}),
         )
         if data_reader
datahub/ingestion/graph/client.py
CHANGED
@@ -158,7 +158,9 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             ca_certificate_path=self.config.ca_certificate_path,
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
-            openapi_ingestion=
+            openapi_ingestion=self.config.openapi_ingestion
+            if self.config.openapi_ingestion is not None
+            else (DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI),
             default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
         )
 
datahub/ingestion/graph/filters.py
CHANGED
@@ -18,7 +18,7 @@ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
 # This can be put directly into the orFilters parameter in GraphQL.
 RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]
 
-# Mirrors our GraphQL enum: https://
+# Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
 FilterOperator: TypeAlias = Literal[
     "CONTAIN",
     "EQUAL",
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
CHANGED
@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
             textwrap.dedent(
                 f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                 To view full table metadata, run Glue ingestion
-                (see https://
+                (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
             )
         )
 
datahub/ingestion/source/bigquery_v2/bigquery.py
CHANGED
@@ -270,29 +270,30 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         ):
             return
 
-            with self.report.new_stage(
+            with self.report.new_stage(
+                f"*: {QUERIES_EXTRACTION}"
+            ), BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor:
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
datahub/ingestion/source/cassandra/cassandra_profiling.py
CHANGED
@@ -70,30 +70,31 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-            )
+            with self.report.new_stage(
+                f"{keyspace_name}: {PROFILING}"
+            ), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )
 
     def generate_profile(
         self,
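In isolation, the fan-out pattern introduced above (submit one profiling task per table, then yield work units in completion order) looks roughly like this sketch; the task function and its return type are made up for illustration:

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Iterable, List

def profile_one(table: str) -> List[str]:
    # Placeholder for the per-table profiling work.
    return [f"profile:{table}"]

def profile_all(tables: List[str], max_workers: int = 4) -> Iterable[str]:
    """Fan out one task per table, then yield results as futures complete."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_table = {executor.submit(profile_one, t): t for t in tables}
        for future in as_completed(future_to_table):
            table = future_to_table[future]
            try:
                yield from future.result()
            except Exception as exc:
                # The real profiler records this as a structured report failure
                # rather than printing.
                print(f"Profiling failed for {table}: {exc}")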
datahub/ingestion/source/datahub/datahub_database_reader.py
CHANGED
@@ -195,17 +195,18 @@ class DataHubDatabaseReader:
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn
+        with self.engine.connect() as conn, contextlib.closing(
+            conn.connection.cursor()
+        ) as cursor:
+            logger.debug("Polling soft-deleted urns from database")
+            cursor.execute(self.soft_deleted_urns_query)
+            columns = [desc[0] for desc in cursor.description]
+            while True:
+                rows = cursor.fetchmany(self.config.database_query_batch_size)
+                if not rows:
+                    return
+                for row in rows:
+                    yield dict(zip(columns, row))
 
     def _parse_row(
         self, row: Dict[str, Any]
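The reader change above streams soft-deleted URNs through a raw DB-API cursor in fixed-size batches rather than materializing the whole result set. A generic sketch of that fetchmany loop (the connection URL, query, and batch size are placeholders):

import contextlib
from typing import Any, Dict, Iterable

from sqlalchemy import create_engine

def iter_rows(db_url: str, query: str, batch_size: int = 1000) -> Iterable[Dict[str, Any]]:
    """Yield query results one dict at a time, fetching batch_size rows per round trip."""
    engine = create_engine(db_url)
    with engine.connect() as conn, contextlib.closing(conn.connection.cursor()) as cursor:
        cursor.execute(query)
        columns = [desc[0] for desc in cursor.description]
        while True:
            rows = cursor.fetchmany(batch_size)
            if not rows:
                return
            for row in rows:
                yield dict(zip(columns, row))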
datahub/ingestion/source/dbt/dbt_cloud.py
CHANGED
@@ -10,14 +10,12 @@ from pydantic import Field, root_validator
 
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )

@@ -262,16 +260,14 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
-@support_status(SupportStatus.
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCloudConfig.parse_obj(config_dict)
-        return cls(config, ctx
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport: