acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2rc2__py3-none-any.whl
This diff compares the contents of publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/METADATA +2370 -2370
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/RECORD +23 -22
- datahub/_version.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/emitter/mcp_builder.py +4 -0
- datahub/ingestion/graph/client.py +104 -0
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +297 -0
- datahub/ingestion/source/iceberg/iceberg.py +9 -9
- datahub/ingestion/source/mlflow.py +3 -7
- datahub/ingestion/source/powerbi/powerbi.py +14 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/utilities/ingest_utils.py +2 -2
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/datahub/ingestion/source/hex/query_fetcher.py
@@ -0,0 +1,297 @@
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
+    HEX_PLATFORM_URN,
+)
+from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
+from datahub.sdk.main_client import DataHubClient
+from datahub.sdk.search_filters import FilterDsl as F
+from datahub.utilities.time import datetime_to_ts_millis
+
+logger = logging.getLogger(__name__)
+
+# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+
+
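The pattern above is what drives the lineage extraction in this new file: a single regular expression pulls both the Hex project id and the workspace name out of the metadata comment that Hex appends to generated SQL. A minimal, standalone sketch of how it behaves; the metadata comment is adapted from the docstring example in _extract_hex_metadata below, and the surrounding SELECT is made up for illustration:

import re

# Same pattern as HEX_METADATA_PATTERN above
PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

# Hypothetical statement; the comment mirrors the docstring example further down
sql = (
    "SELECT 1\n"
    '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
    '"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}'
)

match = re.search(PATTERN, sql)
if match:
    project_id, workspace_name = match.group(1), match.group(2)
    print(project_id)      # d73da67d-c87b-4dd8-9e7f-b79cb7f822cf
    print(workspace_name)  # acryl-partnership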
+@dataclass
+class QueryResponse:
+    """This is the public response model for the HexQueryFetcher."""
+
+    urn: QueryUrn
+    hex_project_id: str
+    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
+    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)
+
+
+@dataclass
+class HexQueryFetcherReport(SourceReport):
+    start_datetime: Optional[datetime] = None
+    end_datetime: Optional[datetime] = None
+    fetched_query_urns: int = 0
+    fetched_query_objects: int = 0
+    filtered_out_queries_missing_metadata: int = 0
+    filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_subjects: int = 0
+    total_queries: int = 0
+    total_dataset_subjects: int = 0
+    total_schema_field_subjects: int = 0
+    num_calls_fetch_query_entities: int = 0
+
+
+class HexQueryFetcher:
+    def __init__(
+        self,
+        datahub_client: DataHubClient,
+        workspace_name: str,
+        start_datetime: datetime,
+        end_datetime: datetime,
+        report: HexQueryFetcherReport,
+        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
+    ):
+        self.datahub_client = datahub_client
+        self.workspace_name = workspace_name
+        self.start_datetime = start_datetime
+        self.end_datetime = end_datetime
+        self.report = report
+        self.page_size = page_size
+
+        self.report.start_datetime = start_datetime
+        self.report.end_datetime = end_datetime
+
+    def fetch(self) -> Iterable[QueryResponse]:
+        try:
+            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
+            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
+            self.report.fetched_query_urns = len(query_urns)
+
+            entities_by_urn = self._fetch_query_entities(query_urns)
+            self.report.fetched_query_objects = len(entities_by_urn)
+        except Exception as e:
+            self.report.failure(
+                title="Error fetching Queries for lineage",
+                message="Error fetching Queries will result on missing lineage",
+                context=str(
+                    dict(
+                        workspace_name=self.workspace_name,
+                        start_datetime=self.start_datetime,
+                        end_datetime=self.end_datetime,
+                    )
+                ),
+                exc=e,
+            )
+        else:
+            if not query_urns or not entities_by_urn:
+                self.report.warning(
+                    title="No Queries found with Hex as origin",
+                    message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
+                    context=str(
+                        dict(
+                            workspace_name=self.workspace_name,
+                            start_datetime=self.start_datetime,
+                            end_datetime=self.end_datetime,
+                        )
+                    ),
+                )
+                return
+
+            for query_urn, (
+                query_properties,
+                query_subjects,
+            ) in entities_by_urn.items():
+                maybe_query_response = self._build_query_response(
+                    query_urn=query_urn,
+                    query_properties=query_properties,
+                    query_subjects=query_subjects,
+                )
+                if maybe_query_response:
+                    yield maybe_query_response
+
+    def _fetch_query_entities(
+        self, query_urns: List[QueryUrn]
+    ) -> Dict[
+        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
+    ]:
+        entities_by_urn: Dict[
+            QueryUrn,
+            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
+        ] = {}
+        for i in range(0, len(query_urns), self.page_size):
+            batch = query_urns[i : i + self.page_size]
+
+            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
+            entities = self.datahub_client._graph.get_entities(
+                entity_name=QueryUrn.ENTITY_TYPE,
+                urns=[urn.urn() for urn in batch],
+                aspects=[
+                    QueryPropertiesClass.ASPECT_NAME,
+                    QuerySubjectsClass.ASPECT_NAME,
+                ],
+                with_system_metadata=False,
+            )
+            self.report.num_calls_fetch_query_entities += 1
+            logger.debug(f"Get entities response: {entities}")
+
+            for urn, entity in entities.items():
+                query_urn = QueryUrn.from_string(urn)
+
+                properties_tuple = entity.get(
+                    QueryPropertiesClass.ASPECT_NAME, (None, None)
+                )
+                query_properties: Optional[QueryPropertiesClass] = None
+                if properties_tuple and properties_tuple[0]:
+                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
+                    query_properties = properties_tuple[0]
+
+                subjects_tuple = entity.get(
+                    QuerySubjectsClass.ASPECT_NAME, (None, None)
+                )
+                query_subjects: Optional[QuerySubjectsClass] = None
+                if subjects_tuple and subjects_tuple[0]:
+                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
+                    query_subjects = subjects_tuple[0]
+
+                entities_by_urn[query_urn] = (query_properties, query_subjects)
+
+        return entities_by_urn
+
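_fetch_query_entities pages through the candidate URNs rather than requesting them all at once, issuing one get_entities call per page_size slice. The slicing idiom it relies on is plain Python; a dependency-free sketch of that step, with made-up URNs (the helper name is illustrative, not part of the DataHub SDK):

from typing import Iterator, List, Sequence, TypeVar

T = TypeVar("T")

def paginate(items: Sequence[T], page_size: int) -> Iterator[List[T]]:
    """Yield consecutive slices of at most page_size items."""
    for i in range(0, len(items), page_size):
        yield list(items[i : i + page_size])

# e.g. 250 hypothetical query URNs split into pages of 100
urns = [f"urn:li:query:q{n}" for n in range(250)]
assert [len(batch) for batch in paginate(urns, 100)] == [100, 100, 50]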
+    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
+        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
+        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)
+
+        urns = self.datahub_client.search.get_urns(
+            filter=F.and_(
+                F.entity_type(QueryUrn.ENTITY_TYPE),
+                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "GREATER_THAN_OR_EQUAL_TO",
+                    [str(last_modified_start_at_millis)],
+                ),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "LESS_THAN_OR_EQUAL_TO",
+                    [str(last_modified_end_at_millis)],
+                ),
+            ),
+        )
+        logger.debug(f"Get URNS by filter: {urns}")
+        return [QueryUrn.from_string(urn.urn()) for urn in urns]
+
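The search filter compares lastModifiedAt against epoch milliseconds, which is why the start and end datetimes are converted with datetime_to_ts_millis before being passed as filter values. A standard-library sketch of an equivalent conversion, assuming naive datetimes are treated as UTC (the actual datetime_to_ts_millis helper may differ in that detail):

from datetime import datetime, timezone

def to_ts_millis(dt: datetime) -> int:
    """Convert a datetime to epoch milliseconds, assuming UTC for naive datetimes."""
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return int(dt.timestamp() * 1000)

start = datetime(2024, 1, 1, tzinfo=timezone.utc)
end = datetime(2024, 1, 2, tzinfo=timezone.utc)
print(to_ts_millis(start), to_ts_millis(end))  # 1704067200000 1704153600000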
+    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
+        """
+        Extract project ID and workspace name from SQL statement.
+
+        Looks for Hex metadata in SQL comments in the format:
+        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}
+
+        Example:
+        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}
+
+        # TODO: Consider supporting multiline metadata format in the future:
+        # -- Hex query metadata: {
+        # -- "categories": ["Scratchpad"],
+        # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
+        # -- ...
+        # -- }
+
+        Returns:
+            A tuple of (project_id, workspace_name) if both are successfully extracted
+            None if extraction fails for any reason
+        """
+        # Extract both project_id and workspace name in a single regex operation
+        match = re.search(HEX_METADATA_PATTERN, sql_statement)
+
+        if not match:
+            return None
+
+        try:
+            project_id = match.group(1)
+            workspace_name = match.group(2)
+            return project_id, workspace_name
+        except (IndexError, AttributeError) as e:
+            self.report.warning(
+                title="Failed to extract information from Hex query metadata",
+                message="Failed to extract information from Hex query metadata will result on missing lineage",
+                context=sql_statement,
+                exc=e,
+            )
+
+        return None
+
+    def _build_query_response(
+        self,
+        query_urn: QueryUrn,
+        query_properties: Optional[QueryPropertiesClass],
+        query_subjects: Optional[QuerySubjectsClass],
+    ) -> Optional[QueryResponse]:
+        # Skip if missing required aspects
+        if (
+            not query_properties
+            or not query_properties.statement
+            or not query_properties.statement.value
+            or not query_subjects
+            or query_subjects.subjects is None  # empty list is allowed
+        ):
+            logger.debug(
+                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
+            )
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        # Extract hex metadata (project_id and workspace_name)
+        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
+        if not metadata_result:
+            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        hex_project_id, workspace_from_url = metadata_result
+
+        # Validate workspace
+        if workspace_from_url != self.workspace_name:
+            logger.debug(
+                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
+            )
+            self.report.filtered_out_queries_different_workspace += 1
+            return None
+
+        # Extract subjects
+        dataset_subjects: List[DatasetUrn] = []
+        schema_field_subjects: List[SchemaFieldUrn] = []
+        for subject in query_subjects.subjects:
+            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
+                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
+            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
+                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))
+
+        if not dataset_subjects and not schema_field_subjects:
+            self.report.filtered_out_queries_no_subjects += 1
+            return None
+
+        # Create response
+        response = QueryResponse(
+            urn=query_urn,
+            hex_project_id=hex_project_id,
+            dataset_subjects=dataset_subjects,
+            schema_field_subjects=schema_field_subjects,
+        )
+        logger.debug(
+            f"Succesfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
+        )
+        self.report.total_queries += 1
+        self.report.total_dataset_subjects += len(dataset_subjects)
+        self.report.total_schema_field_subjects += len(schema_field_subjects)
+
+        logger.debug(
+            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
+        )
+
+        return response
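_build_query_response classifies each query's subjects into dataset-level and column-level lineage purely by URN prefix before counting them into the report. A small standalone sketch of that classification step, using made-up URN strings in place of the DatasetUrn/SchemaFieldUrn wrappers:

from typing import List, Tuple

def split_subjects(subject_urns: List[str]) -> Tuple[List[str], List[str]]:
    """Separate dataset URNs from schemaField URNs; anything else is ignored."""
    datasets = [u for u in subject_urns if u.startswith("urn:li:dataset:")]
    fields = [u for u in subject_urns if u.startswith("urn:li:schemaField:")]
    return datasets, fields

subjects = [
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)",
    "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD),order_id)",
]
datasets, fields = split_subjects(subjects)
assert len(datasets) == 1 and len(fields) == 1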
--- a/datahub/ingestion/source/iceberg/iceberg.py
+++ b/datahub/ingestion/source/iceberg/iceberg.py
@@ -425,23 +425,21 @@ class IcebergSource(StatefulIngestionSourceBase):
     def _get_dataset_properties_aspect(
         self, dataset_name: str, table: Table
     ) -> DatasetPropertiesClass:
-
+        created: Optional[TimeStampClass] = None
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        last_modified: Optional[int] = table.metadata.last_updated_ms
         if table.current_snapshot():
             custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
             custom_properties["manifest-list"] = table.current_snapshot().manifest_list
-
-                int(table.current_snapshot().timestamp_ms)
-            )
+            if not last_modified:
+                last_modified = int(table.current_snapshot().timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
-
-                    int(dt.timestamp() * 1000)
-                )
+                created = TimeStampClass(int(dt.timestamp() * 1000))
             except Exception as ex:
                 LOGGER.warning(
                     f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
@@ -451,8 +449,10 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
-            lastModified=
-
+            lastModified=TimeStampClass(last_modified)
+            if last_modified is not None
+            else None,
+            created=created,
             qualifiedName=dataset_name,
         )

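The Iceberg change pre-computes created and last_modified before building DatasetPropertiesClass: last_modified prefers the table metadata's last_updated_ms and only falls back to the current snapshot's timestamp_ms when that value is missing. A dependency-free sketch of that fallback, with plain Optional[int] values standing in for the Iceberg objects:

from typing import Optional

def resolve_last_modified(
    last_updated_ms: Optional[int], snapshot_timestamp_ms: Optional[int]
) -> Optional[int]:
    """Prefer table-level last_updated_ms; fall back to the current snapshot's timestamp."""
    if last_updated_ms:
        return last_updated_ms
    return snapshot_timestamp_ms

assert resolve_last_modified(1700000000000, 1600000000000) == 1700000000000
assert resolve_last_modified(None, 1600000000000) == 1600000000000
assert resolve_last_modified(None, None) is None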
--- a/datahub/ingestion/source/mlflow.py
+++ b/datahub/ingestion/source/mlflow.py
@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import
+from datahub.emitter.mcp_builder import ExperimentKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -77,10 +77,6 @@ from datahub.sdk.dataset import Dataset
 T = TypeVar("T")


-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
@@ -252,7 +248,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         self, experiment: Experiment
     ) -> Iterable[MetadataWorkUnit]:
         experiment_container = Container(
-            container_key=
+            container_key=ExperimentKey(
                 platform=str(DataPlatformUrn(platform_name=self.platform)),
                 id=experiment.name,
             ),
@@ -470,7 +466,7 @@ class MLflowSource(StatefulIngestionSourceBase):
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
-        experiment_key =
+        experiment_key = ExperimentKey(
             platform=str(DataPlatformUrn(self.platform)), id=experiment.name
         )

--- a/datahub/ingestion/source/powerbi/powerbi.py
+++ b/datahub/ingestion/source/powerbi/powerbi.py
@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.metadata.urns import ChartUrn
+from datahub.metadata.urns import ChartUrn, DatasetUrn
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -1083,6 +1083,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dataset_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            datasetEdges=dataset_edges,
         )

         info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)

+        # collect all upstream datasets; using a set to retain unique urns
+        dataset_urns = {
+            dataset.entityUrn
+            for dataset in ds_mcps
+            if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+        }
+        dataset_edges = [
+            EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+        ]
+
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
             workspace=workspace,
             report=report,
             chart_mcps=chart_mcps,
             user_mcps=user_mcps,
+            dataset_edges=dataset_edges,
         )

         # Now add MCPs in sequence
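The PowerBI change derives dashboard-to-dataset lineage by deduplicating the dataset URNs found among the already-built dataset MCPs with a set comprehension, then wrapping each unique URN in an edge. A minimal standalone sketch of that dedup-then-wrap step, with plain dicts standing in for the MCP objects and for EdgeClass:

from typing import Dict, List

# Hypothetical stand-ins for MetadataChangeProposalWrapper objects
ds_mcps: List[Dict[str, str]] = [
    {"entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,tableA,PROD)"},
    {"entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,tableA,PROD)"},  # duplicate
    {"entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,tableB,PROD)"},
]

# Set comprehension keeps each upstream dataset URN exactly once
dataset_urns = {m["entityUrn"] for m in ds_mcps if m["entityType"] == "dataset" and m["entityUrn"]}

# Each unique URN becomes one edge (EdgeClass(destinationUrn=...) in the real code)
dataset_edges = [{"destinationUrn": urn} for urn in dataset_urns]
assert len(dataset_edges) == 2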
--- a/datahub/ingestion/source/sql/trino.py
+++ b/datahub/ingestion/source/sql/trino.py
@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
     if catalog_name is None:
         raise exc.NoSuchTableError("catalog is required in connection")
     connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-    if
-
-
+    if (
+        connector_name is not None
+        and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+    ):
         properties_table = self._get_full_table(f"{table_name}$properties", schema)
         query = f"SELECT * FROM {properties_table}"
         row = connection.execute(sql.text(query)).fetchone()
--- a/datahub/ingestion/source/state/stale_entity_removal_handler.py
+++ b/datahub/ingestion/source/state/stale_entity_removal_handler.py
@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
         description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
         le=100.0,
         ge=0.0,
-        hidden_from_docs=True,
     )
