acryl-datahub 1.0.0.2rc1__py3-none-any.whl → 1.0.0.2rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.2rc1.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/METADATA +2566 -2566
- {acryl_datahub-1.0.0.2rc1.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/RECORD +13 -12
- datahub/_version.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +297 -0
- datahub/ingestion/source/superset.py +108 -81
- {acryl_datahub-1.0.0.2rc1.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.2rc1.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc1.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc1.dist-info → acryl_datahub-1.0.0.2rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/hex/query_fetcher.py (new file)
@@ -0,0 +1,297 @@

import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, Iterable, List, Optional, Tuple

from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.source.hex.constants import (
    DATAHUB_API_PAGE_SIZE_DEFAULT,
    HEX_PLATFORM_URN,
)
from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F
from datahub.utilities.time import datetime_to_ts_millis

logger = logging.getLogger(__name__)

# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'


@dataclass
class QueryResponse:
    """This is the public response model for the HexQueryFetcher."""

    urn: QueryUrn
    hex_project_id: str
    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)


@dataclass
class HexQueryFetcherReport(SourceReport):
    start_datetime: Optional[datetime] = None
    end_datetime: Optional[datetime] = None
    fetched_query_urns: int = 0
    fetched_query_objects: int = 0
    filtered_out_queries_missing_metadata: int = 0
    filtered_out_queries_different_workspace: int = 0
    filtered_out_queries_no_subjects: int = 0
    total_queries: int = 0
    total_dataset_subjects: int = 0
    total_schema_field_subjects: int = 0
    num_calls_fetch_query_entities: int = 0


class HexQueryFetcher:
    def __init__(
        self,
        datahub_client: DataHubClient,
        workspace_name: str,
        start_datetime: datetime,
        end_datetime: datetime,
        report: HexQueryFetcherReport,
        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
    ):
        self.datahub_client = datahub_client
        self.workspace_name = workspace_name
        self.start_datetime = start_datetime
        self.end_datetime = end_datetime
        self.report = report
        self.page_size = page_size

        self.report.start_datetime = start_datetime
        self.report.end_datetime = end_datetime

    def fetch(self) -> Iterable[QueryResponse]:
        try:
            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
            self.report.fetched_query_urns = len(query_urns)

            entities_by_urn = self._fetch_query_entities(query_urns)
            self.report.fetched_query_objects = len(entities_by_urn)
        except Exception as e:
            self.report.failure(
                title="Error fetching Queries for lineage",
                message="Error fetching Queries will result in missing lineage",
                context=str(
                    dict(
                        workspace_name=self.workspace_name,
                        start_datetime=self.start_datetime,
                        end_datetime=self.end_datetime,
                    )
                ),
                exc=e,
            )
        else:
            if not query_urns or not entities_by_urn:
                self.report.warning(
                    title="No Queries found with Hex as origin",
                    message="No lineage because no Queries with Hex as origin were found in the given time range; consider extending the time range to fetch more queries.",
                    context=str(
                        dict(
                            workspace_name=self.workspace_name,
                            start_datetime=self.start_datetime,
                            end_datetime=self.end_datetime,
                        )
                    ),
                )
                return

            for query_urn, (
                query_properties,
                query_subjects,
            ) in entities_by_urn.items():
                maybe_query_response = self._build_query_response(
                    query_urn=query_urn,
                    query_properties=query_properties,
                    query_subjects=query_subjects,
                )
                if maybe_query_response:
                    yield maybe_query_response

    def _fetch_query_entities(
        self, query_urns: List[QueryUrn]
    ) -> Dict[
        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
    ]:
        entities_by_urn: Dict[
            QueryUrn,
            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
        ] = {}
        for i in range(0, len(query_urns), self.page_size):
            batch = query_urns[i : i + self.page_size]

            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
            entities = self.datahub_client._graph.get_entities(
                entity_name=QueryUrn.ENTITY_TYPE,
                urns=[urn.urn() for urn in batch],
                aspects=[
                    QueryPropertiesClass.ASPECT_NAME,
                    QuerySubjectsClass.ASPECT_NAME,
                ],
                with_system_metadata=False,
            )
            self.report.num_calls_fetch_query_entities += 1
            logger.debug(f"Get entities response: {entities}")

            for urn, entity in entities.items():
                query_urn = QueryUrn.from_string(urn)

                properties_tuple = entity.get(
                    QueryPropertiesClass.ASPECT_NAME, (None, None)
                )
                query_properties: Optional[QueryPropertiesClass] = None
                if properties_tuple and properties_tuple[0]:
                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
                    query_properties = properties_tuple[0]

                subjects_tuple = entity.get(
                    QuerySubjectsClass.ASPECT_NAME, (None, None)
                )
                query_subjects: Optional[QuerySubjectsClass] = None
                if subjects_tuple and subjects_tuple[0]:
                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
                    query_subjects = subjects_tuple[0]

                entities_by_urn[query_urn] = (query_properties, query_subjects)

        return entities_by_urn

    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)

        urns = self.datahub_client.search.get_urns(
            filter=F.and_(
                F.entity_type(QueryUrn.ENTITY_TYPE),
                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
                F.custom_filter(
                    "lastModifiedAt",
                    "GREATER_THAN_OR_EQUAL_TO",
                    [str(last_modified_start_at_millis)],
                ),
                F.custom_filter(
                    "lastModifiedAt",
                    "LESS_THAN_OR_EQUAL_TO",
                    [str(last_modified_end_at_millis)],
                ),
            ),
        )
        logger.debug(f"Get URNs by filter: {urns}")
        return [QueryUrn.from_string(urn.urn()) for urn in urns]

    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
        """
        Extract the project ID and workspace name from a SQL statement.

        Looks for Hex metadata in SQL comments in the format:
        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}

        Example:
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}

        # TODO: Consider supporting multiline metadata format in the future:
        # -- Hex query metadata: {
        # --   "categories": ["Scratchpad"],
        # --   "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
        # --   ...
        # -- }

        Returns:
            A tuple of (project_id, workspace_name) if both are successfully extracted;
            None if extraction fails for any reason.
        """
        # Extract both project_id and workspace name in a single regex operation
        match = re.search(HEX_METADATA_PATTERN, sql_statement)

        if not match:
            return None

        try:
            project_id = match.group(1)
            workspace_name = match.group(2)
            return project_id, workspace_name
        except (IndexError, AttributeError) as e:
            self.report.warning(
                title="Failed to extract information from Hex query metadata",
                message="Failed to extract information from Hex query metadata; this will result in missing lineage",
                context=sql_statement,
                exc=e,
            )

        return None

    def _build_query_response(
        self,
        query_urn: QueryUrn,
        query_properties: Optional[QueryPropertiesClass],
        query_subjects: Optional[QuerySubjectsClass],
    ) -> Optional[QueryResponse]:
        # Skip if missing required aspects
        if (
            not query_properties
            or not query_properties.statement
            or not query_properties.statement.value
            or not query_subjects
            or query_subjects.subjects is None  # empty list is allowed
        ):
            logger.debug(
                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
            )
            self.report.filtered_out_queries_missing_metadata += 1
            return None

        # Extract hex metadata (project_id and workspace_name)
        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
        if not metadata_result:
            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
            self.report.filtered_out_queries_missing_metadata += 1
            return None

        hex_project_id, workspace_from_url = metadata_result

        # Validate workspace
        if workspace_from_url != self.workspace_name:
            logger.debug(
                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
            )
            self.report.filtered_out_queries_different_workspace += 1
            return None

        # Extract subjects
        dataset_subjects: List[DatasetUrn] = []
        schema_field_subjects: List[SchemaFieldUrn] = []
        for subject in query_subjects.subjects:
            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))

        if not dataset_subjects and not schema_field_subjects:
            self.report.filtered_out_queries_no_subjects += 1
            return None

        # Create response
        response = QueryResponse(
            urn=query_urn,
            hex_project_id=hex_project_id,
            dataset_subjects=dataset_subjects,
            schema_field_subjects=schema_field_subjects,
        )
        logger.debug(
            f"Successfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
        )
        self.report.total_queries += 1
        self.report.total_dataset_subjects += len(dataset_subjects)
        self.report.total_schema_field_subjects += len(schema_field_subjects)

        logger.debug(
            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
        )

        return response
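To make the extraction concrete, here is a small standalone sketch (not part of the package) showing HEX_METADATA_PATTERN pulling the project id and workspace name out of a Hex-generated SQL comment; the sample statement is a trimmed version of the docstring example above, so the values are illustrative only:

import re

# Same pattern as HEX_METADATA_PATTERN in query_fetcher.py above.
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

# Sample SQL trailer, shortened from the docstring example (illustrative values).
sql_statement = (
    "SELECT 1\n"
    '-- Hex query metadata: {"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}'
)

match = re.search(HEX_METADATA_PATTERN, sql_statement)
assert match is not None
project_id, workspace_name = match.group(1), match.group(2)
print(project_id)      # d73da67d-c87b-4dd8-9e7f-b79cb7f822cf
print(workspace_name)  # acryl-partnership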
datahub/ingestion/source/superset.py:

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
@@ -100,6 +101,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)

@@ -210,6 +212,11 @@ class SupersetConfig(
         default=10, description="Timeout of single API call to superset."
     )

+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to the CPU count, or 40 if it cannot be determined.",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
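A note on the new default: 40 is used only when os.cpu_count() returns None (as it can in some restricted environments); it is not an upper or lower bound on the detected CPU count. A minimal illustration:

import os

# Mirrors default_factory=lambda: os.cpu_count() or 40 from the config above.
max_threads = os.cpu_count() or 40
print(f"max_threads defaults to {max_threads}")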
@@ -339,6 +346,7 @@ class SupersetSource(StatefulIngestionSourceBase):

         if response.status_code != 200:
             logger.warning(f"Failed to get {entity_type} data: {response.text}")
+            continue

         payload = response.json()
         # Update total_items with the actual count from the response
@@ -501,33 +509,41 @@
The inline per-dashboard loop in emit_dashboard_mces is replaced by a _process_dashboard worker plus a threaded fan-out:

    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
        dashboard_title = ""
        try:
            dashboard_id = str(dashboard_data.get("id"))
            dashboard_title = dashboard_data.get("dashboard_title", "")
            if not self.config.dashboard_pattern.allowed(dashboard_title):
                self.report.report_dropped(
                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                )
                return
            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
        except Exception as e:
            self.report.warning(
                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
            )
            return
        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
        yield from self._get_domain_wu(
            title=dashboard_title, entity_urn=dashboard_snapshot.urn
        )

    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
        dashboard_data_list = [
            (dashboard_data,)
            for dashboard_data in self.paginate_entity_api_results(
                "dashboard/", PAGE_SIZE
            )
        ]

        yield from ThreadedIteratorExecutor.process(
            worker_func=self._process_dashboard,
            args_list=dashboard_data_list,
            max_workers=self.config.max_threads,
        )
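The new emit_*_mces methods all lean on ThreadedIteratorExecutor.process to fan generator workers out across threads, with each args tuple apparently unpacked into the worker (hence the one-element (dashboard_data,) tuples). The following is a simplified, standard-library-only sketch of that pattern, not DataHub's actual implementation (which streams results rather than buffering each worker's output as this sketch does):

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Callable, Iterable, Iterator, List, Tuple


def threaded_fan_out(
    worker_func: Callable[..., Iterable[Any]],
    args_list: List[Tuple[Any, ...]],
    max_workers: int,
) -> Iterator[Any]:
    """Run worker_func(*args) for each args tuple in a thread pool and
    yield everything the workers yield (buffered per worker in this sketch)."""

    def run(args: Tuple[Any, ...]) -> List[Any]:
        return list(worker_func(*args))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(run, args) for args in args_list]
        for future in as_completed(futures):
            yield from future.result()


def process_dashboard(dashboard_data: dict) -> Iterable[str]:
    # Stand-in for SupersetSource._process_dashboard: yield zero or more work units.
    yield f"workunit for dashboard {dashboard_data['id']}"


if __name__ == "__main__":
    dashboards = [({"id": i},) for i in range(5)]  # one-element tuples, as in the diff
    for workunit in threaded_fan_out(process_dashboard, dashboards, max_workers=4):
        print(workunit)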
@@ -762,40 +778,46 @@
The chart loop gets the same treatment: the body of emit_chart_mces moves into a _process_chart worker (keeping the chart_pattern filter and the dataset_pattern warning), and charts are processed in parallel:

    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
        chart_name = ""
        try:
            chart_id = str(chart_data.get("id"))
            chart_name = chart_data.get("slice_name", "")
            if not self.config.chart_pattern.allowed(chart_name):
                self.report.report_dropped(
                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
                )
                return
            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
                datasource_id = chart_data.get("datasource_id")
                if datasource_id:
                    dataset_response = self.get_dataset_info(datasource_id)
                    dataset_name = dataset_response.get("result", {}).get(
                        "table_name", ""
                    )
                    if dataset_name and not self.config.dataset_pattern.allowed(
                        dataset_name
                    ):
                        self.report.warning(
                            f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
                        )
            yield from self.construct_chart_from_chart_data(chart_data)
        except Exception as e:
            self.report.warning(
                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
            )
            return

    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
        chart_data_list = [
            (chart_data,)
            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
        ]
        yield from ThreadedIteratorExecutor.process(
            worker_func=self._process_chart,
            args_list=chart_data_list,
            max_workers=self.config.max_threads,
        )
@@ -1023,33 +1045,38 @@
Datasets follow the same pattern, with the loop body of emit_dataset_mces moved into a _process_dataset worker:

    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
        dataset_name = ""
        try:
            dataset_name = dataset_data.get("table_name", "")
            if not self.config.dataset_pattern.allowed(dataset_name):
                self.report.report_dropped(
                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
                )
                return
            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        except Exception as e:
            self.report.warning(
                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
            )
            return
        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        yield from self._get_domain_wu(
            title=dataset_data.get("table_name", ""),
            entity_urn=dataset_snapshot.urn,
        )

    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
        dataset_data_list = [
            (dataset_data,)
            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
        ]
        yield from ThreadedIteratorExecutor.process(
            worker_func=self._process_dataset,
            args_list=dataset_data_list,
            max_workers=self.config.max_threads,
        )
The remaining files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) have no content changes.