acryl-datahub 1.0.0.1rc3__py3-none-any.whl → 1.0.0.1rc5__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc5.dist-info}/METADATA +2417 -2417
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc5.dist-info}/RECORD +22 -22
- datahub/_version.py +1 -1
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +8 -0
- datahub/errors.py +4 -0
- datahub/ingestion/graph/filters.py +22 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/superset.py +153 -13
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/main_client.py +2 -1
- datahub/sdk/search_filters.py +18 -23
- datahub/sql_parsing/split_statements.py +12 -2
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

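The inline comment above states the motivation: without an explicit default, `query` and `dataProcessInstance` entities would never be included in the cleanup. A minimal sketch of how the new default behaves, assuming the remaining config fields keep their own defaults (the import path matches the file listed at the top of this diff; the narrowed list is illustrative):

```python
from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
    SoftDeletedEntitiesCleanupConfig,
)

# With no explicit entity_types, the new default list now covers query and
# dataProcessInstance entities as well.
config = SoftDeletedEntitiesCleanupConfig()
assert "query" in (config.entity_types or [])
assert "dataProcessInstance" in (config.entity_types or [])

# Overriding the field still narrows the cleanup to the listed types only.
narrow = SoftDeletedEntitiesCleanupConfig(entity_types=["dataset", "container"])
```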
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time =
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
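The new `_increment_retained_by_type` helper follows the same pattern as the existing counters: every report mutation happens under the shared `_report_lock`, so worker threads can bump the per-type tally safely. A generic sketch of that pattern, independent of the DataHub classes:

```python
from threading import Lock
from typing import Dict


class Report:
    def __init__(self) -> None:
        self._lock = Lock()
        self.retained_by_type: Dict[str, int] = {}

    def increment_retained(self, entity_type: str) -> None:
        # A read-modify-write on a shared dict is not atomic, so guard it.
        with self._lock:
            self.retained_by_type[entity_type] = (
                self.retained_by_type.get(entity_type, 0) + 1
            )
```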
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
         )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn,
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future,
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures

-    def
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
-
-        # to avoid a giant stacktrace by having a smaller batch size in first call
-        # This will be remove in future version after server with fix has been
-        # around for a while
-        batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-
-
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (
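The rewritten `_get_urns` drops the hand-rolled GraphQL scroll in favour of `get_urns_by_filter`, and restricts `dataProcessInstance` entities to those created before the retention window. The cutoff is an epoch timestamp in milliseconds; a small sketch of the arithmetic and the extra filter it feeds, mirroring the diff above (the retention value here is made up):

```python
from datetime import datetime, timezone

from datahub.ingestion.graph.filters import SearchFilterRule

retention_days = 10  # illustrative; the source takes this from its config

# Epoch seconds minus the retention window, scaled to epoch milliseconds.
created_from = int(
    (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
)

# Matches only entities created before the cutoff; passed via extraFilters above.
created_filter = SearchFilterRule(
    field="created",
    condition="LESS_THAN",
    values=[str(created_from)],
).to_raw()
print(created_filter)
```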
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future,
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(
-
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

         logger.info(f"Waiting for {len(futures)} futures to complete")
         while len(futures) > 0:
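The cleanup loop now parses each raw urn string up front and skips anything that fails validation, instead of letting a worker thread fail later. A standalone sketch of that guard, using the same `Urn` and `InvalidUrnError` imports the diff adds (the helper name and the sample urn are illustrative):

```python
from typing import Iterable, Iterator

from datahub.utilities.urns._urn_base import Urn
from datahub.utilities.urns.error import InvalidUrnError


def parse_valid_urns(raw_urns: Iterable[str]) -> Iterator[Urn]:
    for raw in raw_urns:
        try:
            yield Urn.from_string(raw)
        except InvalidUrnError as exc:
            # Count/log and move on; one malformed urn should not abort the run.
            print(f"Skipping invalid urn {raw}: {exc}")


urn = Urn.from_string("urn:li:corpuser:datahub")
print(urn.entity_type, urn.urn())  # both accessors are used by the cleanup above
```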

datahub/ingestion/source/ge_data_profiler.py

@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower()
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
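The new branch computes a median for Databricks by asking for `approx_percentile(col, 0.5)` through a textual SQLAlchemy select. A hedged sketch of the same construct against made-up column and table names, assuming a 1.4-style `select()` as the diff itself uses:

```python
import sqlalchemy as sa

column = "price"  # illustrative column name
table = sa.table("my_table")  # illustrative table

# Same shape of statement as the Databricks branch above:
#   SELECT approx_percentile(`price`, 0.5) as approx_median FROM my_table
stmt = sa.select(
    sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
).select_from(table)

# engine.execute(stmt).scalar() would then return the approximate median.
print(stmt)
```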

datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py

@@ -115,7 +115,7 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS
+        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS) or []:
             # Iterate through response and create a list of PowerBiAPI.Dashboard
             dashboard_id = scanned_dashboard.get("id")
             tags = self._parse_endorsement(
@@ -133,17 +133,17 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        reports: List[dict] = scan_result.get(Constant.REPORTS
+        reports: List[dict] = scan_result.get(Constant.REPORTS) or []

         for report in reports:
-            report_id = report.get(Constant.ID
+            report_id = report.get(Constant.ID)
             if report_id is None:
                 logger.warning(
                     f"Report id is none. Skipping endorsement tag for report instance {report}"
                 )
                 continue
             endorsements = self._parse_endorsement(
-                report.get(Constant.ENDORSEMENT_DETAIL
+                report.get(Constant.ENDORSEMENT_DETAIL)
             )
             results[report_id] = endorsements

@@ -339,7 +339,7 @@ class PowerBiAPI:
         if not endorsements:
             return []

-        endorsement = endorsements.get(Constant.ENDORSEMENT
+        endorsement = endorsements.get(Constant.ENDORSEMENT)
         if not endorsement:
             return []

@@ -396,7 +396,7 @@ class PowerBiAPI:

         if self.__config.extract_endorsements_to_tags:
             dataset_instance.tags = self._parse_endorsement(
-                dataset_dict.get(Constant.ENDORSEMENT_DETAIL
+                dataset_dict.get(Constant.ENDORSEMENT_DETAIL)
             )

         dataset_map[dataset_instance.id] = dataset_instance
@@ -407,7 +407,7 @@ class PowerBiAPI:
                 else dataset_instance.id
             )
             logger.debug(f"dataset_dict = {dataset_dict}")
-            for table in dataset_dict.get(Constant.TABLES
+            for table in dataset_dict.get(Constant.TABLES) or []:
                 expression: Optional[str] = (
                     table[Constant.SOURCE][0][Constant.EXPRESSION]
                     if table.get(Constant.SOURCE) is not None
@@ -430,10 +430,10 @@ class PowerBiAPI:
                         column["dataType"], FIELD_TYPE_MAPPING["Null"]
                     ),
                 )
-                for column in table.get("columns"
+                for column in table.get("columns") or []
             ],
             measures=[
-                Measure(**measure) for measure in table.get("measures"
+                Measure(**measure) for measure in table.get("measures") or []
             ],
             dataset=dataset_instance,
             row_count=None,
@@ -480,7 +480,7 @@ class PowerBiAPI:
             )
         )
         if app_id is None:  # In PowerBI one workspace can have one app
-            app_id = report
+            app_id = report[Constant.APP_ID]

         raw_app_dashboards: List[Dict] = []
         # Filter app dashboards
@@ -488,7 +488,7 @@ class PowerBiAPI:
             if dashboard.get(Constant.APP_ID):
                 raw_app_dashboards.append(dashboard)
             if app_id is None:  # In PowerBI, one workspace contains one app
-                app_id =
+                app_id = dashboard[Constant.APP_ID]

         # workspace doesn't have an App. Above two loops can be avoided
         # if app_id is available at root level in workspace_metadata
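Most of the Power BI changes harden dictionary lookups against missing keys and explicitly-null values (the removed sides of several lines are truncated in this view). The recurring `dict.get(key) or []` pattern works like this:

```python
payload = {"tables": None}  # key present, but the API returned an explicit null

print(payload.get("tables", []))    # -> None: the default only applies when the key is missing
print(payload.get("tables") or [])  # -> []: also covers the explicit-null case

for table in payload.get("tables") or []:
    ...  # safe to iterate; never raises "'NoneType' object is not iterable"
```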

datahub/ingestion/source/redshift/lineage_v2.py

@@ -230,7 +230,8 @@ class RedshiftSqlLineageV2(Closeable):
         )

         # Populate lineage for external tables.
-        self.
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)

     def _populate_lineage_agg(
         self,
@@ -400,6 +401,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (
@@ -407,6 +412,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                     and schema.external_platform
                 ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                     # external_db_params = schema.option
                     upstream_platform = schema.external_platform.lower()


datahub/ingestion/source/redshift/query.py

@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
         SELECT
             schema_name,
             schema_type,
-            schema_option,
+            cast(null as varchar(1024)) as schema_option,
             cast(null as varchar(256)) as external_platform,
             cast(null as varchar(256)) as external_database
         FROM svv_redshift_schemas