acryl-datahub 1.0.0.1rc3__py3-none-any.whl → 1.0.0.1rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default=None,
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

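Note on the hunk above: with the old `default=None`, soft-deleted `query` and `dataProcessInstance` entities were only reachable through the separate GraphQL helper this release removes, so the config now ships an explicit default list naming every supported type. A minimal, self-contained sketch of the override behaviour, using plain pydantic as a stand-in for the package's `ConfigModel` (the sketch class is illustrative, not the real one):

```python
from typing import List, Optional

from pydantic import BaseModel, Field


class CleanupConfigSketch(BaseModel):
    # Illustrative stand-in for SoftDeletedEntitiesCleanupConfig, not the real class.
    entity_types: Optional[List[str]] = Field(
        default=["dataset", "query", "dataProcessInstance"],
        description="List of entity types to cleanup",
    )


print(CleanupConfigSketch().entity_types)  # explicit default list is used
print(CleanupConfigSketch(entity_types=["dataset"]).entity_types)  # user override wins
```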
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time = 0.0
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
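The new per-type counter uses the same lock-guarded read-modify-write as the existing report counters, because deletions run on a thread pool and update the report concurrently. A minimal standalone sketch of the pattern (plain dict instead of the package's `TopKDict`):

```python
import threading
from concurrent.futures import ThreadPoolExecutor

counts: dict = {}
lock = threading.Lock()


def record(entity_type: str) -> None:
    # Without the lock, concurrent read-modify-write of the same entry can drop updates.
    with lock:
        counts[entity_type] = counts.get(entity_type, 0) + 1


with ThreadPoolExecutor(max_workers=4) as pool:
    for entity_type in ["dataset", "dataJob", "dataset", "query"] * 100:
        pool.submit(record, entity_type)

print(counts)  # e.g. {'dataset': 200, 'dataJob': 100, 'query': 100}
```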
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
                 )
             self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn: str) -> None:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn, entity_urn.entity_type)
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn: str) -> None:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures

-    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-        scroll_id: Optional[str] = None
-
-        batch_size = self.config.batch_size
-        if entity_type == "DATA_PROCESS_INSTANCE":
-            # Due to a bug in Data process instance querying this is a temp workaround
-            # to avoid a giant stacktrace by having a smaller batch size in first call
-            # This will be remove in future version after server with fix has been
-            # around for a while
-            batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-            for query in search_results:
-                yield query["entity"]["urn"]
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=self.config.entity_types,
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (
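In the rewritten `_get_urns`, dataProcessInstance entities are fetched through a separate `get_urns_by_filter` call that adds a `created`-before-cutoff filter, so instances younger than the retention window are never streamed at all; the cutoff is expressed in epoch milliseconds, hence the `* 1000`. A hedged sketch of that arithmetic and of the field/condition/values triple the diff passes to `SearchFilterRule` (the retention value here is illustrative):

```python
from datetime import datetime, timezone

retention_days = 10  # illustrative; the real value comes from the cleanup config

# Entities created after this point are retained; the backend expects epoch milliseconds.
created_from = int(
    (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
)

# Same field/condition/values that the diff hands to SearchFilterRule(...).to_raw().
created_rule = {
    "field": "created",
    "condition": "LESS_THAN",
    "values": [f"{created_from}"],
}
print(created_rule)
```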
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future, str] = dict()
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                     if self._deletion_limit_reached() or self._times_up():
                         break
-                future = executor.submit(self.delete_soft_deleted_entity, urn)
-                futures[future] = urn
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

         logger.info(f"Waiting for {len(futures)} futures to complete")
         while len(futures) > 0:
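Because `_get_urns` yields plain strings from search, the cleanup loop now parses each candidate once up front, counting and skipping invalid URNs instead of letting them fail inside a deletion worker. A small sketch of that defensive parsing, reusing the `Urn` and `InvalidUrnError` imports the diff adds (the sample URNs are illustrative):

```python
from datahub.utilities.urns._urn_base import Urn
from datahub.utilities.urns.error import InvalidUrnError

candidates = [
    "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
    "not-a-valid-urn",  # would previously have surfaced as a failed deletion future
]

invalid = 0
for candidate in candidates:
    try:
        parsed = Urn.from_string(candidate)
    except InvalidUrnError as e:
        invalid += 1
        print(f"Skipping {candidate}: {e}")
        continue
    print(parsed.entity_type, parsed.urn())

print(f"{invalid} invalid urn(s) skipped")
```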
@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
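Databricks gets its own branch because the profiler now computes the median there with `approx_percentile` instead of a `median` aggregate. A hedged sketch of the expression that branch builds, compiled with plain SQLAlchemy against an illustrative table and column (no engine needed just to inspect the SQL):

```python
import sqlalchemy as sa

column = "price"           # illustrative column name
table = sa.table("sales")  # stands in for self.dataset._table in the diff

stmt = sa.select(
    sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
).select_from(table)

print(stmt)
# roughly: SELECT approx_percentile(`price`, 0.5) as approx_median FROM sales
```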
@@ -115,7 +115,7 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS, []):
+        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS) or []:
             # Iterate through response and create a list of PowerBiAPI.Dashboard
             dashboard_id = scanned_dashboard.get("id")
             tags = self._parse_endorsement(
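The recurring `.get(key, [])` → `.get(key) or []` change in this file handles scan payloads where the key is present but explicitly `null`: `dict.get` only falls back to its default when the key is missing entirely. A short standalone sketch of the difference:

```python
scan_result = {"dashboards": None}  # key present, value explicitly null in the payload

print(scan_result.get("dashboards", []))    # -> None (default is NOT used)
print(scan_result.get("dashboards") or [])  # -> []   (safe to iterate)

for dashboard in scan_result.get("dashboards") or []:
    print(dashboard)  # loop body simply never runs
```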
@@ -133,17 +133,17 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        reports: List[dict] = scan_result.get(Constant.REPORTS, [])
+        reports: List[dict] = scan_result.get(Constant.REPORTS) or []

         for report in reports:
-            report_id = report.get(Constant.ID, None)
+            report_id = report.get(Constant.ID)
             if report_id is None:
                 logger.warning(
                     f"Report id is none. Skipping endorsement tag for report instance {report}"
                 )
                 continue
             endorsements = self._parse_endorsement(
-                report.get(Constant.ENDORSEMENT_DETAIL, None)
+                report.get(Constant.ENDORSEMENT_DETAIL)
             )
             results[report_id] = endorsements

@@ -339,7 +339,7 @@ class PowerBiAPI:
         if not endorsements:
             return []

-        endorsement = endorsements.get(Constant.ENDORSEMENT, None)
+        endorsement = endorsements.get(Constant.ENDORSEMENT)
         if not endorsement:
             return []

@@ -396,7 +396,7 @@ class PowerBiAPI:

             if self.__config.extract_endorsements_to_tags:
                 dataset_instance.tags = self._parse_endorsement(
-                    dataset_dict.get(Constant.ENDORSEMENT_DETAIL, None)
+                    dataset_dict.get(Constant.ENDORSEMENT_DETAIL)
                 )

             dataset_map[dataset_instance.id] = dataset_instance
@@ -407,7 +407,7 @@ class PowerBiAPI:
                 else dataset_instance.id
             )
             logger.debug(f"dataset_dict = {dataset_dict}")
-            for table in dataset_dict.get(Constant.TABLES, []):
+            for table in dataset_dict.get(Constant.TABLES) or []:
                 expression: Optional[str] = (
                     table[Constant.SOURCE][0][Constant.EXPRESSION]
                     if table.get(Constant.SOURCE) is not None
@@ -430,10 +430,10 @@ class PowerBiAPI:
                             column["dataType"], FIELD_TYPE_MAPPING["Null"]
                         ),
                     )
-                    for column in table.get("columns", [])
+                    for column in table.get("columns") or []
                 ],
                 measures=[
-                    Measure(**measure) for measure in table.get("measures", [])
+                    Measure(**measure) for measure in table.get("measures") or []
                 ],
                 dataset=dataset_instance,
                 row_count=None,
@@ -480,7 +480,7 @@ class PowerBiAPI:
                     )
                 )
                 if app_id is None:  # In PowerBI one workspace can have one app
-                    app_id = report.get(Constant.APP_ID)
+                    app_id = report[Constant.APP_ID]

         raw_app_dashboards: List[Dict] = []
         # Filter app dashboards
@@ -488,7 +488,7 @@ class PowerBiAPI:
             if dashboard.get(Constant.APP_ID):
                 raw_app_dashboards.append(dashboard)
                 if app_id is None:  # In PowerBI, one workspace contains one app
-                    app_id = report[Constant.APP_ID]
+                    app_id = dashboard[Constant.APP_ID]

         # workspace doesn't have an App. Above two loops can be avoided
         # if app_id is available at root level in workspace_metadata
@@ -230,7 +230,8 @@ class RedshiftSqlLineageV2(Closeable):
         )

         # Populate lineage for external tables.
-        self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)

     def _populate_lineage_agg(
         self,
@@ -400,6 +401,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (
@@ -407,6 +412,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                     and schema.external_platform
                 ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                     # external_db_params = schema.option
                     upstream_platform = schema.external_platform.lower()

@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
         SELECT
             schema_name,
             schema_type,
-            schema_option,
+            cast(null as varchar(1024)) as schema_option,
             cast(null as varchar(256)) as external_platform,
             cast(null as varchar(256)) as external_database
         FROM svv_redshift_schemas