acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)
  1. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2333 -2333
  2. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +44 -44
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
  5. datahub/cli/specific/structuredproperties_cli.py +84 -0
  6. datahub/ingestion/api/source.py +2 -0
  7. datahub/ingestion/graph/client.py +4 -2
  8. datahub/ingestion/source/aws/glue.py +14 -1
  9. datahub/ingestion/source/aws/s3_util.py +24 -1
  10. datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
  11. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  12. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
  13. datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
  14. datahub/ingestion/source/bigquery_v2/usage.py +57 -57
  15. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  16. datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
  17. datahub/ingestion/source/datahub/config.py +6 -0
  18. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  19. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  20. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  21. datahub/ingestion/source/gc/datahub_gc.py +10 -14
  22. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  23. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
  24. datahub/ingestion/source/looker/looker_config.py +8 -3
  25. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  26. datahub/ingestion/source/redshift/redshift.py +32 -34
  27. datahub/ingestion/source/redshift/usage.py +29 -29
  28. datahub/ingestion/source/s3/source.py +10 -14
  29. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  30. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
  31. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
  32. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
  33. datahub/ingestion/source/sql/teradata.py +2 -2
  34. datahub/ingestion/source/tableau/tableau.py +119 -31
  35. datahub/ingestion/source/unity/source.py +71 -71
  36. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  37. datahub/metadata/_schema_classes.py +2 -2
  38. datahub/metadata/_urns/urn_defs.py +15 -15
  39. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  40. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  41. datahub/utilities/perf_timer.py +11 -6
  42. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
  43. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
  44. {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0
@@ -248,9 +248,9 @@ class BigQuerySchemaGenerator:
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage(project.id, METADATA_EXTRACTION)
-        logger.info(f"Processing project: {project.id}")
-        yield from self._process_project(project)
+        with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+            logger.info(f"Processing project: {project.id}")
+            yield from self._process_project(project)
 
     def get_dataplatform_instance_aspect(
         self, dataset_urn: str, project_id: str
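
The recurring change across these hunks swaps the imperative self.report.set_ingestion_stage(...) calls for a report.new_stage(...) context manager, so a stage gets closed out (and its duration recorded) even when the wrapped generator raises or exits early. The real implementation lives in datahub/ingestion/source_report/ingestion_stage.py, which also changed in this release; the sketch below only illustrates the shape of the API and is an assumption, not the actual code:

# Hedged sketch of a context-manager stage reporter; the actual
# IngestionStageReport.new_stage internals are not shown in this diff.
import time
from contextlib import contextmanager


@contextmanager
def new_stage(stage: str):
    start = time.perf_counter()
    print(f"Stage started: {stage}")
    try:
        yield
    finally:
        # Runs even if the body raises, so the stage duration is always recorded.
        print(f"Stage finished: {stage} ({time.perf_counter() - start:.2f}s)")

Callers then wrap each phase, e.g. with report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):, matching the added lines above.
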
@@ -405,11 +405,11 @@ class BigQuerySchemaGenerator:
 
         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            self.report.set_ingestion_stage(project_id, PROFILING)
-            yield from self.profiler.get_workunits(
-                project_id=project_id,
-                tables=db_tables,
-            )
+            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+                yield from self.profiler.get_workunits(
+                    project_id=project_id,
+                    tables=db_tables,
+                )
 
     def _process_project_datasets(
         self,
@@ -1203,9 +1203,9 @@ class BigQuerySchemaGenerator:
             report=self.report,
         )
 
-        self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = round(
-            timer.elapsed_seconds(), 2
-        )
+        self.report.metadata_extraction_sec[
+            f"{project_id}.{dataset.name}"
+        ] = timer.elapsed_seconds(digits=2)
 
     def get_core_table_details(
         self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str
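
Several hunks also replace round(timer.elapsed_seconds(), 2) with timer.elapsed_seconds(digits=2), pushing the rounding into the timer itself (datahub/utilities/perf_timer.py is touched in this release). A minimal sketch of what such a timer could look like; the real PerfTimer differs, and the default digits value here is an assumption:

# Illustrative PerfTimer-style context manager with built-in rounding.
# Not the actual datahub.utilities.perf_timer implementation.
import time
from typing import Optional


class SimplePerfTimer:
    def __enter__(self) -> "SimplePerfTimer":
        self._start = time.perf_counter()
        self._end: Optional[float] = None
        return self

    def __exit__(self, *exc) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self, digits: int = 2) -> float:
        # Works both inside the with-block and after it has exited.
        end = self._end if self._end is not None else time.perf_counter()
        return round(end - self._start, digits)
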
@@ -330,11 +330,11 @@ class BigqueryLineageExtractor:
             projects = ["*"]  # project_id not used when using exported metadata
 
         for project in projects:
-            self.report.set_ingestion_stage(project, LINEAGE_EXTRACTION)
-            yield from self.generate_lineage(
-                project,
-                table_refs,
-            )
+            with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+                yield from self.generate_lineage(
+                    project,
+                    table_refs,
+                )
 
         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.
@@ -368,8 +368,8 @@ class BigqueryLineageExtractor:
         self.report.lineage_metadata_entries[project_id] = len(lineage)
         logger.info(f"Built lineage map containing {len(lineage)} entries.")
         logger.debug(f"lineage metadata is {lineage}")
-        self.report.lineage_extraction_sec[project_id] = round(
-            timer.elapsed_seconds(), 2
+        self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+            digits=2
         )
         self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
             memory_footprint.total_size(lineage)
@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
-        for audit_event in usage_state.standalone_events():
-            try:
-                operational_wu = self._create_operation_workunit(
-                    audit_event, table_refs
-                )
-                if operational_wu:
-                    yield operational_wu
-                    self.report.num_operational_stats_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate operation workunit",
-                    context=f"{audit_event}",
-                    exc=e,
-                )
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )
 
     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
-        top_n = (
-            self.config.usage.top_n_queries
-            if self.config.usage.include_top_n_queries
-            else 0
-        )
-        for entry in usage_state.usage_statistics(top_n=top_n):
-            try:
-                query_freq = [
-                    (
-                        self.uuid_to_query.get(
-                            query_hash, usage_state.queries[query_hash]
-                        ),
-                        count,
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                    for query_hash, count in entry.query_freq
-                ]
-                yield make_usage_workunit(
-                    bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                    resource=BigQueryTableRef.from_string_name(entry.resource),
-                    query_count=entry.query_count,
-                    query_freq=query_freq,
-                    user_freq=entry.user_freq,
-                    column_freq=entry.column_freq,
-                    bucket_duration=self.config.bucket_duration,
-                    resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    format_sql_queries=self.config.usage.format_sql_queries,
-                    queries_character_limit=self.config.usage.queries_character_limit,
-                )
-                self.report.num_usage_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate usage statistics workunit",
-                    context=f"{entry.timestamp}, {entry.resource}",
-                    exc=e,
-                )
 
     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:
@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.set_ingestion_stage(
-                        project_id, USAGE_EXTRACTION_INGESTION
-                    )
-                    yield from self._get_parsed_bigquery_log_events(project_id)
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(
@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                     self.report_status(f"usage-extraction-{project_id}", False)
 
-                self.report.usage_extraction_sec[project_id] = round(
-                    timer.elapsed_seconds(), 2
+                self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                    digits=2
                 )
 
     def _store_usage_event(
@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.set_ingestion_stage(keyspace_name, PROFILING)
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(
-                        self.generate_profile,
-                        keyspace_name,
-                        table_name,
-                        cassandra_data.columns.get(table_name, []),
-                    ): table_name
-                    for table_name in tables
-                }
-                for future in as_completed(future_to_dataset):
-                    table_name = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[table_name] += 1
-                        self.report.failure(
-                            message="Failed to profile for table",
-                            context=f"{keyspace_name}.{table_name}",
-                            exc=exc,
-                        )
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )
 
     def generate_profile(
         self,
@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
 
-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
@@ -108,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
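
The processor list now contains a conditional entry that is None unless the new drop_duplicate_schema_fields config flag is set. Presumably the framework skips None entries when chaining processors over the workunit stream; a rough sketch of that pattern, not the actual DataHub plumbing in datahub/ingestion/api/source.py:

# Hedged sketch: apply an ordered list of optional workunit processors,
# skipping None entries. Names here are illustrative stand-ins.
from typing import Callable, Iterable, List, Optional, TypeVar

T = TypeVar("T")


def apply_processors(
    stream: Iterable[T],
    processors: List[Optional[Callable[[Iterable[T]], Iterable[T]]]],
) -> Iterable[T]:
    # Chain processors left to right; a None entry means "feature disabled".
    for processor in processors:
        if processor is not None:
            stream = processor(stream)
    return stream
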
@@ -45,6 +45,3 @@ class DremioSourceReport(
             self.views_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
-
-    def set_ingestion_stage(self, dataset: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{dataset}: {stage}")
@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING)
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.report_ingestion_stage_start("Expired Token Cleanup")
-                self.revoke_expired_tokens()
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.report_ingestion_stage_start("Truncate Indices")
-                self.truncate_indices()
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start(
-                    "Soft Deleted Entities Cleanup"
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Data Process Cleanup")
-                yield from self.dataprocess_cleanup.get_workunits_internal()
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Execution request Cleanup")
-                self.execution_request_cleanup.run()
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-        30,
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
             logger.info(f"ergc({self.instance_id}): max runtime reached.")
             return True
         return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
         )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
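Taken together, the execution-request cleanup changes raise the default retention from 30 to 90 days, cap hard deletes per run via limit_entities_delete, and stop the cleanup loop when either the runtime or the delete limit is hit. A hedged example of the corresponding config fragment in dict form; the field names follow the diff above, but the values and any nesting under the datahub-gc source config are assumptions:

# Illustrative config fragment for the execution request cleanup task.
execution_request_cleanup = {
    "enabled": True,
    "keep_history_max_days": 90,      # new default (was 30)
    "runtime_limit_seconds": 3600,    # illustrative value
    "limit_entities_delete": 10000,   # new: cap on hard deletes per run
}
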
@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@ class SoftDeletedEntitiesCleanup:
                             "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
-                            "count": self.config.batch_size,
+                            "count": batch_size,
                             "orFilters": [
                                 {
                                     "and": [
@@ -263,6 +272,10 @@ class SoftDeletedEntitiesCleanup:
             scroll_across_entities = result.get("scrollAcrossEntities")
             if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
             for query in scroll_across_entities.get("searchResults"):
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
 
     folder_path_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Allow or deny dashboards from specific folders. "
+        description="Allow or deny dashboards from specific folders using their fully qualified paths. "
         "For example: \n"
         "deny: \n"
-        " - sales/deprecated \n"
-        "This pattern will deny the ingestion of all dashboards and looks within the sales/deprecated folder. \n"
+        " - Shared/deprecated \n"
+        "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+        "allow: \n"
+        " - Shared/sales \n"
+        "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+        "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+        "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
        "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
     )
 
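Since folder_path_pattern is an AllowDenyPattern, the fully qualified folder path is matched against regular-expression allow/deny lists. A small illustration of that matching behaviour; only AllowDenyPattern itself comes from DataHub, and the folder paths are taken from the examples in the description above:

# Illustration only: matching fully qualified Looker folder paths.
from datahub.configuration.common import AllowDenyPattern

pattern = AllowDenyPattern(allow=["Shared/sales.*"], deny=["Shared/deprecated.*"])

print(pattern.allowed("Shared/sales/Q4 Dashboards"))    # True
print(pattern.allowed("Shared/deprecated/Old Report"))  # False
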
@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")