acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2333 -2333
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +44 -44
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/graph/client.py +4 -2
- datahub/ingestion/source/aws/glue.py +14 -1
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/s3/source.py +10 -14
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/metadata/_schema_classes.py +2 -2
- datahub/metadata/_urns/urn_defs.py +15 -15
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0
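
Most of the hunks below replace paired report_ingestion_stage_start(...) calls with a report.new_stage(...) context manager, and record stage durations via timer.elapsed_seconds(digits=2). The following is a minimal illustrative sketch of that context-manager pattern, assuming a hypothetical StageTimingReport class; it only approximates the idea and is not the package's actual IngestionStageReport implementation.

```python
# Minimal sketch (not the package's actual IngestionStageReport): the hunks below
# replace paired report_ingestion_stage_start(...) calls with a context manager,
# roughly along these lines. StageTimingReport and its field name are assumptions.
import contextlib
import time
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageTimingReport:
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextlib.contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        # Time the wrapped block and record its duration under the stage name,
        # so the final stage is timed without needing a trailing "End" stage.
        start = time.perf_counter()
        try:
            yield
        finally:
            self.ingestion_stage_durations[stage] = round(
                time.perf_counter() - start, 2
            )


report = StageTimingReport()
with report.new_stage("project-1: Metadata Extraction"):
    pass  # emit workunits for this stage here
```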
@@ -248,9 +248,9 @@ class BigQuerySchemaGenerator:
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
+        with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+            logger.info(f"Processing project: {project.id}")
+            yield from self._process_project(project)
 
     def get_dataplatform_instance_aspect(
         self, dataset_urn: str, project_id: str
@@ -405,11 +405,11 @@ class BigQuerySchemaGenerator:
 
         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            self.report.
-
-
-
-
+            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+                yield from self.profiler.get_workunits(
+                    project_id=project_id,
+                    tables=db_tables,
+                )
 
     def _process_project_datasets(
         self,
@@ -1203,9 +1203,9 @@ class BigQuerySchemaGenerator:
                 report=self.report,
             )
 
-            self.report.metadata_extraction_sec[
-
-            )
+            self.report.metadata_extraction_sec[
+                f"{project_id}.{dataset.name}"
+            ] = timer.elapsed_seconds(digits=2)
 
     def get_core_table_details(
         self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str
@@ -330,11 +330,11 @@ class BigqueryLineageExtractor:
             projects = ["*"]  # project_id not used when using exported metadata
 
         for project in projects:
-            self.report.
-
-
-
-
+            with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+                yield from self.generate_lineage(
+                    project,
+                    table_refs,
+                )
 
         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.
@@ -368,8 +368,8 @@ class BigqueryLineageExtractor:
         self.report.lineage_metadata_entries[project_id] = len(lineage)
         logger.info(f"Built lineage map containing {len(lineage)} entries.")
         logger.debug(f"lineage metadata is {lineage}")
-        self.report.lineage_extraction_sec[project_id] =
-
+        self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+            digits=2
         )
         self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
             memory_footprint.total_size(lineage)
@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )
 
     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                    for query_hash, count in entry.query_freq
-                ]
-                yield make_usage_workunit(
-                    bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                    resource=BigQueryTableRef.from_string_name(entry.resource),
-                    query_count=entry.query_count,
-                    query_freq=query_freq,
-                    user_freq=entry.user_freq,
-                    column_freq=entry.column_freq,
-                    bucket_duration=self.config.bucket_duration,
-                    resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    format_sql_queries=self.config.usage.format_sql_queries,
-                    queries_character_limit=self.config.usage.queries_character_limit,
-                )
-                self.report.num_usage_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate usage statistics workunit",
-                    context=f"{entry.timestamp}, {entry.resource}",
-                    exc=e,
-                )
 
     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:
@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.
-                        project_id
-                    )
-
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(
@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                     self.report_status(f"usage-extraction-{project_id}", False)
 
-            self.report.usage_extraction_sec[project_id] =
-
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )
 
     def _store_usage_event(
@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
            tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )
 
     def generate_profile(
         self,
@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
 
-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
@@ -108,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.
-
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.
-
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.
-
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.
-
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.
-
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.
-
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
            logger.info(f"ergc({self.instance_id}): max runtime reached.")
            return True
        return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
|
if not self.config.enabled:
|
|
261
286
|
logger.info(
|
|
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
|
|
|
274
299
|
)
|
|
275
300
|
|
|
276
301
|
for entry in self._scroll_garbage_records():
|
|
277
|
-
if self._reached_runtime_limit():
|
|
302
|
+
if self._reached_runtime_limit() or self._reached_delete_limit():
|
|
278
303
|
break
|
|
279
304
|
self._delete_entry(entry)
|
|
280
305
|
|
|
@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@ class SoftDeletedEntitiesCleanup:
                             "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
-                            "count":
+                            "count": batch_size,
                             "orFilters": [
                                 {
                                     "and": [
@@ -263,6 +272,10 @@ class SoftDeletedEntitiesCleanup:
             scroll_across_entities = result.get("scrollAcrossEntities")
             if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
             for query in scroll_across_entities.get("searchResults"):
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
 
     folder_path_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Allow or deny dashboards from specific folders. "
+        description="Allow or deny dashboards from specific folders using their fully qualified paths. "
         "For example: \n"
         "deny: \n"
-        " -
-        "This pattern will deny the ingestion of all dashboards and looks within the
+        " - Shared/deprecated \n"
+        "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+        "allow: \n"
+        " - Shared/sales \n"
+        "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+        "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+        "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
         "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
     )
 
@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")