acryl-datahub 0.15.0.1rc11__py3-none-any.whl → 0.15.0.1rc13__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/METADATA +2320 -2324
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/RECORD +40 -39
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/configuration/common.py +2 -5
- datahub/emitter/mce_builder.py +17 -1
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +2 -2
- datahub/emitter/rest_emitter.py +2 -2
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +159 -71
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/sql/hive.py +15 -0
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +1 -4
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/metadata/_schema_classes.py +61 -1
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/schema.avsc +64 -29
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, FrozenSet, Iterable, Iterator, List, Optional
 
@@ -15,6 +15,7 @@ from google.cloud.bigquery.table import (
     TimePartitioningType,
 )
 
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels
@@ -393,13 +394,7 @@ class BigQuerySchemaApi:
             name=table.table_name,
             created=table.created,
             table_type=table.table_type,
-            last_altered=(
-                datetime.fromtimestamp(
-                    table.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if table.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(table.get("last_altered")),
             size_in_bytes=table.get("bytes"),
             rows_count=table.get("row_count"),
             comment=table.comment,
@@ -460,11 +455,7 @@ class BigQuerySchemaApi:
         return BigqueryView(
            name=view.table_name,
            created=view.created,
-           last_altered=(
-               datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
-               if view.get("last_altered") is not None
-               else None
-           ),
+           last_altered=(parse_ts_millis(view.get("last_altered"))),
            comment=view.comment,
            view_definition=view.view_definition,
            materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
@@ -705,13 +696,7 @@ class BigQuerySchemaApi:
         return BigqueryTableSnapshot(
            name=snapshot.table_name,
            created=snapshot.created,
-           last_altered=(
-               datetime.fromtimestamp(
-                   snapshot.get("last_altered") / 1000, tz=timezone.utc
-               )
-               if snapshot.get("last_altered") is not None
-               else None
-           ),
+           last_altered=parse_ts_millis(snapshot.get("last_altered")),
            comment=snapshot.comment,
            ddl=snapshot.ddl,
            snapshot_time=snapshot.snapshot_time,
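
The three call sites above repeated the same inline millisecond-to-datetime conversion; they now delegate to the new parse_ts_millis helper added to datahub.emitter.mce_builder (see the mce_builder.py +17 -1 entry in the file list). The helper's implementation is not shown in this diff; a minimal sketch of the behavior these call sites rely on:

```python
from datetime import datetime, timezone
from typing import Optional


def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
    """Convert an epoch timestamp in milliseconds to a timezone-aware UTC datetime.

    None passes through unchanged, so optional "last_altered" values can be
    forwarded directly.
    """
    if ts is None:
        return None
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)


print(parse_ts_millis(1_700_000_000_000))  # 2023-11-14 22:13:20+00:00
print(parse_ts_millis(None))               # None
```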

datahub/ingestion/source/datahub/datahub_kafka_reader.py

@@ -12,6 +12,7 @@ from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroDeserializer
 
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
@@ -92,7 +93,7 @@ class DataHubKafkaReader(Closeable):
                 if mcl.created and mcl.created.time > stop_time.timestamp() * 1000:
                     logger.info(
                         f"Stopped reading from kafka, reached MCL "
-                        f"with audit stamp {
+                        f"with audit stamp {parse_ts_millis(mcl.created.time)}"
                     )
                     break
 

datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -167,9 +167,11 @@ class DataJobEntity:
 class DataProcessCleanupReport(SourceReport):
     num_aspects_removed: int = 0
     num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
-
+    sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    num_data_flows_found: int = 0
+    num_data_jobs_found: int = 0
 
 
 class DataProcessCleanup:
@@ -265,13 +267,17 @@ class DataProcessCleanup:
                 self.report.report_failure(
                     f"Exception while deleting DPI: {e}", exc=e
                 )
-            if
+            if (
+                deleted_count_last_n % self.config.batch_size == 0
+                and deleted_count_last_n > 0
+            ):
                 logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                 if self.config.delay:
                     logger.info(f"Sleeping for {self.config.delay} seconds")
                     time.sleep(self.config.delay)
 
-
+        if deleted_count_last_n > 0:
+            logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
 
     def delete_entity(self, urn: str, type: str) -> None:
         assert self.ctx.graph
@@ -280,9 +286,9 @@ class DataProcessCleanup:
         self.report.num_aspect_removed_by_type[type] = (
             self.report.num_aspect_removed_by_type.get(type, 0) + 1
         )
-        if type not in self.report.
-            self.report.
-            self.report.
+        if type not in self.report.sample_soft_deleted_aspects_by_type:
+            self.report.sample_soft_deleted_aspects_by_type[type] = LossyList()
+        self.report.sample_soft_deleted_aspects_by_type[type].append(urn)
 
         if self.dry_run:
             logger.info(
@@ -351,7 +357,10 @@ class DataProcessCleanup:
             except Exception as e:
                 self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)
 
-            if
+            if (
+                deleted_count_retention % self.config.batch_size == 0
+                and deleted_count_retention > 0
+            ):
                 logger.info(
                     f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
                 )
@@ -393,6 +402,7 @@ class DataProcessCleanup:
         scrollAcrossEntities = result.get("scrollAcrossEntities")
         if not scrollAcrossEntities:
             raise ValueError("Missing scrollAcrossEntities in response")
+        self.report.num_data_flows_found += scrollAcrossEntities.get("count")
         logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")
 
         scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -415,8 +425,9 @@ class DataProcessCleanup:
         assert self.ctx.graph
 
         dataFlows: Dict[str, DataFlowEntity] = {}
-
-
+        if self.config.delete_empty_data_flows:
+            for flow in self.get_data_flows():
+                dataFlows[flow.urn] = flow
 
         scroll_id: Optional[str] = None
         previous_scroll_id: Optional[str] = None
@@ -443,6 +454,7 @@ class DataProcessCleanup:
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
 
+            self.report.num_data_jobs_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities")
 
             scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -481,7 +493,8 @@ class DataProcessCleanup:
 
             previous_scroll_id = scroll_id
 
-
+        if deleted_jobs > 0:
+            logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
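
Both cleanup sources page through DataHub search results with the scrollAcrossEntities GraphQL call, following nextScrollId until the server stops returning one. A simplified sketch of that pagination loop, assuming a graph client with the execute_graphql(query, variables) method used above and a caller-supplied GraphQL query string:

```python
from typing import Any, Dict, Iterable, Optional


def scroll_entity_urns(graph: Any, graphql_query: str, batch_size: int = 500) -> Iterable[str]:
    """Yield entity URNs page by page until no nextScrollId is returned."""
    scroll_id: Optional[str] = None
    while True:
        variables: Dict[str, Any] = {
            "input": {
                "types": ["QUERY"],     # entity types to scan; adjust as needed
                "query": "*",
                "scrollId": scroll_id,  # None on the first page
                "count": batch_size,
            }
        }
        result = graph.execute_graphql(graphql_query, variables)
        page = result.get("scrollAcrossEntities")
        if not page:
            break
        for hit in page.get("searchResults", []):
            yield hit["entity"]["urn"]
        scroll_id = page.get("nextScrollId")
        if not scroll_id:
            break
```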

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -1,9 +1,10 @@
 import logging
 import time
-from concurrent.futures import ThreadPoolExecutor,
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from
+from threading import Lock
+from typing import Dict, Iterable, List, Optional
 
 from pydantic import Field
 
@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
+QUERY_QUERY_ENTITY = """
+query listQueries($input: ScrollAcrossEntitiesInput!) {
+    scrollAcrossEntities(input: $input) {
+        nextScrollId
+        count
+        searchResults {
+            entity {
+                ... on QueryEntity {
+                    urn
+                }
+            }
+        }
+    }
+}
+"""
+
 
 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
         default=True, description="Whether to do soft deletion cleanup."
     )
-    retention_days:
+    retention_days: int = Field(
         10,
         description="Number of days to retain metadata in DataHub",
     )
@@ -62,23 +79,30 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         default=None,
         description="Query to filter entities",
     )
+
     limit_entities_delete: Optional[int] = Field(
         25000, description="Max number of entities to delete."
     )
 
-
-
+    futures_max_at_time: int = Field(
+        1000, description="Max number of futures to have at a time."
+    )
+
+    runtime_limit_seconds: int = Field(
+        7200,  # 2 hours by default
         description="Runtime limit in seconds",
     )
 
 
 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-
-
-
-
-
+    num_queries_found: int = 0
+    num_soft_deleted_entity_processed: int = 0
+    num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_entity_removal_started: int = 0
+    num_hard_deleted: int = 0
+    num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+    sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
 
@@ -103,48 +127,53 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
        self.dry_run = dry_run
+        self.start_time = 0.0
+        self._report_lock: Lock = Lock()
+        self.last_print_time = 0.0
+
+    def _increment_retained_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age += 1
+
+    def _increment_removal_started_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_entity_removal_started += 1
+
+    def _update_report(self, urn: str, entity_type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_hard_deleted += 1
+
+            current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+            self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+            if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                self.report.sample_hard_deleted_aspects_by_type[
+                    entity_type
+                ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
         entity_urn = Urn.from_string(urn)
-        self.report.num_soft_deleted_entity_removed += 1
-        self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-            self.report.num_soft_deleted_entity_removed_by_type.get(
-                entity_urn.entity_type, 0
-            )
-            + 1
-        )
-        if (
-            entity_urn.entity_type
-            not in self.report.sample_soft_deleted_removed_aspects_by_type
-        ):
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ] = LossyList()
-        self.report.sample_soft_deleted_removed_aspects_by_type[
-            entity_urn.entity_type
-        ].append(urn)
-
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
-
+        self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
             urn=urn,
             dry_run=False,
         )
+        self._update_report(urn, entity_urn.entity_type)
 
     def delete_soft_deleted_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
-        if self.config.retention_days is None:
-            logger.info("Retention days is not set, skipping soft delete cleanup")
-            return
-
         retention_time = (
             int(datetime.now(timezone.utc).timestamp())
             - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +186,85 @@ class SoftDeletedEntitiesCleanup:
         ]["created"]["time"] < (retention_time * 1000):
             logger.debug(f"Hard deleting {urn}")
             self.delete_entity(urn)
+        else:
+            self._increment_retained_count()
+
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
+    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+        futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+        for future in done:
+            self._print_report()
+            if future.exception():
+                logger.error(
+                    f"Failed to delete entity {futures[future]}: {future.exception()}"
+                )
+                self.report.failure(
+                    f"Failed to delete entity {futures[future]}",
+                    exc=future.exception(),
+                )
+            self.report.num_soft_deleted_entity_processed += 1
+            if (
+                self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                == 0
+            ):
+                if self.config.delay:
+                    logger.debug(
+                        f"Sleeping for {self.config.delay} seconds before further processing batch"
+                    )
+                    time.sleep(self.config.delay)
+        return futures
 
-    def
-        if not self.config.enabled:
-            return
+    def _get_soft_deleted_queries(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
+        scroll_id: Optional[str] = None
+        while True:
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    QUERY_QUERY_ENTITY,
+                    {
+                        "input": {
+                            "types": ["QUERY"],
+                            "query": "*",
+                            "scrollId": scroll_id if scroll_id else None,
+                            "count": self.config.batch_size,
+                            "orFilters": [
+                                {
+                                    "and": [
+                                        {
+                                            "field": "removed",
+                                            "values": ["true"],
+                                            "condition": "EQUAL",
+                                        }
+                                    ]
+                                }
+                            ],
+                        }
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get queries with {scroll_id}", exc=e
+                )
+                break
+            scroll_across_entities = result.get("scrollAcrossEntities")
+            if not scroll_across_entities:
+                break
+            scroll_id = scroll_across_entities.get("nextScrollId")
+            self.report.num_queries_found += scroll_across_entities.get("count")
+            for query in scroll_across_entities.get("searchResults"):
+                yield query["entity"]["urn"]
+
+    def _get_urns(self) -> Iterable[str]:
+        assert self.ctx.graph
+        yield from self.ctx.graph.get_urns_by_filter(
             entity_types=self.config.entity_types,
             platform=self.config.platform,
             env=self.config.env,
@@ -173,52 +272,41 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
+        yield from self._get_soft_deleted_queries()
+
+    def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
+        self.start_time = time.time()
 
-        futures =
+        futures: Dict[Future, str] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-
-
-
+            for urn in self._get_urns():
+                self._print_report()
+                while len(futures) >= self.config.futures_max_at_time:
+                    futures = self._process_futures(futures)
                 if (
                     self.config.limit_entities_delete
-                    and
+                    and self.report.num_hard_deleted > self.config.limit_entities_delete
                 ):
                     logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached.
+                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
                    )
                    break
                if (
                    self.config.runtime_limit_seconds
-                    and time.time() -
+                    and time.time() - self.start_time
+                    > self.config.runtime_limit_seconds
                ):
                    logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached.
+                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
                    )
                    break
 
                future = executor.submit(self.delete_soft_deleted_entity, urn)
                futures[future] = urn
 
-
-
-
-
-                    logger.error(
-                        f"Failed to delete entity {futures[future]}: {future.exception()}"
-                    )
-                    self.report.failure(
-                        f"Failed to delete entity {futures[future]}",
-                        exc=future.exception(),
-                    )
-                deleted_count_retention += 1
-
-                if deleted_count_retention % self.config.batch_size == 0:
-                    logger.info(
-                        f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                    )
-
-                if self.config.delay:
-                    logger.debug(
-                        f"Sleeping for {self.config.delay} seconds before getting next batch"
-                    )
-                    time.sleep(self.config.delay)
+            logger.info(f"Waiting for {len(futures)} futures to complete")
+            while len(futures) > 0:
+                self._print_report()
+                futures = self._process_futures(futures)
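
The rewritten soft-delete cleanup keeps at most futures_max_at_time deletions in flight: it submits work to a ThreadPoolExecutor and, whenever the cap is reached, blocks on concurrent.futures.wait(..., return_when=FIRST_COMPLETED) before submitting more. A self-contained sketch of that back-pressure pattern (the worker function and limits below are placeholders, not DataHub code):

```python
import time
from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
from typing import Dict, List


def delete_one(urn: str) -> str:
    """Placeholder for the real per-entity deletion call."""
    time.sleep(0.01)
    return urn


def run_bounded(urns: List[str], max_workers: int = 4, max_in_flight: int = 16) -> int:
    deleted = 0
    futures: Dict[Future, str] = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for urn in urns:
            # Back-pressure: wait until at least one in-flight future finishes.
            while len(futures) >= max_in_flight:
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                for future in done:
                    if future.exception() is None:
                        deleted += 1
                    futures.pop(future)
            futures[executor.submit(delete_one, urn)] = urn
        # Drain whatever is still running.
        while futures:
            done, _ = wait(futures, return_when=FIRST_COMPLETED)
            for future in done:
                if future.exception() is None:
                    deleted += 1
                futures.pop(future)
    return deleted


print(run_bounded([f"urn:li:example:{i}" for i in range(100)]))  # 100
```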

datahub/ingestion/source/s3/source.py

@@ -225,7 +225,7 @@ class S3Source(StatefulIngestionSourceBase):
         self.init_spark()
 
     def init_spark(self):
-        os.environ.setdefault("SPARK_VERSION", "3.
+        os.environ.setdefault("SPARK_VERSION", "3.5")
         spark_version = os.environ["SPARK_VERSION"]
 
         # Importing here to avoid Deequ dependency for non profiling use cases
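
The profiling setup now defaults the SPARK_VERSION hint to 3.5. Because it uses os.environ.setdefault, the value is only written when the variable is not already set, so an explicit environment override still wins:

```python
import os

os.environ.pop("SPARK_VERSION", None)
os.environ.setdefault("SPARK_VERSION", "3.5")
print(os.environ["SPARK_VERSION"])  # 3.5

os.environ["SPARK_VERSION"] = "3.3"        # user-provided override
os.environ.setdefault("SPARK_VERSION", "3.5")
print(os.environ["SPARK_VERSION"])  # 3.3 -- setdefault does not overwrite
```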

datahub/ingestion/source/sql/hive.py

@@ -838,3 +838,18 @@ class HiveSource(TwoTierSQLAlchemySource):
             entityUrn=dataset_urn,
             aspect=view_properties_aspect,
         ).as_workunit()
+
+        if view_definition and self.config.include_view_lineage:
+            default_db = None
+            default_schema = None
+            try:
+                default_db, default_schema = self.get_db_schema(dataset_name)
+            except ValueError:
+                logger.warning(f"Invalid view identifier: {dataset_name}")
+
+            self.aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=view_definition,
+                default_db=default_db,
+                default_schema=default_schema,
+            )
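
The Hive source now forwards each view's SQL, together with best-effort default database and schema values, to its SQL-parsing aggregator so view lineage can be derived from the view definition. A hedged sketch of the call shape using a stand-in recorder (the real self.aggregator is DataHub's SQL parsing aggregator, which is not part of this diff):

```python
from typing import Dict, Optional, Tuple


class StubAggregator:
    """Stand-in that only mirrors the add_view_definition call shape used above."""

    def __init__(self) -> None:
        self.views: Dict[str, Tuple[str, Optional[str], Optional[str]]] = {}

    def add_view_definition(
        self,
        view_urn: str,
        view_definition: str,
        default_db: Optional[str] = None,
        default_schema: Optional[str] = None,
    ) -> None:
        # Which of default_db / default_schema is populated depends on the source.
        self.views[view_urn] = (view_definition, default_db, default_schema)


aggregator = StubAggregator()
aggregator.add_view_definition(
    view_urn="urn:li:dataset:(urn:li:dataPlatform:hive,mydb.my_view,PROD)",
    view_definition="CREATE VIEW my_view AS SELECT id, name FROM mydb.users",
    default_db=None,
    default_schema="mydb",
)
print(aggregator.views)
```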

datahub/ingestion/source/sql/hive_metastore.py

@@ -123,6 +123,10 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )
 
+    include_view_lineage: bool = Field(
+        default=False, description="", hidden_from_docs=True
+    )
+
     include_catalog_name_in_ids: bool = Field(
         default=False,
         description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
@@ -160,6 +164,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
+@capability(
+    SourceCapability.LINEAGE_COARSE, "View lineage is not supported", supported=False
+)
 class HiveMetastoreSource(SQLAlchemySource):
     """
     This plugin extracts the following:

datahub/ingestion/source/sql/mssql/source.py

@@ -724,7 +724,7 @@ class SQLServerSource(SQLAlchemySource):
         ):
             yield from auto_workunit(
                 generate_procedure_lineage(
-                    schema_resolver=self.
+                    schema_resolver=self.get_schema_resolver(),
                     procedure=procedure,
                     procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
                     is_temp_table=self.is_temp_table,