acryl-datahub 0.15.0.1rc12__py3-none-any.whl → 0.15.0.1rc13__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/METADATA +2563 -2563
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/RECORD +14 -13
- datahub/__init__.py +1 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/gc/dataprocess_cleanup.py +4 -4
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +159 -71
- datahub/metadata/_schema_classes.py +61 -1
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/schema.avsc +64 -29
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/RECORD CHANGED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=kKM5imQ7UziWDSMvn1Ic5ZENvcshwalM2y2qGjZxUHY,577
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -169,7 +169,7 @@ datahub/ingestion/graph/config.py,sha256=3b_Gxa5wcBnphP63bBiAFdWS7PJhUHRE1WZL_q4
 datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
 datahub/ingestion/graph/filters.py,sha256=UeUZQHoimavIYx-jXLA0WGkOUe10TaO8uEZkfa-QgNE,6188
 datahub/ingestion/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py,sha256=
+datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py,sha256=O2SGDU2_qMtyr_1BH9-WkNOojFWig2z4O3M21nTRo70,9908
 datahub/ingestion/reporting/file_reporter.py,sha256=tiWukmMxHrTQI3rOAumsq6lRlw8T6spqpS6XBDYnrZU,1640
 datahub/ingestion/reporting/reporting_provider_registry.py,sha256=jTYSh3T4sensjnHQfPLiIcbA2dG8w0px9ghChAJjGdU,310
 datahub/ingestion/run/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -303,9 +303,9 @@ datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP
 datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
-datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=
+datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
 datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
-datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=
+datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=LvDGTaAaI-T0OZ3fkaFwipLdzPePunuSVWoEuSBsfEM,11099
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
 datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -566,8 +566,8 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
 datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
 datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
 datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
-datahub/metadata/_schema_classes.py,sha256=
-datahub/metadata/schema.avsc,sha256=
+datahub/metadata/_schema_classes.py,sha256=IAWpWPxOeGmvmc96dapE0CySk1Rikbh-YieT-K9YTMY,964636
+datahub/metadata/schema.avsc,sha256=CeVb_Z7k0e5kmeqDUXUW7JDL6KSKBCdfAZzqRI_mLZo,729869
 datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
 datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
 datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -581,7 +581,7 @@ datahub/metadata/com/linkedin/pegasus2avro/access/token/__init__.py,sha256=P9M7N
 datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py,sha256=PgK5O-6pVRaEcvmwXAsSkwRLe8NjGiLH8AVBXeArqK8,5751
 datahub/metadata/com/linkedin/pegasus2avro/businessattribute/__init__.py,sha256=N8kO-eUi0_Rt7weizIExxlnJ2_kZRtPrZLWCC1xtDMA,653
 datahub/metadata/com/linkedin/pegasus2avro/chart/__init__.py,sha256=RNyyHLBNp_fxgFcBOLWO2UsXR1ofD_JczcBdPEQSusg,848
-datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py,sha256=
+datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py,sha256=ukX0VnveTrMx9G6uDaTkuk4Z2kxXr2hUK8srZuRPxj0,5520
 datahub/metadata/com/linkedin/pegasus2avro/common/fieldtransformer/__init__.py,sha256=FN63vLiB3FCmIRqBjTA-0Xt7M6i7h5NhaVzbA1ysv18,396
 datahub/metadata/com/linkedin/pegasus2avro/connection/__init__.py,sha256=qRtw-dB14pzVzgQ0pDK8kyBplNdpRxVKNj4D70e_FqI,564
 datahub/metadata/com/linkedin/pegasus2avro/container/__init__.py,sha256=3yWt36KqDKFhRc9pzvt0AMnbMTlhKurGvT3BUvc25QU,510
@@ -705,7 +705,7 @@ datahub/metadata/schemas/DataHubViewInfo.avsc,sha256=U3fBIoG9ietLUpOknfQGNekqBdP
 datahub/metadata/schemas/DataHubViewKey.avsc,sha256=p53axIdSVbubo3r23Vpsed7NqRcQBMGveVikEHAVAok,424
 datahub/metadata/schemas/DataJobInfo.avsc,sha256=--obUbt_4X2paB39EeRKP13sBSiK-r0nq070EamoV1w,7212
 datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=H1O8eAzZV34tvULdu67iBSWkdn08rt7wS208b8Nisbk,15268
-datahub/metadata/schemas/DataJobKey.avsc,sha256=
+datahub/metadata/schemas/DataJobKey.avsc,sha256=4F3myS-O6n7AlUqTvCkMSFvsYAjVhUq6uaQVbqLoYdM,1583
 datahub/metadata/schemas/DataPlatformInfo.avsc,sha256=WGPFumBNHbR75vsLrivnRCbBc8vSCuxDw2UlylMieh4,2686
 datahub/metadata/schemas/DataPlatformInstance.avsc,sha256=SNd3v_YyyLaDflv8Rd5cQR9GrVuky_cDTkYM6FqJiM8,1058
 datahub/metadata/schemas/DataPlatformInstanceKey.avsc,sha256=sXUV5EMT6N-x8d6s8ebcJ5JdFIOsJCtiiU5Jtm-ncIk,800
@@ -721,6 +721,7 @@ datahub/metadata/schemas/DataProcessInstanceRunEvent.avsc,sha256=zwTYULEnpMbqwkL
 datahub/metadata/schemas/DataProcessKey.avsc,sha256=mY1BDiEYo8RchI9DckQEz9Vks5Ibt2RdWZU8OYGnrHA,2240
 datahub/metadata/schemas/DataProductKey.avsc,sha256=tcdQNWk3pLA3xZzOnHvZuq2u4SQuk2YcAlsxE8CcEeU,621
 datahub/metadata/schemas/DataProductProperties.avsc,sha256=nYEK6JgpTprU0iZaqWLZsBGYJLkh6HCi1qCu-wbYhvM,6925
+datahub/metadata/schemas/DataTransformLogic.avsc,sha256=wDng1GK9znVoK0INHGiSCSa-AH5MrDkVdMzz4wOWmrY,2011
 datahub/metadata/schemas/DataTypeInfo.avsc,sha256=MCjzal71P8uIXZg161LrU8rZTJocZeizK-YxYA0Det0,704
 datahub/metadata/schemas/DataTypeKey.avsc,sha256=Gs5uc_azwg10e36ZbwDTFQMevr0IfiFvJoEGHRzEilw,546
 datahub/metadata/schemas/DatahubIngestionCheckpoint.avsc,sha256=m2Zyrx3ZWDc5gHuwbmBSRJ3JN4NFkpUhDEKM2Yeuqrw,5681
@@ -982,8 +983,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
+acryl_datahub-0.15.0.1rc13.dist-info/METADATA,sha256=KnCOYV5Kg855hgL3B3zmYHzPnXVeMoZYf_3ScEj1cyA,173444
+acryl_datahub-0.15.0.1rc13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0.1rc13.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc13.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc13.dist-info/RECORD,,
datahub/__init__.py CHANGED
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py CHANGED

@@ -146,12 +146,55 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
             aspect_value=source_info_aspect,
         )
 
+    @staticmethod
+    def _convert_sets_to_lists(obj: Any) -> Any:
+        """
+        Recursively converts all sets to lists in a Python object.
+        Works with nested dictionaries, lists, and sets.
+
+        Args:
+            obj: Any Python object that might contain sets
+
+        Returns:
+            The object with all sets converted to lists
+        """
+        if isinstance(obj, dict):
+            return {
+                key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
+                for key, value in obj.items()
+            }
+        elif isinstance(obj, list):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, set):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, tuple):
+            return tuple(
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            )
+        else:
+            return obj
+
     def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
         assert ctx.pipeline_config
         if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
             return ""
         else:
-
+            redacted_recipe = redact_raw_config(ctx.pipeline_config.get_raw_dict())
+            # This is required otherwise json dumps will fail
+            # with a TypeError: Object of type set is not JSON serializable
+            converted_recipe = (
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(
+                    redacted_recipe
+                )
+            )
+            return json.dumps(converted_recipe)
 
     def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
         self.sink.write_record_async(
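The helper added above guards against json.dumps failing with "TypeError: Object of type set is not JSON serializable" when the redacted recipe dictionary contains sets. A minimal standalone sketch of the same conversion (illustrative only, not the DataHub class above; the sample recipe fragment is made up):

    import json
    from typing import Any

    def convert_sets_to_lists(obj: Any) -> Any:
        # Recursively replace sets with lists so the structure becomes JSON-serializable.
        if isinstance(obj, dict):
            return {key: convert_sets_to_lists(value) for key, value in obj.items()}
        if isinstance(obj, (list, set)):
            return [convert_sets_to_lists(value) for value in obj]
        if isinstance(obj, tuple):
            return tuple(convert_sets_to_lists(value) for value in obj)
        return obj

    # Hypothetical recipe fragment; real recipes come from ctx.pipeline_config.get_raw_dict().
    recipe = {"source": {"config": {"schema_pattern": {"allow": {"sales", "finance"}}}}}
    try:
        json.dumps(recipe)
    except TypeError as error:
        print(error)  # Object of type set is not JSON serializable
    print(json.dumps(convert_sets_to_lists(recipe)))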
datahub/ingestion/source/gc/dataprocess_cleanup.py CHANGED

@@ -167,7 +167,7 @@ class DataJobEntity:
 class DataProcessCleanupReport(SourceReport):
     num_aspects_removed: int = 0
     num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
-
+    sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
     num_data_flows_found: int = 0
@@ -286,9 +286,9 @@ class DataProcessCleanup:
                 self.report.num_aspect_removed_by_type[type] = (
                     self.report.num_aspect_removed_by_type.get(type, 0) + 1
                 )
-                if type not in self.report.
-                    self.report.
-                self.report.
+                if type not in self.report.sample_soft_deleted_aspects_by_type:
+                    self.report.sample_soft_deleted_aspects_by_type[type] = LossyList()
+                self.report.sample_soft_deleted_aspects_by_type[type].append(urn)
 
                 if self.dry_run:
                     logger.info(
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED

@@ -1,9 +1,10 @@
 import logging
 import time
-from concurrent.futures import ThreadPoolExecutor,
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from
+from threading import Lock
+from typing import Dict, Iterable, List, Optional
 
 from pydantic import Field
 
@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
+QUERY_QUERY_ENTITY = """
+query listQueries($input: ScrollAcrossEntitiesInput!) {
+  scrollAcrossEntities(input: $input) {
+    nextScrollId
+    count
+    searchResults {
+      entity {
+        ... on QueryEntity {
+          urn
+        }
+      }
+    }
+  }
+}
+"""
+
 
 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
         default=True, description="Whether to do soft deletion cleanup."
     )
-    retention_days:
+    retention_days: int = Field(
         10,
         description="Number of days to retain metadata in DataHub",
     )
@@ -62,23 +79,30 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         default=None,
         description="Query to filter entities",
     )
+
     limit_entities_delete: Optional[int] = Field(
         25000, description="Max number of entities to delete."
     )
 
-
-
+    futures_max_at_time: int = Field(
+        1000, description="Max number of futures to have at a time."
+    )
+
+    runtime_limit_seconds: int = Field(
+        7200,  # 2 hours by default
         description="Runtime limit in seconds",
     )
 
 
 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-
-
-
-
-
+    num_queries_found: int = 0
+    num_soft_deleted_entity_processed: int = 0
+    num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_entity_removal_started: int = 0
+    num_hard_deleted: int = 0
+    num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+    sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
 
@@ -103,48 +127,53 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
+        self.start_time = 0.0
+        self._report_lock: Lock = Lock()
+        self.last_print_time = 0.0
+
+    def _increment_retained_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age += 1
+
+    def _increment_removal_started_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_entity_removal_started += 1
+
+    def _update_report(self, urn: str, entity_type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_hard_deleted += 1
+
+            current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+            self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+            if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                self.report.sample_hard_deleted_aspects_by_type[
+                    entity_type
+                ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
         entity_urn = Urn.from_string(urn)
-        self.report.num_soft_deleted_entity_removed += 1
-        self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-            self.report.num_soft_deleted_entity_removed_by_type.get(
-                entity_urn.entity_type, 0
-            )
-            + 1
-        )
-        if (
-            entity_urn.entity_type
-            not in self.report.sample_soft_deleted_removed_aspects_by_type
-        ):
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ] = LossyList()
-        self.report.sample_soft_deleted_removed_aspects_by_type[
-            entity_urn.entity_type
-        ].append(urn)
-
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
-
+        self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
             urn=urn,
             dry_run=False,
         )
+        self._update_report(urn, entity_urn.entity_type)
 
     def delete_soft_deleted_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
-        if self.config.retention_days is None:
-            logger.info("Retention days is not set, skipping soft delete cleanup")
-            return
-
         retention_time = (
             int(datetime.now(timezone.utc).timestamp())
             - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +186,85 @@
         ]["created"]["time"] < (retention_time * 1000):
             logger.debug(f"Hard deleting {urn}")
             self.delete_entity(urn)
+        else:
+            self._increment_retained_count()
+
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
+    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+        futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+        for future in done:
+            self._print_report()
+            if future.exception():
+                logger.error(
+                    f"Failed to delete entity {futures[future]}: {future.exception()}"
+                )
+                self.report.failure(
+                    f"Failed to delete entity {futures[future]}",
+                    exc=future.exception(),
+                )
+            self.report.num_soft_deleted_entity_processed += 1
+            if (
+                self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                == 0
+            ):
+                if self.config.delay:
+                    logger.debug(
+                        f"Sleeping for {self.config.delay} seconds before further processing batch"
+                    )
+                    time.sleep(self.config.delay)
+        return futures
 
-    def
-        if not self.config.enabled:
-            return
+    def _get_soft_deleted_queries(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
+        scroll_id: Optional[str] = None
+        while True:
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    QUERY_QUERY_ENTITY,
+                    {
+                        "input": {
+                            "types": ["QUERY"],
+                            "query": "*",
+                            "scrollId": scroll_id if scroll_id else None,
+                            "count": self.config.batch_size,
+                            "orFilters": [
+                                {
+                                    "and": [
+                                        {
+                                            "field": "removed",
+                                            "values": ["true"],
+                                            "condition": "EQUAL",
+                                        }
+                                    ]
+                                }
+                            ],
+                        }
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get queries with {scroll_id}", exc=e
+                )
+                break
+            scroll_across_entities = result.get("scrollAcrossEntities")
+            if not scroll_across_entities:
+                break
+            scroll_id = scroll_across_entities.get("nextScrollId")
+            self.report.num_queries_found += scroll_across_entities.get("count")
+            for query in scroll_across_entities.get("searchResults"):
+                yield query["entity"]["urn"]
+
+    def _get_urns(self) -> Iterable[str]:
+        assert self.ctx.graph
+        yield from self.ctx.graph.get_urns_by_filter(
             entity_types=self.config.entity_types,
             platform=self.config.platform,
             env=self.config.env,
@@ -173,52 +272,41 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
+        yield from self._get_soft_deleted_queries()
+
+    def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
+        self.start_time = time.time()
 
-        futures =
+        futures: Dict[Future, str] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-
-
-
+            for urn in self._get_urns():
+                self._print_report()
+                while len(futures) >= self.config.futures_max_at_time:
+                    futures = self._process_futures(futures)
                 if (
                     self.config.limit_entities_delete
-                    and
+                    and self.report.num_hard_deleted > self.config.limit_entities_delete
                 ):
                     logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached.
+                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
                     )
                     break
                 if (
                     self.config.runtime_limit_seconds
-                    and time.time() -
+                    and time.time() - self.start_time
+                    > self.config.runtime_limit_seconds
                 ):
                     logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached.
+                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
                    )
                     break
 
                 future = executor.submit(self.delete_soft_deleted_entity, urn)
                 futures[future] = urn
 
-
-
-
-
-                        logger.error(
-                            f"Failed to delete entity {futures[future]}: {future.exception()}"
-                        )
-                        self.report.failure(
-                            f"Failed to delete entity {futures[future]}",
-                            exc=future.exception(),
-                        )
-                    deleted_count_retention += 1
-
-                    if deleted_count_retention % self.config.batch_size == 0:
-                        logger.info(
-                            f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                        )
-
-                    if self.config.delay:
-                        logger.debug(
-                            f"Sleeping for {self.config.delay} seconds before getting next batch"
-                        )
-                        time.sleep(self.config.delay)
+        logger.info(f"Waiting for {len(futures)} futures to complete")
+        while len(futures) > 0:
+            self._print_report()
+            futures = self._process_futures(futures)
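The rewritten cleanup caps the number of in-flight deletion futures (futures_max_at_time) and drains them with wait(..., return_when=FIRST_COMPLETED) instead of collecting every future before inspecting results. A small self-contained sketch of that bounded-futures pattern, with illustrative names rather than the DataHub implementation:

    from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
    from typing import Dict

    MAX_IN_FLIGHT = 4  # plays the role of futures_max_at_time

    def delete(item: int) -> int:
        # Stand-in for the per-URN hard-delete call.
        return item * item

    def drain(futures: Dict[Future, int]) -> Dict[Future, int]:
        # Block until at least one future finishes, log failures, keep the rest.
        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
        for future in done:
            if future.exception():
                print(f"item {futures[future]} failed: {future.exception()}")
        return {future: item for future, item in futures.items() if future in not_done}

    futures: Dict[Future, int] = {}
    with ThreadPoolExecutor(max_workers=2) as executor:
        for item in range(20):
            while len(futures) >= MAX_IN_FLIGHT:
                futures = drain(futures)
            futures[executor.submit(delete, item)] = item
        # Same shape as the final loop above: wait for the stragglers.
        while futures:
            futures = drain(futures)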
|
@@ -4053,6 +4053,60 @@ class DataPlatformInstanceClass(_Aspect):
|
|
|
4053
4053
|
self._inner_dict['instance'] = value
|
|
4054
4054
|
|
|
4055
4055
|
|
|
4056
|
+
class DataTransformClass(DictWrapper):
|
|
4057
|
+
"""Information about a transformation. It may be a query,"""
|
|
4058
|
+
|
|
4059
|
+
RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransform")
|
|
4060
|
+
def __init__(self,
|
|
4061
|
+
queryStatement: Union[None, "QueryStatementClass"]=None,
|
|
4062
|
+
):
|
|
4063
|
+
super().__init__()
|
|
4064
|
+
|
|
4065
|
+
self.queryStatement = queryStatement
|
|
4066
|
+
|
|
4067
|
+
def _restore_defaults(self) -> None:
|
|
4068
|
+
self.queryStatement = self.RECORD_SCHEMA.fields_dict["queryStatement"].default
|
|
4069
|
+
|
|
4070
|
+
|
|
4071
|
+
@property
|
|
4072
|
+
def queryStatement(self) -> Union[None, "QueryStatementClass"]:
|
|
4073
|
+
"""The data transform may be defined by a query statement"""
|
|
4074
|
+
return self._inner_dict.get('queryStatement') # type: ignore
|
|
4075
|
+
|
|
4076
|
+
@queryStatement.setter
|
|
4077
|
+
def queryStatement(self, value: Union[None, "QueryStatementClass"]) -> None:
|
|
4078
|
+
self._inner_dict['queryStatement'] = value
|
|
4079
|
+
|
|
4080
|
+
|
|
4081
|
+
class DataTransformLogicClass(_Aspect):
|
|
4082
|
+
"""Information about a Query against one or more data assets (e.g. Tables or Views)."""
|
|
4083
|
+
|
|
4084
|
+
|
|
4085
|
+
ASPECT_NAME = 'dataTransformLogic'
|
|
4086
|
+
ASPECT_INFO = {}
|
|
4087
|
+
RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransformLogic")
|
|
4088
|
+
|
|
4089
|
+
def __init__(self,
|
|
4090
|
+
transforms: List["DataTransformClass"],
|
|
4091
|
+
):
|
|
4092
|
+
super().__init__()
|
|
4093
|
+
|
|
4094
|
+
self.transforms = transforms
|
|
4095
|
+
|
|
4096
|
+
def _restore_defaults(self) -> None:
|
|
4097
|
+
self.transforms = list()
|
|
4098
|
+
|
|
4099
|
+
|
|
4100
|
+
@property
|
|
4101
|
+
def transforms(self) -> List["DataTransformClass"]:
|
|
4102
|
+
"""List of transformations applied"""
|
|
4103
|
+
return self._inner_dict.get('transforms') # type: ignore
|
|
4104
|
+
|
|
4105
|
+
@transforms.setter
|
|
4106
|
+
def transforms(self, value: List["DataTransformClass"]) -> None:
|
|
4107
|
+
self._inner_dict['transforms'] = value
|
|
4108
|
+
|
|
4109
|
+
|
|
4056
4110
|
class DeprecationClass(_Aspect):
|
|
4057
4111
|
"""Deprecation status of an entity"""
|
|
4058
4112
|
|
|
@@ -14624,7 +14678,7 @@ class DataJobKeyClass(_Aspect):
|
|
|
14624
14678
|
|
|
14625
14679
|
|
|
14626
14680
|
ASPECT_NAME = 'dataJobKey'
|
|
14627
|
-
ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults']}
|
|
14681
|
+
ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
|
|
14628
14682
|
RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")
|
|
14629
14683
|
|
|
14630
14684
|
def __init__(self,
|
|
@@ -24715,6 +24769,8 @@ __SCHEMA_TYPES = {
|
|
|
24715
24769
|
'com.linkedin.pegasus2avro.common.CostCostDiscriminator': CostCostDiscriminatorClass,
|
|
24716
24770
|
'com.linkedin.pegasus2avro.common.CostType': CostTypeClass,
|
|
24717
24771
|
'com.linkedin.pegasus2avro.common.DataPlatformInstance': DataPlatformInstanceClass,
|
|
24772
|
+
'com.linkedin.pegasus2avro.common.DataTransform': DataTransformClass,
|
|
24773
|
+
'com.linkedin.pegasus2avro.common.DataTransformLogic': DataTransformLogicClass,
|
|
24718
24774
|
'com.linkedin.pegasus2avro.common.Deprecation': DeprecationClass,
|
|
24719
24775
|
'com.linkedin.pegasus2avro.common.Documentation': DocumentationClass,
|
|
24720
24776
|
'com.linkedin.pegasus2avro.common.DocumentationAssociation': DocumentationAssociationClass,
|
|
@@ -25182,6 +25238,8 @@ __SCHEMA_TYPES = {
|
|
|
25182
25238
|
'CostCostDiscriminator': CostCostDiscriminatorClass,
|
|
25183
25239
|
'CostType': CostTypeClass,
|
|
25184
25240
|
'DataPlatformInstance': DataPlatformInstanceClass,
|
|
25241
|
+
'DataTransform': DataTransformClass,
|
|
25242
|
+
'DataTransformLogic': DataTransformLogicClass,
|
|
25185
25243
|
'Deprecation': DeprecationClass,
|
|
25186
25244
|
'Documentation': DocumentationClass,
|
|
25187
25245
|
'DocumentationAssociation': DocumentationAssociationClass,
|
|
@@ -25588,6 +25646,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
|
|
|
25588
25646
|
CostClass,
|
|
25589
25647
|
BrowsePathsClass,
|
|
25590
25648
|
InstitutionalMemoryClass,
|
|
25649
|
+
DataTransformLogicClass,
|
|
25591
25650
|
SubTypesClass,
|
|
25592
25651
|
FormsClass,
|
|
25593
25652
|
DeprecationClass,
|
|
@@ -25802,6 +25861,7 @@ class AspectBag(TypedDict, total=False):
|
|
|
25802
25861
|
cost: CostClass
|
|
25803
25862
|
browsePaths: BrowsePathsClass
|
|
25804
25863
|
institutionalMemory: InstitutionalMemoryClass
|
|
25864
|
+
dataTransformLogic: DataTransformLogicClass
|
|
25805
25865
|
subTypes: SubTypesClass
|
|
25806
25866
|
forms: FormsClass
|
|
25807
25867
|
deprecation: DeprecationClass
|
|
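A sketch of how the two new classes compose into the dataTransformLogic aspect. Only the constructors shown in the diff are taken as given; the QueryStatementClass signature, the datahub.metadata.schema_classes import path, and emitting via MetadataChangeProposalWrapper are assumptions about the surrounding SDK, and the URN is made up:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import (
        DataTransformClass,
        DataTransformLogicClass,
        QueryStatementClass,  # assumed to keep its existing generated signature
    )

    # Wrap a SQL statement in the new transform aspect.
    logic = DataTransformLogicClass(
        transforms=[
            DataTransformClass(
                queryStatement=QueryStatementClass(value="SELECT * FROM src_table")
            )
        ]
    )

    # Hypothetical dataJob URN; dataTransformLogic is listed as a dataJob aspect above.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataJob:(urn:li:dataFlow:(airflow,example_flow,PROD),example_task)",
        aspect=logic,
    )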
datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py CHANGED

@@ -19,6 +19,8 @@ from .....schema_classes import CostCostClass
 from .....schema_classes import CostCostDiscriminatorClass
 from .....schema_classes import CostTypeClass
 from .....schema_classes import DataPlatformInstanceClass
+from .....schema_classes import DataTransformClass
+from .....schema_classes import DataTransformLogicClass
 from .....schema_classes import DeprecationClass
 from .....schema_classes import DocumentationClass
 from .....schema_classes import DocumentationAssociationClass
@@ -79,6 +81,8 @@ CostCost = CostCostClass
 CostCostDiscriminator = CostCostDiscriminatorClass
 CostType = CostTypeClass
 DataPlatformInstance = DataPlatformInstanceClass
+DataTransform = DataTransformClass
+DataTransformLogic = DataTransformLogicClass
 Deprecation = DeprecationClass
 Documentation = DocumentationClass
 DocumentationAssociation = DocumentationAssociationClass