acryl-datahub 0.15.0.1rc12-py3-none-any.whl → 0.15.0.1rc13-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. The information is provided for informational purposes only.

This version of acryl-datahub has been flagged as a potentially problematic release.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=LdmQMvZSZxJJKW3u06itY2EkTfgOjWTGkJHo9YvmkV0,577
+ datahub/__init__.py,sha256=kKM5imQ7UziWDSMvn1Ic5ZENvcshwalM2y2qGjZxUHY,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -169,7 +169,7 @@ datahub/ingestion/graph/config.py,sha256=3b_Gxa5wcBnphP63bBiAFdWS7PJhUHRE1WZL_q4
  datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
  datahub/ingestion/graph/filters.py,sha256=UeUZQHoimavIYx-jXLA0WGkOUe10TaO8uEZkfa-QgNE,6188
  datahub/ingestion/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py,sha256=rrnlgptYF3YkxWlLYpkLm3mgrmzHcy6AwTHUG18bKVA,8373
+ datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py,sha256=O2SGDU2_qMtyr_1BH9-WkNOojFWig2z4O3M21nTRo70,9908
  datahub/ingestion/reporting/file_reporter.py,sha256=tiWukmMxHrTQI3rOAumsq6lRlw8T6spqpS6XBDYnrZU,1640
  datahub/ingestion/reporting/reporting_provider_registry.py,sha256=jTYSh3T4sensjnHQfPLiIcbA2dG8w0px9ghChAJjGdU,310
  datahub/ingestion/run/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -303,9 +303,9 @@ datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
- datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=9brJW_HVrxJk1kAP20M7flmgYOMemOmaEl2zheWFW3c,17105
+ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=wRnRaIVUG483tY4nyDkEn6Xi2RL5MjrVvoCoZimqwSg,7514
+ datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=LvDGTaAaI-T0OZ3fkaFwipLdzPePunuSVWoEuSBsfEM,11099
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -566,8 +566,8 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
  datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
- datahub/metadata/_schema_classes.py,sha256=NbZUezNRH72XQUl4i_DlV-oRT4KzEBYFYcHcwGqXq9A,962516
- datahub/metadata/schema.avsc,sha256=Ulqzumt0EK7nD_OATi0hbCgw42ngoenja9SXWWsobIk,728543
+ datahub/metadata/_schema_classes.py,sha256=IAWpWPxOeGmvmc96dapE0CySk1Rikbh-YieT-K9YTMY,964636
+ datahub/metadata/schema.avsc,sha256=CeVb_Z7k0e5kmeqDUXUW7JDL6KSKBCdfAZzqRI_mLZo,729869
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -581,7 +581,7 @@ datahub/metadata/com/linkedin/pegasus2avro/access/token/__init__.py,sha256=P9M7N
  datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py,sha256=PgK5O-6pVRaEcvmwXAsSkwRLe8NjGiLH8AVBXeArqK8,5751
  datahub/metadata/com/linkedin/pegasus2avro/businessattribute/__init__.py,sha256=N8kO-eUi0_Rt7weizIExxlnJ2_kZRtPrZLWCC1xtDMA,653
  datahub/metadata/com/linkedin/pegasus2avro/chart/__init__.py,sha256=RNyyHLBNp_fxgFcBOLWO2UsXR1ofD_JczcBdPEQSusg,848
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py,sha256=yD2OyoQhMT3KnvxRyzPXiHsyeH-wHG1NBlLn64iCE4A,5333
+ datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py,sha256=ukX0VnveTrMx9G6uDaTkuk4Z2kxXr2hUK8srZuRPxj0,5520
  datahub/metadata/com/linkedin/pegasus2avro/common/fieldtransformer/__init__.py,sha256=FN63vLiB3FCmIRqBjTA-0Xt7M6i7h5NhaVzbA1ysv18,396
  datahub/metadata/com/linkedin/pegasus2avro/connection/__init__.py,sha256=qRtw-dB14pzVzgQ0pDK8kyBplNdpRxVKNj4D70e_FqI,564
  datahub/metadata/com/linkedin/pegasus2avro/container/__init__.py,sha256=3yWt36KqDKFhRc9pzvt0AMnbMTlhKurGvT3BUvc25QU,510
@@ -705,7 +705,7 @@ datahub/metadata/schemas/DataHubViewInfo.avsc,sha256=U3fBIoG9ietLUpOknfQGNekqBdP
  datahub/metadata/schemas/DataHubViewKey.avsc,sha256=p53axIdSVbubo3r23Vpsed7NqRcQBMGveVikEHAVAok,424
  datahub/metadata/schemas/DataJobInfo.avsc,sha256=--obUbt_4X2paB39EeRKP13sBSiK-r0nq070EamoV1w,7212
  datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=H1O8eAzZV34tvULdu67iBSWkdn08rt7wS208b8Nisbk,15268
- datahub/metadata/schemas/DataJobKey.avsc,sha256=_fSAQDgP_UPtZfqAZPhJmsHxxltuMh9btgw20z4R6Xk,1555
+ datahub/metadata/schemas/DataJobKey.avsc,sha256=4F3myS-O6n7AlUqTvCkMSFvsYAjVhUq6uaQVbqLoYdM,1583
  datahub/metadata/schemas/DataPlatformInfo.avsc,sha256=WGPFumBNHbR75vsLrivnRCbBc8vSCuxDw2UlylMieh4,2686
  datahub/metadata/schemas/DataPlatformInstance.avsc,sha256=SNd3v_YyyLaDflv8Rd5cQR9GrVuky_cDTkYM6FqJiM8,1058
  datahub/metadata/schemas/DataPlatformInstanceKey.avsc,sha256=sXUV5EMT6N-x8d6s8ebcJ5JdFIOsJCtiiU5Jtm-ncIk,800
@@ -721,6 +721,7 @@ datahub/metadata/schemas/DataProcessInstanceRunEvent.avsc,sha256=zwTYULEnpMbqwkL
  datahub/metadata/schemas/DataProcessKey.avsc,sha256=mY1BDiEYo8RchI9DckQEz9Vks5Ibt2RdWZU8OYGnrHA,2240
  datahub/metadata/schemas/DataProductKey.avsc,sha256=tcdQNWk3pLA3xZzOnHvZuq2u4SQuk2YcAlsxE8CcEeU,621
  datahub/metadata/schemas/DataProductProperties.avsc,sha256=nYEK6JgpTprU0iZaqWLZsBGYJLkh6HCi1qCu-wbYhvM,6925
+ datahub/metadata/schemas/DataTransformLogic.avsc,sha256=wDng1GK9znVoK0INHGiSCSa-AH5MrDkVdMzz4wOWmrY,2011
  datahub/metadata/schemas/DataTypeInfo.avsc,sha256=MCjzal71P8uIXZg161LrU8rZTJocZeizK-YxYA0Det0,704
  datahub/metadata/schemas/DataTypeKey.avsc,sha256=Gs5uc_azwg10e36ZbwDTFQMevr0IfiFvJoEGHRzEilw,546
  datahub/metadata/schemas/DatahubIngestionCheckpoint.avsc,sha256=m2Zyrx3ZWDc5gHuwbmBSRJ3JN4NFkpUhDEKM2Yeuqrw,5681
@@ -982,8 +983,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.1rc12.dist-info/METADATA,sha256=w8H0vrzaAbeZnX_mqNopX7I929V_AYXeWGKidlrUrE8,173444
- acryl_datahub-0.15.0.1rc12.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0.1rc12.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.1rc12.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.1rc12.dist-info/RECORD,,
+ acryl_datahub-0.15.0.1rc13.dist-info/METADATA,sha256=KnCOYV5Kg855hgL3B3zmYHzPnXVeMoZYf_3ScEj1cyA,173444
+ acryl_datahub-0.15.0.1rc13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0.1rc13.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.1rc13.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.1rc13.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.1rc12"
+ __version__ = "0.15.0.1rc13"
 
 
  def is_dev_mode() -> bool:
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py CHANGED
@@ -146,12 +146,55 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
  aspect_value=source_info_aspect,
  )
 
+ @staticmethod
+ def _convert_sets_to_lists(obj: Any) -> Any:
+ """
+ Recursively converts all sets to lists in a Python object.
+ Works with nested dictionaries, lists, and sets.
+
+ Args:
+ obj: Any Python object that might contain sets
+
+ Returns:
+ The object with all sets converted to lists
+ """
+ if isinstance(obj, dict):
+ return {
+ key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
+ for key, value in obj.items()
+ }
+ elif isinstance(obj, list):
+ return [
+ DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+ for element in obj
+ ]
+ elif isinstance(obj, set):
+ return [
+ DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+ for element in obj
+ ]
+ elif isinstance(obj, tuple):
+ return tuple(
+ DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+ for element in obj
+ )
+ else:
+ return obj
+
  def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
  assert ctx.pipeline_config
  if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
  return ""
  else:
- return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
+ redacted_recipe = redact_raw_config(ctx.pipeline_config.get_raw_dict())
+ # This is required otherwise json dumps will fail
+ # with a TypeError: Object of type set is not JSON serializable
+ converted_recipe = (
+ DatahubIngestionRunSummaryProvider._convert_sets_to_lists(
+ redacted_recipe
+ )
+ )
+ return json.dumps(converted_recipe)
 
  def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
  self.sink.write_record_async(
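Context for the change above: json.dumps cannot serialize Python sets, and a redacted recipe can contain them, so the provider now converts sets to lists before dumping. A standalone sketch of the same convert-then-dump approach, using a hypothetical recipe dict rather than DataHub's own config objects:

    import json
    from typing import Any

    def convert_sets_to_lists(obj: Any) -> Any:
        # Recursively walk dicts, lists, sets, and tuples; sets become lists so
        # that json.dumps no longer raises TypeError.
        if isinstance(obj, dict):
            return {k: convert_sets_to_lists(v) for k, v in obj.items()}
        if isinstance(obj, (list, set)):
            return [convert_sets_to_lists(v) for v in obj]
        if isinstance(obj, tuple):
            return tuple(convert_sets_to_lists(v) for v in obj)
        return obj

    recipe = {"source": {"config": {"schema_pattern": {"allow": {"PUBLIC"}}}}}
    try:
        json.dumps(recipe)
    except TypeError as e:
        print(e)  # Object of type set is not JSON serializable
    print(json.dumps(convert_sets_to_lists(recipe)))  # sets are emitted as JSON arrays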
datahub/ingestion/source/gc/dataprocess_cleanup.py CHANGED
@@ -167,7 +167,7 @@ class DataJobEntity:
  class DataProcessCleanupReport(SourceReport):
  num_aspects_removed: int = 0
  num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
- sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+ sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
  default_factory=TopKDict
  )
  num_data_flows_found: int = 0
@@ -286,9 +286,9 @@ class DataProcessCleanup:
  self.report.num_aspect_removed_by_type[type] = (
  self.report.num_aspect_removed_by_type.get(type, 0) + 1
  )
- if type not in self.report.sample_removed_aspects_by_type:
- self.report.sample_removed_aspects_by_type[type] = LossyList()
- self.report.sample_removed_aspects_by_type[type].append(urn)
+ if type not in self.report.sample_soft_deleted_aspects_by_type:
+ self.report.sample_soft_deleted_aspects_by_type[type] = LossyList()
+ self.report.sample_soft_deleted_aspects_by_type[type].append(urn)
 
  if self.dry_run:
  logger.info(
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED
@@ -1,9 +1,10 @@
  import logging
  import time
- from concurrent.futures import ThreadPoolExecutor, as_completed
+ from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
  from dataclasses import dataclass, field
  from datetime import datetime, timezone
- from typing import List, Optional
+ from threading import Lock
+ from typing import Dict, Iterable, List, Optional
 
  from pydantic import Field
 
@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn
 
  logger = logging.getLogger(__name__)
 
+ QUERY_QUERY_ENTITY = """
+ query listQueries($input: ScrollAcrossEntitiesInput!) {
+ scrollAcrossEntities(input: $input) {
+ nextScrollId
+ count
+ searchResults {
+ entity {
+ ... on QueryEntity {
+ urn
+ }
+ }
+ }
+ }
+ }
+ """
+
 
  class SoftDeletedEntitiesCleanupConfig(ConfigModel):
  enabled: bool = Field(
  default=True, description="Whether to do soft deletion cleanup."
  )
- retention_days: Optional[int] = Field(
+ retention_days: int = Field(
  10,
  description="Number of days to retain metadata in DataHub",
  )
@@ -62,23 +79,30 @@ class SoftDeletedEntitiesCleanupConfig:
  default=None,
  description="Query to filter entities",
  )
+
  limit_entities_delete: Optional[int] = Field(
  25000, description="Max number of entities to delete."
  )
 
- runtime_limit_seconds: Optional[int] = Field(
- None,
+ futures_max_at_time: int = Field(
+ 1000, description="Max number of futures to have at a time."
+ )
+
+ runtime_limit_seconds: int = Field(
+ 7200, # 2 hours by default
  description="Runtime limit in seconds",
  )
 
 
  @dataclass
  class SoftDeletedEntitiesReport(SourceReport):
- num_soft_deleted_entity_removed: int = 0
- num_soft_deleted_entity_removed_by_type: TopKDict[str, int] = field(
- default_factory=TopKDict
- )
- sample_soft_deleted_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+ num_queries_found: int = 0
+ num_soft_deleted_entity_processed: int = 0
+ num_soft_deleted_retained_due_to_age: int = 0
+ num_soft_deleted_entity_removal_started: int = 0
+ num_hard_deleted: int = 0
+ num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+ sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
  default_factory=TopKDict
  )
 
@@ -103,48 +127,53 @@ class SoftDeletedEntitiesCleanup:
  self.config = config
  self.report = report
  self.dry_run = dry_run
+ self.start_time = 0.0
+ self._report_lock: Lock = Lock()
+ self.last_print_time = 0.0
+
+ def _increment_retained_count(self) -> None:
+ """Thread-safe method to update report fields"""
+ with self._report_lock:
+ self.report.num_soft_deleted_retained_due_to_age += 1
+
+ def _increment_removal_started_count(self) -> None:
+ """Thread-safe method to update report fields"""
+ with self._report_lock:
+ self.report.num_soft_deleted_entity_removal_started += 1
+
+ def _update_report(self, urn: str, entity_type: str) -> None:
+ """Thread-safe method to update report fields"""
+ with self._report_lock:
+ self.report.num_hard_deleted += 1
+
+ current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+ self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+ if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+ self.report.sample_hard_deleted_aspects_by_type[
+ entity_type
+ ] = LossyList()
+ self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
  def delete_entity(self, urn: str) -> None:
  assert self.ctx.graph
 
  entity_urn = Urn.from_string(urn)
- self.report.num_soft_deleted_entity_removed += 1
- self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
- self.report.num_soft_deleted_entity_removed_by_type.get(
- entity_urn.entity_type, 0
- )
- + 1
- )
- if (
- entity_urn.entity_type
- not in self.report.sample_soft_deleted_removed_aspects_by_type
- ):
- self.report.sample_soft_deleted_removed_aspects_by_type[
- entity_urn.entity_type
- ] = LossyList()
- self.report.sample_soft_deleted_removed_aspects_by_type[
- entity_urn.entity_type
- ].append(urn)
-
  if self.dry_run:
  logger.info(
  f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
  )
  return
-
+ self._increment_removal_started_count()
  self.ctx.graph.delete_entity(urn=urn, hard=True)
  self.ctx.graph.delete_references_to_urn(
  urn=urn,
  dry_run=False,
  )
+ self._update_report(urn, entity_urn.entity_type)
 
  def delete_soft_deleted_entity(self, urn: str) -> None:
  assert self.ctx.graph
 
- if self.config.retention_days is None:
- logger.info("Retention days is not set, skipping soft delete cleanup")
- return
-
  retention_time = (
  int(datetime.now(timezone.utc).timestamp())
  - self.config.retention_days * 24 * 60 * 60
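A note on units in the retention check above: the cutoff is computed in epoch seconds, while the status aspect's "created" audit stamp (compared in the next hunk) is stored in epoch milliseconds, which is why the comparison multiplies retention_time by 1000. A small worked sketch with made-up timestamps:

    from datetime import datetime, timezone

    retention_days = 10
    now_s = int(datetime.now(timezone.utc).timestamp())
    retention_time = now_s - retention_days * 24 * 60 * 60  # cutoff, in seconds

    # Audit stamp times are epoch milliseconds; pretend the entity was
    # soft-deleted 15 days ago.
    soft_deleted_at_ms = (now_s - 15 * 24 * 60 * 60) * 1000

    if soft_deleted_at_ms < retention_time * 1000:
        print("older than the retention window -> hard delete")
    else:
        print("still inside the retention window -> retained")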
@@ -157,15 +186,85 @@ class SoftDeletedEntitiesCleanup:
  ]["created"]["time"] < (retention_time * 1000):
  logger.debug(f"Hard deleting {urn}")
  self.delete_entity(urn)
+ else:
+ self._increment_retained_count()
+
+ def _print_report(self) -> None:
+ time_taken = round(time.time() - self.last_print_time, 1)
+ # Print report every 2 minutes
+ if time_taken > 120:
+ self.last_print_time = time.time()
+ logger.info(f"\n{self.report.as_string()}")
+
+ def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+ done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+ futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+ for future in done:
+ self._print_report()
+ if future.exception():
+ logger.error(
+ f"Failed to delete entity {futures[future]}: {future.exception()}"
+ )
+ self.report.failure(
+ f"Failed to delete entity {futures[future]}",
+ exc=future.exception(),
+ )
+ self.report.num_soft_deleted_entity_processed += 1
+ if (
+ self.report.num_soft_deleted_entity_processed % self.config.batch_size
+ == 0
+ ):
+ if self.config.delay:
+ logger.debug(
+ f"Sleeping for {self.config.delay} seconds before further processing batch"
+ )
+ time.sleep(self.config.delay)
+ return futures
 
- def cleanup_soft_deleted_entities(self) -> None:
- if not self.config.enabled:
- return
+ def _get_soft_deleted_queries(self) -> Iterable[str]:
  assert self.ctx.graph
- start_time = time.time()
-
- deleted_count_retention = 0
- urns = self.ctx.graph.get_urns_by_filter(
+ scroll_id: Optional[str] = None
+ while True:
+ try:
+ result = self.ctx.graph.execute_graphql(
+ QUERY_QUERY_ENTITY,
+ {
+ "input": {
+ "types": ["QUERY"],
+ "query": "*",
+ "scrollId": scroll_id if scroll_id else None,
+ "count": self.config.batch_size,
+ "orFilters": [
+ {
+ "and": [
+ {
+ "field": "removed",
+ "values": ["true"],
+ "condition": "EQUAL",
+ }
+ ]
+ }
+ ],
+ }
+ },
+ )
+ except Exception as e:
+ self.report.failure(
+ f"While trying to get queries with {scroll_id}", exc=e
+ )
+ break
+ scroll_across_entities = result.get("scrollAcrossEntities")
+ if not scroll_across_entities:
+ break
+ scroll_id = scroll_across_entities.get("nextScrollId")
+ self.report.num_queries_found += scroll_across_entities.get("count")
+ for query in scroll_across_entities.get("searchResults"):
+ yield query["entity"]["urn"]
+
+ def _get_urns(self) -> Iterable[str]:
+ assert self.ctx.graph
+ yield from self.ctx.graph.get_urns_by_filter(
  entity_types=self.config.entity_types,
  platform=self.config.platform,
  env=self.config.env,
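The _process_futures helper above, together with the submission loop in the next hunk, implements a bounded-concurrency pattern: at most futures_max_at_time deletions are in flight, and wait(..., return_when=FIRST_COMPLETED) drains finished futures before more work is submitted. A self-contained sketch of the same pattern with a dummy task (the task and urns here are illustrative, not DataHub APIs):

    import time
    from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
    from typing import Dict

    def delete_one(urn: str) -> None:
        time.sleep(0.01)  # stand-in for a slow remote hard-delete call

    def process_futures(futures: Dict[Future, str]) -> Dict[Future, str]:
        # Block until at least one future finishes, then keep only unfinished ones.
        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
        for f in done:
            if f.exception():
                print(f"failed to delete {futures[f]}: {f.exception()}")
        return {f: urn for f, urn in futures.items() if f in not_done}

    urns = [f"urn:li:corpuser:user_{i}" for i in range(50)]
    max_in_flight = 8
    futures: Dict[Future, str] = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        for urn in urns:
            while len(futures) >= max_in_flight:  # back-pressure: cap outstanding work
                futures = process_futures(futures)
            futures[executor.submit(delete_one, urn)] = urn
        while futures:  # drain whatever is still running
            futures = process_futures(futures)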
@@ -173,52 +272,41 @@
  query=self.config.query,
  status=RemovedStatusFilter.ONLY_SOFT_DELETED,
  batch_size=self.config.batch_size,
  )
+ yield from self._get_soft_deleted_queries()
+
+ def cleanup_soft_deleted_entities(self) -> None:
+ if not self.config.enabled:
+ return
+ self.start_time = time.time()
 
- futures = {}
+ futures: Dict[Future, str] = dict()
  with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
- num_urns_submitted = 0
- for urn in urns:
- num_urns_submitted += 1
+ for urn in self._get_urns():
+ self._print_report()
+ while len(futures) >= self.config.futures_max_at_time:
+ futures = self._process_futures(futures)
  if (
  self.config.limit_entities_delete
- and num_urns_submitted > self.config.limit_entities_delete
+ and self.report.num_hard_deleted > self.config.limit_entities_delete
  ):
  logger.info(
- f"Limit of {self.config.limit_entities_delete} entities reached. Stopping"
+ f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
  )
  break
  if (
  self.config.runtime_limit_seconds
- and time.time() - start_time > self.config.runtime_limit_seconds
+ and time.time() - self.start_time
+ > self.config.runtime_limit_seconds
  ):
  logger.info(
- f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Stopping"
+ f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
  )
  break
 
  future = executor.submit(self.delete_soft_deleted_entity, urn)
  futures[future] = urn
 
- if not futures:
- return
- for future in as_completed(futures):
- if future.exception():
- logger.error(
- f"Failed to delete entity {futures[future]}: {future.exception()}"
- )
- self.report.failure(
- f"Failed to delete entity {futures[future]}",
- exc=future.exception(),
- )
- deleted_count_retention += 1
-
- if deleted_count_retention % self.config.batch_size == 0:
- logger.info(
- f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
- )
-
- if self.config.delay:
- logger.debug(
- f"Sleeping for {self.config.delay} seconds before getting next batch"
- )
- time.sleep(self.config.delay)
+ logger.info(f"Waiting for {len(futures)} futures to complete")
+ while len(futures) > 0:
+ self._print_report()
+ futures = self._process_futures(futures)
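Taken together, the knobs that govern this loop changed in this release: retention_days is now a required int (default 10), runtime_limit_seconds defaults to 7200 seconds, the new futures_max_at_time caps in-flight deletions, and limit_entities_delete is now compared against the number of entities actually hard-deleted. A minimal sketch of building the config with these fields (the values are illustrative; field names come from the hunks above):

    from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
        SoftDeletedEntitiesCleanupConfig,
    )

    # Defaults per the diff: retention_days=10, runtime_limit_seconds=7200,
    # futures_max_at_time=1000, limit_entities_delete=25000.
    default_config = SoftDeletedEntitiesCleanupConfig()

    # A more cautious run: shorter time budget, fewer in-flight deletions, lower cap.
    cautious_config = SoftDeletedEntitiesCleanupConfig(
        retention_days=30,
        futures_max_at_time=100,
        runtime_limit_seconds=600,
        limit_entities_delete=1000,
    )
    print(default_config.runtime_limit_seconds, cautious_config.futures_max_at_time)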
datahub/metadata/_schema_classes.py CHANGED
@@ -4053,6 +4053,60 @@ class DataPlatformInstanceClass(_Aspect):
  self._inner_dict['instance'] = value
 
 
+ class DataTransformClass(DictWrapper):
+ """Information about a transformation. It may be a query,"""
+
+ RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransform")
+ def __init__(self,
+ queryStatement: Union[None, "QueryStatementClass"]=None,
+ ):
+ super().__init__()
+
+ self.queryStatement = queryStatement
+
+ def _restore_defaults(self) -> None:
+ self.queryStatement = self.RECORD_SCHEMA.fields_dict["queryStatement"].default
+
+
+ @property
+ def queryStatement(self) -> Union[None, "QueryStatementClass"]:
+ """The data transform may be defined by a query statement"""
+ return self._inner_dict.get('queryStatement') # type: ignore
+
+ @queryStatement.setter
+ def queryStatement(self, value: Union[None, "QueryStatementClass"]) -> None:
+ self._inner_dict['queryStatement'] = value
+
+
+ class DataTransformLogicClass(_Aspect):
+ """Information about a Query against one or more data assets (e.g. Tables or Views)."""
+
+
+ ASPECT_NAME = 'dataTransformLogic'
+ ASPECT_INFO = {}
+ RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransformLogic")
+
+ def __init__(self,
+ transforms: List["DataTransformClass"],
+ ):
+ super().__init__()
+
+ self.transforms = transforms
+
+ def _restore_defaults(self) -> None:
+ self.transforms = list()
+
+
+ @property
+ def transforms(self) -> List["DataTransformClass"]:
+ """List of transformations applied"""
+ return self._inner_dict.get('transforms') # type: ignore
+
+ @transforms.setter
+ def transforms(self, value: List["DataTransformClass"]) -> None:
+ self._inner_dict['transforms'] = value
+
+
  class DeprecationClass(_Aspect):
  """Deprecation status of an entity"""
 
@@ -14624,7 +14678,7 @@ class DataJobKeyClass(_Aspect):
 
 
  ASPECT_NAME = 'dataJobKey'
- ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults']}
+ ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
  RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")
 
  def __init__(self,
@@ -24715,6 +24769,8 @@ __SCHEMA_TYPES = {
  'com.linkedin.pegasus2avro.common.CostCostDiscriminator': CostCostDiscriminatorClass,
  'com.linkedin.pegasus2avro.common.CostType': CostTypeClass,
  'com.linkedin.pegasus2avro.common.DataPlatformInstance': DataPlatformInstanceClass,
+ 'com.linkedin.pegasus2avro.common.DataTransform': DataTransformClass,
+ 'com.linkedin.pegasus2avro.common.DataTransformLogic': DataTransformLogicClass,
  'com.linkedin.pegasus2avro.common.Deprecation': DeprecationClass,
  'com.linkedin.pegasus2avro.common.Documentation': DocumentationClass,
  'com.linkedin.pegasus2avro.common.DocumentationAssociation': DocumentationAssociationClass,
@@ -25182,6 +25238,8 @@ __SCHEMA_TYPES = {
  'CostCostDiscriminator': CostCostDiscriminatorClass,
  'CostType': CostTypeClass,
  'DataPlatformInstance': DataPlatformInstanceClass,
+ 'DataTransform': DataTransformClass,
+ 'DataTransformLogic': DataTransformLogicClass,
  'Deprecation': DeprecationClass,
  'Documentation': DocumentationClass,
  'DocumentationAssociation': DocumentationAssociationClass,
@@ -25588,6 +25646,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
  CostClass,
  BrowsePathsClass,
  InstitutionalMemoryClass,
+ DataTransformLogicClass,
  SubTypesClass,
  FormsClass,
  DeprecationClass,
@@ -25802,6 +25861,7 @@ class AspectBag(TypedDict, total=False):
  cost: CostClass
  browsePaths: BrowsePathsClass
  institutionalMemory: InstitutionalMemoryClass
+ dataTransformLogic: DataTransformLogicClass
  subTypes: SubTypesClass
  forms: FormsClass
  deprecation: DeprecationClass
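The hunks above define the new dataTransformLogic aspect, register it in the schema-type maps, add it to ASPECT_CLASSES and AspectBag, and list it on DataJobKey. A minimal sketch of constructing the aspect for a data job, assuming QueryStatementClass takes the SQL text via its value field and that the aspect is attached with the usual MetadataChangeProposalWrapper route; the urn and SQL below are made up:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import (
        DataTransformClass,
        DataTransformLogicClass,
        QueryStatementClass,
    )

    # Describe the transformation a job performs as a single SQL statement.
    transform_logic = DataTransformLogicClass(
        transforms=[
            DataTransformClass(
                queryStatement=QueryStatementClass(
                    value="INSERT INTO sales_agg SELECT region, SUM(amount) FROM sales GROUP BY region",
                )
            )
        ]
    )

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataJob:(urn:li:dataFlow:(airflow,sales_dag,prod),agg_task)",
        aspect=transform_logic,
    )
    # mcp can then be emitted with any DataHub emitter (REST or Kafka).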
datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py CHANGED
@@ -19,6 +19,8 @@ from .....schema_classes import CostCostClass
  from .....schema_classes import CostCostDiscriminatorClass
  from .....schema_classes import CostTypeClass
  from .....schema_classes import DataPlatformInstanceClass
+ from .....schema_classes import DataTransformClass
+ from .....schema_classes import DataTransformLogicClass
  from .....schema_classes import DeprecationClass
  from .....schema_classes import DocumentationClass
  from .....schema_classes import DocumentationAssociationClass
@@ -79,6 +81,8 @@ CostCost = CostCostClass
  CostCostDiscriminator = CostCostDiscriminatorClass
  CostType = CostTypeClass
  DataPlatformInstance = DataPlatformInstanceClass
+ DataTransform = DataTransformClass
+ DataTransformLogic = DataTransformLogicClass
  Deprecation = DeprecationClass
  Documentation = DocumentationClass
  DocumentationAssociation = DocumentationAssociationClass