deltacat 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +25 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
- deltacat/compute/compactor/model/delta_file_envelope.py +21 -3
- deltacat/compute/compactor/model/table_object_store.py +51 -0
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor_v2/compaction_session.py +80 -14
- deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
- deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +83 -0
- deltacat/compute/compactor_v2/deletes/delete_strategy.py +82 -0
- deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +161 -0
- deltacat/compute/compactor_v2/deletes/model.py +23 -0
- deltacat/compute/compactor_v2/deletes/utils.py +164 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_input.py +24 -1
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +221 -50
- deltacat/compute/compactor_v2/utils/delta.py +11 -1
- deltacat/compute/compactor_v2/utils/merge.py +10 -0
- deltacat/compute/compactor_v2/utils/task_options.py +94 -8
- deltacat/io/memcached_object_store.py +20 -0
- deltacat/io/ray_plasma_object_store.py +6 -0
- deltacat/logs.py +29 -2
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +2 -0
- deltacat/storage/model/delete_parameters.py +40 -0
- deltacat/storage/model/delta.py +25 -1
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +1930 -0
- deltacat/tests/compute/compact_partition_test_cases.py +16 -822
- deltacat/tests/compute/compactor/utils/test_io.py +4 -4
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -0
- deltacat/tests/compute/test_compact_partition_params.py +5 -0
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +32 -20
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +28 -10
- deltacat/tests/io/test_memcached_object_store.py +19 -0
- deltacat/tests/local_deltacat_storage/__init__.py +3 -0
- deltacat/tests/test_utils/constants.py +1 -2
- deltacat/tests/test_utils/pyarrow.py +27 -10
- deltacat/utils/pandas.py +1 -1
- deltacat/utils/ray_utils/runtime.py +3 -3
- deltacat/utils/resources.py +7 -5
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/METADATA +1 -1
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/RECORD +47 -38
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/LICENSE +0 -0
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/WHEEL +0 -0
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from deltacat.compute.compactor_v2.constants import (
|
|
20
20
|
AVERAGE_RECORD_SIZE_BYTES,
|
21
21
|
TASK_MAX_PARALLELISM,
|
22
22
|
DROP_DUPLICATES,
|
23
|
+
TOTAL_MEMORY_BUFFER_PERCENTAGE,
|
23
24
|
)
|
24
25
|
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
25
26
|
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
|
@@ -57,6 +58,7 @@ class CompactPartitionParams(dict):
|
|
57
58
|
"compacted_file_content_type", ContentType.PARQUET
|
58
59
|
)
|
59
60
|
result.object_store = params.get("object_store", RayPlasmaObjectStore())
|
61
|
+
|
60
62
|
result.enable_profiler = params.get("enable_profiler", False)
|
61
63
|
result.deltacat_storage = params.get(
|
62
64
|
"deltacat_storage", unimplemented_deltacat_storage
|
@@ -84,12 +86,17 @@ class CompactPartitionParams(dict):
|
|
84
86
|
result.average_record_size_bytes = params.get(
|
85
87
|
"average_record_size_bytes", AVERAGE_RECORD_SIZE_BYTES
|
86
88
|
)
|
89
|
+
result.total_memory_buffer_percentage = params.get(
|
90
|
+
"total_memory_buffer_percentage", TOTAL_MEMORY_BUFFER_PERCENTAGE
|
91
|
+
)
|
87
92
|
result.hash_group_count = params.get(
|
88
93
|
"hash_group_count", result.hash_bucket_count
|
89
94
|
)
|
90
95
|
result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
|
91
96
|
result.ray_custom_resources = params.get("ray_custom_resources")
|
92
97
|
|
98
|
+
result.memory_logs_enabled = params.get("memory_logs_enabled", False)
|
99
|
+
|
93
100
|
result.metrics_config = params.get("metrics_config")
|
94
101
|
|
95
102
|
if not importlib.util.find_spec("memray"):
|
@@ -189,6 +196,16 @@ class CompactPartitionParams(dict):
|
|
189
196
|
def average_record_size_bytes(self, average_record_size_bytes: float) -> None:
|
190
197
|
self["average_record_size_bytes"] = average_record_size_bytes
|
191
198
|
|
199
|
+
@property
|
200
|
+
def total_memory_buffer_percentage(self) -> int:
|
201
|
+
return self["total_memory_buffer_percentage"]
|
202
|
+
|
203
|
+
@total_memory_buffer_percentage.setter
|
204
|
+
def total_memory_buffer_percentage(
|
205
|
+
self, total_memory_buffer_percentage: int
|
206
|
+
) -> None:
|
207
|
+
self["total_memory_buffer_percentage"] = total_memory_buffer_percentage
|
208
|
+
|
192
209
|
@property
|
193
210
|
def min_files_in_batch(self) -> float:
|
194
211
|
return self["min_files_in_batch"]
|
@@ -354,6 +371,14 @@ class CompactPartitionParams(dict):
|
|
354
371
|
def sort_keys(self, keys: List[SortKey]) -> None:
|
355
372
|
self["sort_keys"] = keys
|
356
373
|
|
374
|
+
@property
|
375
|
+
def memory_logs_enabled(self) -> bool:
|
376
|
+
return self.get("memory_logs_enabled")
|
377
|
+
|
378
|
+
@memory_logs_enabled.setter
|
379
|
+
def memory_logs_enabled(self, value: bool) -> None:
|
380
|
+
self["memory_logs_enabled"] = value
|
381
|
+
|
357
382
|
@property
|
358
383
|
def metrics_config(self) -> Optional[MetricsConfig]:
|
359
384
|
return self.get("metrics_config")
|
@@ -84,6 +84,13 @@ class CompactionSessionAuditInfo(dict):
|
|
84
84
|
"""
|
85
85
|
return self.get("recordsDeduped")
|
86
86
|
|
87
|
+
@property
|
88
|
+
def records_deleted(self) -> int:
|
89
|
+
"""
|
90
|
+
The total count of deleted records in a compaction session if delete deltas are present.
|
91
|
+
"""
|
92
|
+
return self.get("recordsDeleted")
|
93
|
+
|
87
94
|
@property
|
88
95
|
def input_size_bytes(self) -> float:
|
89
96
|
"""
|
@@ -461,6 +468,10 @@ class CompactionSessionAuditInfo(dict):
|
|
461
468
|
self["recordsDeduped"] = records_deduped
|
462
469
|
return self
|
463
470
|
|
471
|
+
def set_records_deleted(self, records_deleted: int) -> CompactionSessionAuditInfo:
|
472
|
+
self["recordsDeleted"] = records_deleted
|
473
|
+
return self
|
474
|
+
|
464
475
|
def set_input_size_bytes(
|
465
476
|
self, input_size_bytes: float
|
466
477
|
) -> CompactionSessionAuditInfo:
|
@@ -5,6 +5,9 @@ import numpy as np
|
|
5
5
|
import pyarrow as pa
|
6
6
|
|
7
7
|
from deltacat.storage import DeltaType, LocalTable
|
8
|
+
from deltacat.compute.compactor.model.table_object_store import (
|
9
|
+
LocalTableStorageStrategy,
|
10
|
+
)
|
8
11
|
|
9
12
|
from typing import Optional
|
10
13
|
|
@@ -20,18 +23,21 @@ class DeltaFileEnvelope(dict):
|
|
20
23
|
file_index: int = None,
|
21
24
|
is_src_delta: np.bool_ = True,
|
22
25
|
file_record_count: Optional[int] = None,
|
26
|
+
table_storage_strategy: [LocalTableStorageStrategy] = None,
|
23
27
|
) -> DeltaFileEnvelope:
|
24
|
-
"""
|
28
|
+
"""
|
29
|
+
Static factory builder for a Delta File Envelope
|
25
30
|
`
|
26
31
|
Args:
|
27
32
|
stream_position: Stream position of a delta.
|
28
|
-
file_index: Manifest file index number of a delta.
|
29
33
|
delta_type: A delta type.
|
30
34
|
table: The table object that represents the delta file.
|
35
|
+
file_index: Manifest file index number of a delta.
|
31
36
|
is_src_delta: True if this Delta File Locator is
|
32
37
|
pointing to a file from the uncompacted source table, False if
|
33
38
|
this Locator is pointing to a file in the compacted destination
|
34
39
|
table.
|
40
|
+
table_storage_strategy: The way the table object is stored in the delta file envelope. If None just stores the table normally
|
35
41
|
Returns:
|
36
42
|
A delta file envelope.
|
37
43
|
|
@@ -46,7 +52,11 @@ class DeltaFileEnvelope(dict):
|
|
46
52
|
delta_file_envelope["streamPosition"] = stream_position
|
47
53
|
delta_file_envelope["fileIndex"] = file_index
|
48
54
|
delta_file_envelope["deltaType"] = delta_type.value
|
49
|
-
|
55
|
+
if table_storage_strategy is None:
|
56
|
+
delta_file_envelope["table"] = table
|
57
|
+
else:
|
58
|
+
delta_file_envelope["table"] = table_storage_strategy.store_table(table)
|
59
|
+
delta_file_envelope["table_storage_strategy"] = table_storage_strategy
|
50
60
|
delta_file_envelope["is_src_delta"] = is_src_delta
|
51
61
|
delta_file_envelope["file_record_count"] = file_record_count
|
52
62
|
return delta_file_envelope
|
@@ -63,8 +73,16 @@ class DeltaFileEnvelope(dict):
|
|
63
73
|
def delta_type(self) -> DeltaType:
|
64
74
|
return DeltaType(self["deltaType"])
|
65
75
|
|
76
|
+
@property
|
77
|
+
def table_storage_strategy(self) -> Optional[LocalTableStorageStrategy]:
|
78
|
+
return self["table_storage_strategy"]
|
79
|
+
|
66
80
|
@property
|
67
81
|
def table(self) -> LocalTable:
|
82
|
+
val = self.table_storage_strategy
|
83
|
+
if val is not None:
|
84
|
+
table_storage_strategy = val
|
85
|
+
return table_storage_strategy.get_table(self["table"])
|
68
86
|
return self["table"]
|
69
87
|
|
70
88
|
@property
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from ray.types import ObjectRef
|
4
|
+
|
5
|
+
from typing import Any, Union
|
6
|
+
|
7
|
+
from abc import ABC, abstractmethod, abstractproperty
|
8
|
+
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
|
9
|
+
from deltacat.storage import (
|
10
|
+
LocalTable,
|
11
|
+
)
|
12
|
+
from deltacat.io.object_store import IObjectStore
|
13
|
+
|
14
|
+
LocalTableReference = Union[ObjectRef, LocalTable]
|
15
|
+
|
16
|
+
|
17
|
+
class LocalTableStorageStrategy(ABC):
|
18
|
+
@abstractproperty
|
19
|
+
def object_store(cls) -> IObjectStore:
|
20
|
+
pass
|
21
|
+
|
22
|
+
@abstractmethod
|
23
|
+
def store_table(self, table: LocalTable) -> LocalTableReference:
|
24
|
+
pass
|
25
|
+
|
26
|
+
@abstractmethod
|
27
|
+
def get_table(self, table_like: LocalTableReference) -> LocalTable:
|
28
|
+
pass
|
29
|
+
|
30
|
+
|
31
|
+
class LocalTableRayObjectStoreReferenceStorageStrategy(LocalTableStorageStrategy):
|
32
|
+
"""
|
33
|
+
Stores the table in the RayPlasmaObjectStore - see deltacat/io/ray_plasma_object_store.py
|
34
|
+
"""
|
35
|
+
|
36
|
+
_object_store: IObjectStore = RayPlasmaObjectStore()
|
37
|
+
|
38
|
+
@property
|
39
|
+
def object_store(cls) -> IObjectStore:
|
40
|
+
return cls._object_store
|
41
|
+
|
42
|
+
def store_table(self, table: LocalTable) -> LocalTableReference:
|
43
|
+
obj_ref: ObjectRef = self.object_store.put(table)
|
44
|
+
return obj_ref
|
45
|
+
|
46
|
+
def get_table(self, table_like: LocalTableReference) -> LocalTable:
|
47
|
+
table = self.object_store.get(table_like)
|
48
|
+
return table
|
49
|
+
|
50
|
+
def get_table_reference(self, table_ref: Any) -> LocalTableReference:
|
51
|
+
return self.object_store.deserialize_references([table_ref])[0]
|
@@ -24,6 +24,16 @@ from deltacat.compute.compactor.model.materialize_result import MaterializeResul
|
|
24
24
|
from deltacat.compute.compactor_v2.utils.merge import (
|
25
25
|
generate_local_merge_input,
|
26
26
|
)
|
27
|
+
from deltacat.compute.compactor import DeltaAnnotated
|
28
|
+
from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
|
29
|
+
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
30
|
+
DeleteStrategy,
|
31
|
+
)
|
32
|
+
from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
|
33
|
+
DeleteFileEnvelope,
|
34
|
+
)
|
35
|
+
from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
|
36
|
+
|
27
37
|
from deltacat.storage import (
|
28
38
|
Delta,
|
29
39
|
DeltaLocator,
|
@@ -52,6 +62,7 @@ from deltacat.utils.resources import (
|
|
52
62
|
from deltacat.compute.compactor_v2.utils.task_options import (
|
53
63
|
hash_bucket_resource_options_provider,
|
54
64
|
merge_resource_options_provider,
|
65
|
+
local_merge_resource_options_provider,
|
55
66
|
)
|
56
67
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
57
68
|
|
@@ -95,7 +106,7 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
|
|
95
106
|
**params.s3_client_kwargs,
|
96
107
|
)
|
97
108
|
else:
|
98
|
-
logger.
|
109
|
+
logger.warning("No new partition was committed during compaction.")
|
99
110
|
|
100
111
|
logger.info(
|
101
112
|
f"Completed compaction session for: {params.source_partition_locator}"
|
@@ -149,7 +160,7 @@ def _execute_compaction(
|
|
149
160
|
)
|
150
161
|
if not round_completion_info:
|
151
162
|
logger.info(
|
152
|
-
|
163
|
+
"Both rebase partition and round completion file not found. Performing an entire backfill on source."
|
153
164
|
)
|
154
165
|
else:
|
155
166
|
compacted_delta_locator = round_completion_info.compacted_delta_locator
|
@@ -175,7 +186,7 @@ def _execute_compaction(
|
|
175
186
|
|
176
187
|
delta_discovery_start = time.monotonic()
|
177
188
|
|
178
|
-
input_deltas = io.discover_deltas(
|
189
|
+
input_deltas: List[Delta] = io.discover_deltas(
|
179
190
|
params.source_partition_locator,
|
180
191
|
params.last_stream_position_to_compact,
|
181
192
|
params.rebase_source_partition_locator,
|
@@ -185,8 +196,24 @@ def _execute_compaction(
|
|
185
196
|
params.deltacat_storage_kwargs,
|
186
197
|
params.list_deltas_kwargs,
|
187
198
|
)
|
199
|
+
if not input_deltas:
|
200
|
+
logger.info("No input deltas found to compact.")
|
201
|
+
return None, None, None
|
188
202
|
|
189
|
-
|
203
|
+
delete_strategy: Optional[DeleteStrategy] = None
|
204
|
+
delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
|
205
|
+
delete_file_size_bytes: int = 0
|
206
|
+
if contains_delete_deltas(input_deltas):
|
207
|
+
input_deltas, delete_file_envelopes, delete_strategy = prepare_deletes(
|
208
|
+
params, input_deltas
|
209
|
+
)
|
210
|
+
for delete_file_envelope in delete_file_envelopes:
|
211
|
+
delete_file_size_bytes += delete_file_envelope.table_size_bytes
|
212
|
+
logger.info(
|
213
|
+
f" Input deltas contain DELETE-type deltas. Total delete file size={delete_file_size_bytes}."
|
214
|
+
f" Total length of delete file envelopes={len(delete_file_envelopes)}"
|
215
|
+
)
|
216
|
+
uniform_deltas: List[DeltaAnnotated] = io.create_uniform_input_deltas(
|
190
217
|
input_deltas=input_deltas,
|
191
218
|
hash_bucket_count=params.hash_bucket_count,
|
192
219
|
compaction_audit=compaction_audit,
|
@@ -212,10 +239,6 @@ def _execute_compaction(
|
|
212
239
|
**params.s3_client_kwargs,
|
213
240
|
)
|
214
241
|
|
215
|
-
if not input_deltas:
|
216
|
-
logger.info("No input deltas found to compact.")
|
217
|
-
return None, None, None
|
218
|
-
|
219
242
|
# create a new stream for this round
|
220
243
|
compacted_stream_locator = params.destination_partition_locator.stream_locator
|
221
244
|
compacted_stream = params.deltacat_storage.get_stream(
|
@@ -236,8 +259,10 @@ def _execute_compaction(
|
|
236
259
|
resource_amount_provider=hash_bucket_resource_options_provider,
|
237
260
|
previous_inflation=params.previous_inflation,
|
238
261
|
average_record_size_bytes=params.average_record_size_bytes,
|
262
|
+
total_memory_buffer_percentage=params.total_memory_buffer_percentage,
|
239
263
|
primary_keys=params.primary_keys,
|
240
264
|
ray_custom_resources=params.ray_custom_resources,
|
265
|
+
memory_logs_enabled=params.memory_logs_enabled,
|
241
266
|
)
|
242
267
|
|
243
268
|
total_input_records_count = np.int64(0)
|
@@ -246,9 +271,36 @@ def _execute_compaction(
|
|
246
271
|
if params.hash_bucket_count == 1:
|
247
272
|
merge_start = time.monotonic()
|
248
273
|
local_merge_input = generate_local_merge_input(
|
249
|
-
params,
|
274
|
+
params,
|
275
|
+
uniform_deltas,
|
276
|
+
compacted_partition,
|
277
|
+
round_completion_info,
|
278
|
+
delete_strategy,
|
279
|
+
delete_file_envelopes,
|
280
|
+
)
|
281
|
+
estimated_da_bytes = (
|
282
|
+
compaction_audit.estimated_in_memory_size_bytes_during_discovery
|
283
|
+
)
|
284
|
+
estimated_num_records = sum(
|
285
|
+
[
|
286
|
+
entry.meta.record_count
|
287
|
+
for delta in uniform_deltas
|
288
|
+
for entry in delta.manifest.entries
|
289
|
+
]
|
290
|
+
)
|
291
|
+
local_merge_options = local_merge_resource_options_provider(
|
292
|
+
estimated_da_size=estimated_da_bytes,
|
293
|
+
estimated_num_rows=estimated_num_records,
|
294
|
+
total_memory_buffer_percentage=params.total_memory_buffer_percentage,
|
295
|
+
round_completion_info=round_completion_info,
|
296
|
+
compacted_delta_manifest=previous_compacted_delta_manifest,
|
297
|
+
ray_custom_resources=params.ray_custom_resources,
|
298
|
+
primary_keys=params.primary_keys,
|
299
|
+
memory_logs_enabled=params.memory_logs_enabled,
|
300
|
+
)
|
301
|
+
local_merge_result = ray.get(
|
302
|
+
mg.merge.options(**local_merge_options).remote(local_merge_input)
|
250
303
|
)
|
251
|
-
local_merge_result = ray.get(mg.merge.remote(local_merge_input))
|
252
304
|
total_input_records_count += local_merge_result.input_record_count
|
253
305
|
merge_results = [local_merge_result]
|
254
306
|
merge_invoke_end = time.monotonic()
|
@@ -269,6 +321,7 @@ def _execute_compaction(
|
|
269
321
|
object_store=params.object_store,
|
270
322
|
deltacat_storage=params.deltacat_storage,
|
271
323
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
324
|
+
memory_logs_enabled=params.memory_logs_enabled,
|
272
325
|
)
|
273
326
|
}
|
274
327
|
|
@@ -345,6 +398,9 @@ def _execute_compaction(
|
|
345
398
|
)
|
346
399
|
|
347
400
|
# BSP Step 2: Merge
|
401
|
+
# NOTE: DELETE-type deltas are stored in Plasma object store
|
402
|
+
# in prepare_deletes and therefore don't need to included
|
403
|
+
# in merge task resource estimation
|
348
404
|
merge_options_provider = functools.partial(
|
349
405
|
task_resource_options_provider,
|
350
406
|
pg_config=params.pg_config,
|
@@ -352,12 +408,14 @@ def _execute_compaction(
|
|
352
408
|
num_hash_groups=params.hash_group_count,
|
353
409
|
hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
|
354
410
|
hash_group_num_rows=all_hash_group_idx_to_num_rows,
|
411
|
+
total_memory_buffer_percentage=params.total_memory_buffer_percentage,
|
355
412
|
round_completion_info=round_completion_info,
|
356
413
|
compacted_delta_manifest=previous_compacted_delta_manifest,
|
357
414
|
primary_keys=params.primary_keys,
|
358
415
|
deltacat_storage=params.deltacat_storage,
|
359
416
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
360
417
|
ray_custom_resources=params.ray_custom_resources,
|
418
|
+
memory_logs_enabled=params.memory_logs_enabled,
|
361
419
|
)
|
362
420
|
|
363
421
|
def merge_input_provider(index, item):
|
@@ -385,6 +443,9 @@ def _execute_compaction(
|
|
385
443
|
object_store=params.object_store,
|
386
444
|
deltacat_storage=params.deltacat_storage,
|
387
445
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
446
|
+
delete_strategy=delete_strategy,
|
447
|
+
delete_file_envelopes=delete_file_envelopes,
|
448
|
+
memory_logs_enabled=params.memory_logs_enabled,
|
388
449
|
)
|
389
450
|
}
|
390
451
|
|
@@ -406,7 +467,12 @@ def _execute_compaction(
|
|
406
467
|
merge_end = time.monotonic()
|
407
468
|
|
408
469
|
total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
|
409
|
-
|
470
|
+
total_deleted_record_count = sum(
|
471
|
+
[ddr.deleted_record_count for ddr in merge_results]
|
472
|
+
)
|
473
|
+
logger.info(
|
474
|
+
f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
|
475
|
+
)
|
410
476
|
|
411
477
|
compaction_audit.set_input_records(total_input_records_count.item())
|
412
478
|
|
@@ -419,7 +485,7 @@ def _execute_compaction(
|
|
419
485
|
)
|
420
486
|
|
421
487
|
compaction_audit.set_records_deduped(total_dd_record_count.item())
|
422
|
-
|
488
|
+
compaction_audit.set_records_deleted(total_deleted_record_count.item())
|
423
489
|
mat_results = []
|
424
490
|
for merge_result in merge_results:
|
425
491
|
mat_results.extend(merge_result.materialize_results)
|
@@ -466,6 +532,7 @@ def _execute_compaction(
|
|
466
532
|
record_info_msg = (
|
467
533
|
f"Hash bucket records: {total_hb_record_count},"
|
468
534
|
f" Deduped records: {total_dd_record_count}, "
|
535
|
+
f" Deleted records: {total_deleted_record_count}, "
|
469
536
|
f" Materialized records: {merged_delta.meta.record_count}"
|
470
537
|
)
|
471
538
|
logger.info(record_info_msg)
|
@@ -526,7 +593,7 @@ def _execute_compaction(
|
|
526
593
|
)
|
527
594
|
|
528
595
|
# After all incremental delta related calculations, we update
|
529
|
-
# the input sizes to
|
596
|
+
# the input sizes to accommodate the compacted table
|
530
597
|
if round_completion_info:
|
531
598
|
compaction_audit.set_input_file_count(
|
532
599
|
(compaction_audit.input_file_count or 0)
|
@@ -565,7 +632,6 @@ def _execute_compaction(
|
|
565
632
|
f"partition-{params.source_partition_locator.partition_values},"
|
566
633
|
f"compacted at: {params.last_stream_position_to_compact},"
|
567
634
|
)
|
568
|
-
|
569
635
|
return (
|
570
636
|
compacted_partition,
|
571
637
|
new_round_completion_info,
|
File without changes
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
from typing import Any, List, Optional
|
4
|
+
from deltacat.storage import DeltaType, LocalTable
|
5
|
+
from deltacat.compute.compactor import (
|
6
|
+
DeltaFileEnvelope,
|
7
|
+
)
|
8
|
+
import numpy as np
|
9
|
+
import pyarrow as pa
|
10
|
+
|
11
|
+
from deltacat.compute.compactor.model.table_object_store import (
|
12
|
+
LocalTableStorageStrategy,
|
13
|
+
LocalTableRayObjectStoreReferenceStorageStrategy,
|
14
|
+
)
|
15
|
+
|
16
|
+
|
17
|
+
class DeleteFileEnvelope(DeltaFileEnvelope):
|
18
|
+
@staticmethod
|
19
|
+
def of(
|
20
|
+
stream_position: int,
|
21
|
+
delta_type: DeltaType,
|
22
|
+
table: LocalTable,
|
23
|
+
delete_columns: List[str],
|
24
|
+
file_index: int = None,
|
25
|
+
is_src_delta: np.bool_ = True,
|
26
|
+
file_record_count: Optional[int] = None,
|
27
|
+
table_storage_strategy: LocalTableStorageStrategy = LocalTableRayObjectStoreReferenceStorageStrategy(),
|
28
|
+
) -> DeleteFileEnvelope:
|
29
|
+
"""
|
30
|
+
Static factory builder for a DeleteFileEnvelope. Subclasses from DeltaFileEnvelope
|
31
|
+
`
|
32
|
+
Args:
|
33
|
+
stream_position: Stream position of a delta.
|
34
|
+
delta_type: A delta type.
|
35
|
+
table: The table object that represents the delta file.
|
36
|
+
delete_columns: delete column_names needed for equality-based deletes,
|
37
|
+
file_index: Manifest file index number of a delta.
|
38
|
+
is_src_delta: True if this Delta File Locator is
|
39
|
+
pointing to a file from the uncompacted source table, False if
|
40
|
+
this Locator is pointing to a file in the compacted destination
|
41
|
+
table.
|
42
|
+
table_storage_strategy: The way the table object is stored in the delta file envelope. Defaults to LocalTableRayObjectStoreReferenceStorageStrategy
|
43
|
+
Returns:
|
44
|
+
A delete file envelope.
|
45
|
+
|
46
|
+
"""
|
47
|
+
delete_file_envelope = DeltaFileEnvelope.of(
|
48
|
+
stream_position,
|
49
|
+
delta_type,
|
50
|
+
table,
|
51
|
+
file_index,
|
52
|
+
is_src_delta,
|
53
|
+
file_record_count,
|
54
|
+
table_storage_strategy,
|
55
|
+
)
|
56
|
+
assert len(delete_columns) > 0, "At least 1 delete column is expected"
|
57
|
+
delete_file_envelope["delete_columns"] = delete_columns
|
58
|
+
if isinstance(table, pa.Table):
|
59
|
+
delete_file_envelope["table_size_bytes"] = table.nbytes
|
60
|
+
return DeleteFileEnvelope(**delete_file_envelope)
|
61
|
+
|
62
|
+
@property
|
63
|
+
def table_size_bytes(self) -> int:
|
64
|
+
val = self.get("table_size_bytes")
|
65
|
+
if val is not None:
|
66
|
+
return val
|
67
|
+
else:
|
68
|
+
raise ValueError(
|
69
|
+
f"Table type: {type(self.table)} not for supported for size method."
|
70
|
+
)
|
71
|
+
|
72
|
+
@property
|
73
|
+
def delete_columns(self) -> List[str]:
|
74
|
+
return self["delete_columns"]
|
75
|
+
|
76
|
+
@property
|
77
|
+
def table_reference(self) -> Optional[Any]:
|
78
|
+
if self.table_storage_strategy is not None and isinstance(
|
79
|
+
self.table_storage_strategy,
|
80
|
+
LocalTableRayObjectStoreReferenceStorageStrategy,
|
81
|
+
):
|
82
|
+
return self.table_storage_strategy.get_table_reference(self["table"])
|
83
|
+
return None
|
@@ -0,0 +1,82 @@
|
|
1
|
+
from typing import List, Optional
|
2
|
+
|
3
|
+
import pyarrow as pa
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
|
6
|
+
from typing import Tuple
|
7
|
+
from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
|
8
|
+
DeleteFileEnvelope,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
class DeleteStrategy(ABC):
|
13
|
+
"""
|
14
|
+
Encapsulates a strategy for applying row-level deletes on tables during compaction
|
15
|
+
|
16
|
+
This abstract base class defines the interface for applying delete operations
|
17
|
+
on intermediate in-memory pyarrow tables during compaction. Concrete subclasses must implement the `apply_deletes` and
|
18
|
+
`apply_many_deletes` methods, as well as the `name` property.
|
19
|
+
|
20
|
+
Example:
|
21
|
+
>>> class MyDeleteStrategy(DeleteStrategy):
|
22
|
+
... @property
|
23
|
+
... def name(self) -> str:
|
24
|
+
... return "MyDeleteStrategy"
|
25
|
+
...
|
26
|
+
... def apply_deletes(self, table: Optional[pa.Table], delete_file_envelope: DeleteFileEnvelope) -> ReturnTuple[pa.Table, int]:
|
27
|
+
... # Implement delete logic here
|
28
|
+
... pass
|
29
|
+
...
|
30
|
+
... def apply_many_deletes(self, table: Optional[pa.Table], delete_file_envelopes: List[DeleteFileEnvelope]) -> ReturnTuple[pa.Table, int]:
|
31
|
+
... # Implement delete logic here
|
32
|
+
... pass
|
33
|
+
"""
|
34
|
+
|
35
|
+
@property
|
36
|
+
def name(self) -> str:
|
37
|
+
"""
|
38
|
+
The name of the delete strategy.
|
39
|
+
"""
|
40
|
+
pass
|
41
|
+
|
42
|
+
@abstractmethod
|
43
|
+
def apply_deletes(
|
44
|
+
self,
|
45
|
+
table: Optional[pa.Table],
|
46
|
+
delete_file_envelope: DeleteFileEnvelope,
|
47
|
+
*args,
|
48
|
+
**kwargs,
|
49
|
+
) -> Tuple[pa.Table, int]:
|
50
|
+
"""
|
51
|
+
Apply delete operations on the given table using the provided delete file envelope.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
table (Optional[pa.Table]): The pyarrow table to apply deletes on.
|
55
|
+
delete_file_envelope (DeleteFileEnvelope): The delete file envelope containing delete parameters.
|
56
|
+
|
57
|
+
Returns:
|
58
|
+
Tuple[pa.Table, int]: A tuple containing the updated Arrow table after applying deletes,
|
59
|
+
and the number of rows deleted.
|
60
|
+
"""
|
61
|
+
pass
|
62
|
+
|
63
|
+
@abstractmethod
|
64
|
+
def apply_many_deletes(
|
65
|
+
self,
|
66
|
+
table: Optional[pa.Table],
|
67
|
+
delete_file_envelopes: List[DeleteFileEnvelope],
|
68
|
+
*args,
|
69
|
+
**kwargs,
|
70
|
+
) -> Tuple[pa.Table, int]:
|
71
|
+
"""
|
72
|
+
Apply delete operations on the given table using all provided delete file envelopes.
|
73
|
+
|
74
|
+
Args:
|
75
|
+
table (Optional[pa.Table]): The Arrow table to apply deletes on.
|
76
|
+
delete_file_envelopes (List[DeleteFileEnvelope]): A list of delete file envelopes containing delete parameters.
|
77
|
+
|
78
|
+
Returns:
|
79
|
+
Tuple[pa.Table, int]: A tuple containing the updated Arrow table after applying all deletes,
|
80
|
+
and the total number of rows deleted.
|
81
|
+
"""
|
82
|
+
pass
|