deltacat-1.0.2-py3-none-any.whl → deltacat-1.1.1-py3-none-any.whl
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +25 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
- deltacat/compute/compactor/model/delta_file_envelope.py +21 -3
- deltacat/compute/compactor/model/table_object_store.py +51 -0
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor_v2/compaction_session.py +80 -14
- deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
- deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +83 -0
- deltacat/compute/compactor_v2/deletes/delete_strategy.py +82 -0
- deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +161 -0
- deltacat/compute/compactor_v2/deletes/model.py +23 -0
- deltacat/compute/compactor_v2/deletes/utils.py +164 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_input.py +24 -1
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +221 -50
- deltacat/compute/compactor_v2/utils/delta.py +11 -1
- deltacat/compute/compactor_v2/utils/merge.py +10 -0
- deltacat/compute/compactor_v2/utils/task_options.py +94 -8
- deltacat/io/memcached_object_store.py +20 -0
- deltacat/io/ray_plasma_object_store.py +6 -0
- deltacat/logs.py +29 -2
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +2 -0
- deltacat/storage/model/delete_parameters.py +40 -0
- deltacat/storage/model/delta.py +25 -1
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +1930 -0
- deltacat/tests/compute/compact_partition_test_cases.py +16 -822
- deltacat/tests/compute/compactor/utils/test_io.py +4 -4
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -0
- deltacat/tests/compute/test_compact_partition_params.py +5 -0
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +32 -20
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +28 -10
- deltacat/tests/io/test_memcached_object_store.py +19 -0
- deltacat/tests/local_deltacat_storage/__init__.py +3 -0
- deltacat/tests/test_utils/constants.py +1 -2
- deltacat/tests/test_utils/pyarrow.py +27 -10
- deltacat/utils/pandas.py +1 -1
- deltacat/utils/ray_utils/runtime.py +3 -3
- deltacat/utils/resources.py +7 -5
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/METADATA +1 -1
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/RECORD +47 -38
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/LICENSE +0 -0
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/WHEEL +0 -0
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/top_level.txt +0 -0
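The most visible addition in this release is the new deltacat/compute/compactor_v2/deletes/ package listed above, which lets the v2 compactor apply equality deletes while merging. As a rough illustration only (a toy PyArrow sketch, not the DeltaCAT implementation), an equality delete drops every row of the compacted table whose key appears in the DELETE delta:

import pyarrow as pa
import pyarrow.compute as pc

# Toy data: a compacted table and the primary keys carried by an equality-delete delta.
data = pa.table({"pk": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]})
delete_keys = pa.array([2, 4])

# Drop every row whose primary key appears in the delete delta.
mask = pc.is_in(data["pk"], value_set=delete_keys)
survivors = data.filter(pc.invert(mask))
assert survivors.column("pk").to_pylist() == [1, 3]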
deltacat/compute/compactor_v2/steps/merge.py

@@ -4,13 +4,15 @@ from deltacat.compute.compactor_v2.model.merge_input import MergeInput
 import numpy as np
 import pyarrow as pa
 import ray
+import itertools
 import time
 import pyarrow.compute as pc
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
-from typing import List, Optional, Tuple
+from typing import Callable, Iterator, List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.compute.compactor import RoundCompletionInfo, DeltaFileEnvelope

@@ -69,17 +71,11 @@ def _drop_delta_type_rows(table: pa.Table, delta_type: DeltaType) -> pa.Table:


 def _build_incremental_table(
-    df_envelopes_list: List[List[DeltaFileEnvelope]],
+    df_envelopes: List[DeltaFileEnvelope],
 ) -> pa.Table:

     hb_tables = []
     # sort by delta file stream position now instead of sorting every row later
-    df_envelopes = [d for dfe_list in df_envelopes_list for d in dfe_list]
-    df_envelopes = sorted(
-        df_envelopes,
-        key=lambda df: (df.stream_position, df.file_index),
-        reverse=False,  # ascending
-    )
     is_delete = False
     for df_envelope in df_envelopes:
         assert (

@@ -96,9 +92,7 @@ def _build_incremental_table(
         )

         hb_tables.append(table)
-
     result = pa.concat_tables(hb_tables)
-
     return result


@@ -111,7 +105,7 @@ def _merge_tables(
     """
     Merges the table with compacted table dropping duplicates where necessary.

-    This method ensures the appropriate deltas of types
+    This method ensures the appropriate deltas of types [UPSERT] are correctly
     appended to the table.
     """

@@ -214,7 +208,7 @@ def _copy_all_manifest_files_from_old_hash_buckets(
     hb_index_to_indices = round_completion_info.hb_index_to_entry_range

     if hb_index_to_indices is None:
-        logger.info(
+        logger.info("Nothing to copy by reference. Skipping...")
         return []

     for hb_index in hb_index_copy_by_reference:

@@ -257,62 +251,209 @@ def _copy_all_manifest_files_from_old_hash_buckets(
     return materialize_result_list


+def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
+    """
+    Checks if the given hash bucket index has a compacted table available from the previous compaction round.
+
+    Args:
+        input (MergeInput): The input for the merge operation.
+        hb_idx (int): The hash bucket index to check.
+
+    Returns:
+        bool: True if the hash bucket index has a compacted table available, False otherwise.
+    """
+    return (
+        input.round_completion_info
+        and input.round_completion_info.hb_index_to_entry_range
+        and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
+        is not None
+    )
+
+
+def _can_copy_by_reference(
+    has_delete: bool, merge_file_group: MergeFileGroup, input: MergeInput
+) -> bool:
+    """
+    Can copy by reference only if there are no deletes to merge in
+    and previous compacted stream id matches that of new stream
+    """
+    return (
+        not has_delete
+        and not merge_file_group.dfe_groups
+        and input.round_completion_info is not None
+        and (
+            input.write_to_partition.stream_id
+            == input.round_completion_info.compacted_delta_locator.stream_id
+        )
+    )
+
+
+def _flatten_dfe_list(
+    df_envelopes_list: List[List[DeltaFileEnvelope]],
+) -> List[DeltaFileEnvelope]:
+    """
+    Flattens a list of lists of DeltaFileEnvelope objects into a single list of DeltaFileEnvelope objects.
+
+    Args:
+        df_envelopes_list (List[List[DeltaFileEnvelope]]): A list of lists of DeltaFileEnvelope objects.
+
+    Returns:
+        List[DeltaFileEnvelope]: A flattened list of DeltaFileEnvelope objects.
+    """
+    if not df_envelopes_list:
+        return []
+    return [d for dfe_list in df_envelopes_list for d in dfe_list]
+
+
+def _sort_df_envelopes(
+    df_envelopes: List[DeltaFileEnvelope],
+    key: Callable = lambda df: (df.stream_position, df.file_index),
+) -> List[DeltaFileEnvelope]:
+    """
+    Sorts a list of DeltaFileEnvelope objects based on a specified key function.
+
+    Args:
+        df_envelopes (List[DeltaFileEnvelope]): A list of DeltaFileEnvelope objects.
+        key (Callable, optional): A function that takes a DeltaFileEnvelope object and returns a key for sorting.
+            Defaults to lambda df: (df.stream_position, df.file_index).
+
+    Returns:
+        List[DeltaFileEnvelope]: A sorted list of DeltaFileEnvelope objects.
+    """
+    if not df_envelopes:
+        return []
+    return sorted(
+        df_envelopes,
+        key=key,
+        reverse=False,  # ascending
+    )
+
+
+def _group_sequence_by_delta_type(
+    df_envelopes: List[DeltaFileEnvelope],
+) -> Iterator[Tuple[List, List]]:
+    """
+    Groups a list of DeltaFileEnvelope objects by their delta_type.
+
+    Args:
+        df_envelopes (List[DeltaFileEnvelope]): A list of DeltaFileEnvelope objects.
+
+    Yields:
+        Iterator[Tuple[DeltaType, List[DeltaFileEnvelope]]]: A tuple containing the delta_type
+        and a list of DeltaFileEnvelope objects that share the same delta_type.
+    """
+    iter_df_envelopes = iter(df_envelopes)
+    for delta_type, delta_type_sequence in itertools.groupby(
+        iter_df_envelopes, lambda x: x.delta_type
+    ):
+        yield delta_type, list(delta_type_sequence)
+
+
 def _compact_tables(
-    input: MergeInput,
-
+    input: MergeInput,
+    dfe_list: Optional[List[List[DeltaFileEnvelope]]],
+    hb_idx: int,
+    compacted_table: Optional[pa.Table] = None,
+) -> Tuple[pa.Table, int, int, int]:
+    """
+    Compacts a list of DeltaFileEnvelope objects into a single PyArrow table.
+
+    Args:
+        input (MergeInput): The input for the merge operation.
+        dfe_list (List[List[DeltaFileEnvelope]]): A list of lists of DeltaFileEnvelope objects.
+        hb_idx (int): The hash bucket index for the compaction.
+
+    Returns:
+        Tuple[pa.Table, int, int, int]: A tuple containing:
+        1. The compacted PyArrow table.
+        2. The total number of records in the incremental data.
+        3. The total number of deduplicated records.
+        4. The total number of deleted records due to DELETE operations.
+    """
+    df_envelopes: List[DeltaFileEnvelope] = _flatten_dfe_list(dfe_list)
+    delete_file_envelopes = input.delete_file_envelopes or []
+    reordered_all_dfes: List[DeltaFileEnvelope] = _sort_df_envelopes(
+        delete_file_envelopes + df_envelopes
+    )
+    assert all(
+        dfe.delta_type in (DeltaType.UPSERT, DeltaType.DELETE)
+        for dfe in reordered_all_dfes
+    ), "All reordered delta file envelopes must be of the UPSERT or DELETE"
+    table = compacted_table
+    aggregated_incremental_len = 0
+    aggregated_deduped_records = 0
+    aggregated_dropped_records = 0
+    for i, (delta_type, delta_type_sequence) in enumerate(
+        _group_sequence_by_delta_type(reordered_all_dfes)
+    ):
+        if delta_type is DeltaType.UPSERT:
+            (
+                table,
+                incremental_len,
+                deduped_records,
+                merge_time,
+            ) = _apply_upserts(input, delta_type_sequence, hb_idx, table)
+            logger.info(
+                f" [Merge task index {input.merge_task_index}] Merged"
+                f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
+            )
+            aggregated_incremental_len += incremental_len
+            aggregated_deduped_records += deduped_records
+        elif delta_type is DeltaType.DELETE:
+            table_size_before_delete = len(table) if table else 0
+            (table, dropped_rows), delete_time = timed_invocation(
+                func=input.delete_strategy.apply_many_deletes,
+                table=table,
+                delete_file_envelopes=delta_type_sequence,
+            )
+            logger.info(
+                f" [Merge task index {input.merge_task_index}]"
+                + f" Dropped record count: {dropped_rows} from table"
+                + f" of record count {table_size_before_delete} took: {delete_time}s"
+            )
+            aggregated_dropped_records += dropped_rows
+    return (
+        table,
+        aggregated_incremental_len,
+        aggregated_deduped_records,
+        aggregated_dropped_records,
+    )
+
+
+def _apply_upserts(
+    input: MergeInput,
+    dfe_list: List[DeltaFileEnvelope],
+    hb_idx,
+    prev_table=None,
+) -> Tuple[pa.Table, int, int, int]:
+    assert all(
+        dfe.delta_type is DeltaType.UPSERT for dfe in dfe_list
+    ), "All incoming delta file envelopes must of the DeltaType.UPSERT"
     logger.info(
         f"[Hash bucket index {hb_idx}] Reading dedupe input for "
         f"{len(dfe_list)} delta file envelope lists..."
     )
     table = _build_incremental_table(dfe_list)
-
     incremental_len = len(table)
     logger.info(
         f"[Hash bucket index {hb_idx}] Got the incremental table of length {incremental_len}"
     )
-
     if input.sort_keys:
         # Incremental is sorted and merged, as sorting
         # on non event based sort key does not produce consistent
         # compaction results. E.g., compaction(delta1, delta2, delta3)
         # will not be equal to compaction(compaction(delta1, delta2), delta3).
         table = table.sort_by(input.sort_keys)
-
-    compacted_table = None
-
-    if (
-        input.round_completion_info
-        and input.round_completion_info.hb_index_to_entry_range
-        and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
-        is not None
-    ):
-        compacted_table = _download_compacted_table(
-            hb_index=hb_idx,
-            rcf=input.round_completion_info,
-            read_kwargs_provider=input.read_kwargs_provider,
-            deltacat_storage=input.deltacat_storage,
-            deltacat_storage_kwargs=input.deltacat_storage_kwargs,
-        )
-
-    hb_table_record_count = len(table) + (
-        len(compacted_table) if compacted_table else 0
-    )
-
+    hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
     table, merge_time = timed_invocation(
         func=_merge_tables,
         table=table,
         primary_keys=input.primary_keys,
         can_drop_duplicates=input.drop_duplicates,
-        compacted_table=compacted_table,
+        compacted_table=prev_table,
     )
-
-
-    logger.info(
-        f"[Merge task index {input.merge_task_index}] Merged "
-        f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
-    )
-
-    return table, incremental_len, total_deduped_records
+    deduped_records = hb_table_record_count - len(table)
+    return table, incremental_len, deduped_records, merge_time


 def _copy_manifests_from_hash_bucketing(

@@ -345,20 +486,47 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         f"merge_{worker_id}_{task_id}.bin"
     ) if input.enable_profiler else nullcontext():
         total_input_records, total_deduped_records = 0, 0
+        total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
         merge_file_groups = input.merge_file_groups_provider.create()
        hb_index_copy_by_ref_ids = []

         for merge_file_group in merge_file_groups:
-
+            compacted_table = None
+            has_delete = input.delete_file_envelopes is not None
+            if has_delete:
+                assert (
+                    input.delete_strategy is not None
+                ), "Merge input missing delete_strategy"
+            if _can_copy_by_reference(
+                has_delete=has_delete, merge_file_group=merge_file_group, input=input
+            ):
                 hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
                 continue

-
-
+            if _has_previous_compacted_table(input, merge_file_group.hb_index):
+                compacted_table = _download_compacted_table(
+                    hb_index=merge_file_group.hb_index,
+                    rcf=input.round_completion_info,
+                    read_kwargs_provider=input.read_kwargs_provider,
+                    deltacat_storage=input.deltacat_storage,
+                    deltacat_storage_kwargs=input.deltacat_storage_kwargs,
+                )
+            if not merge_file_group.dfe_groups and compacted_table is None:
+                logger.warning(
+                    f" [Hash bucket index {merge_file_group.hb_index}]"
+                    + f" No new deltas and no compacted table found. Skipping compaction for {merge_file_group.hb_index}"
+                )
+                continue
+            table, input_records, deduped_records, dropped_records = _compact_tables(
+                input,
+                merge_file_group.dfe_groups,
+                merge_file_group.hb_index,
+                compacted_table,
             )
             total_input_records += input_records
             total_deduped_records += deduped_records
+            total_dropped_records += dropped_records
             materialized_results.append(
                 merge_utils.materialize(input, merge_file_group.hb_index, [table])
             )

@@ -382,6 +550,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         materialized_results,
         np.int64(total_input_records),
         np.int64(total_deduped_records),
+        np.int64(total_dropped_records),
         np.double(peak_memory_usage_bytes),
         np.double(0.0),
         np.double(time.time()),

@@ -400,7 +569,8 @@ def merge(input: MergeInput) -> MergeResult:
             f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
         )

-
+    if input.memory_logs_enabled:
+        process_util.schedule_callback(log_peak_memory, 10)

     merge_result, duration = timed_invocation(func=_timed_merge, input=input)

@@ -420,6 +590,7 @@ def merge(input: MergeInput) -> MergeResult:
         merge_result[1],
         merge_result[2],
         merge_result[3],
+        merge_result[4],
         np.double(emit_metrics_time),
         merge_result[4],
     )
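The reworked merge step above interleaves DELETE envelopes with the incremental UPSERT envelopes, orders everything by (stream_position, file_index), and then processes contiguous runs of the same delta type: _apply_upserts for UPSERT runs, delete_strategy.apply_many_deletes for DELETE runs. A minimal, self-contained sketch of that ordering-and-grouping step, using a namedtuple as a stand-in for DeltaFileEnvelope (the real envelopes also carry table payloads):

import itertools
from collections import namedtuple

# Stand-in for DeltaFileEnvelope with only the fields the grouping logic needs.
Envelope = namedtuple("Envelope", ["stream_position", "file_index", "delta_type"])

def group_runs_by_delta_type(envelopes):
    # Sort ascending by (stream_position, file_index), then yield contiguous
    # runs sharing a delta type, mirroring _sort_df_envelopes and
    # _group_sequence_by_delta_type in the diff above.
    ordered = sorted(envelopes, key=lambda e: (e.stream_position, e.file_index))
    for delta_type, run in itertools.groupby(ordered, key=lambda e: e.delta_type):
        yield delta_type, list(run)

envelopes = [
    Envelope(2, 0, "DELETE"),
    Envelope(1, 0, "UPSERT"),
    Envelope(1, 1, "UPSERT"),
    Envelope(3, 0, "UPSERT"),
]
runs = [(t, [(e.stream_position, e.file_index) for e in run])
        for t, run in group_runs_by_delta_type(envelopes)]
assert runs == [
    ("UPSERT", [(1, 0), (1, 1)]),
    ("DELETE", [(2, 0)]),
    ("UPSERT", [(3, 0)]),
]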
deltacat/compute/compactor_v2/utils/delta.py

@@ -5,7 +5,10 @@ from deltacat.compute.compactor import (
     DeltaAnnotated,
     DeltaFileEnvelope,
 )
-
+from deltacat.storage import (
+    Delta,
+)
+from deltacat.storage.model.delta import DeltaType
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import StorageType
 from deltacat.utils.common import ReadKwargsProvider

@@ -18,6 +21,13 @@ import logging
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+def contains_delete_deltas(deltas: List[Delta]) -> bool:
+    for delta in deltas:
+        if delta.type is DeltaType.DELETE:
+            return True
+    return False
+
+
 def read_delta_file_envelopes(
     annotated_delta: DeltaAnnotated,
     read_kwargs_provider: Optional[ReadKwargsProvider],
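contains_delete_deltas above is an early-exit scan that lets the caller decide whether any delete handling is needed for a set of input deltas. An equivalent formulation with any(), using toy stand-ins for Delta and DeltaType so the snippet runs on its own:

from enum import Enum
from typing import List, NamedTuple

class DeltaType(Enum):  # stand-in for deltacat.storage.model.delta.DeltaType
    UPSERT = "upsert"
    DELETE = "delete"

class Delta(NamedTuple):  # stand-in carrying only the field the helper inspects
    type: DeltaType

def contains_delete_deltas(deltas: List[Delta]) -> bool:
    # Same result as the loop in the diff above, written as an any() scan.
    return any(delta.type is DeltaType.DELETE for delta in deltas)

assert contains_delete_deltas([Delta(DeltaType.UPSERT), Delta(DeltaType.DELETE)])
assert not contains_delete_deltas([Delta(DeltaType.UPSERT)])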
deltacat/compute/compactor_v2/utils/merge.py

@@ -25,6 +25,12 @@ from deltacat.utils.performance import timed_invocation
 from deltacat.storage import (
     Partition,
 )
+from deltacat.compute.compactor_v2.deletes.delete_strategy import (
+    DeleteStrategy,
+)
+from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+    DeleteFileEnvelope,
+)


 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -86,6 +92,8 @@ def generate_local_merge_input(
     annotated_deltas: List[DeltaAnnotated],
     compacted_partition: Partition,
     round_completion_info: Optional[RoundCompletionInfo],
+    delete_strategy: Optional[DeleteStrategy] = None,
+    delete_file_envelopes: Optional[DeleteFileEnvelope] = None,
 ):
     """
     Generates a merge input for local deltas that do not reside in the Ray object store and

@@ -123,4 +131,6 @@
         object_store=params.object_store,
         deltacat_storage=params.deltacat_storage,
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        delete_strategy=delete_strategy,
+        delete_file_envelopes=delete_file_envelopes,
     )
deltacat/compute/compactor_v2/utils/task_options.py

@@ -1,7 +1,10 @@
 import botocore
 import logging
-from typing import Dict, Optional, List, Tuple
+from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    LocalMergeFileGroupsProvider,
+)
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.storage import (

@@ -15,7 +18,6 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
     hash_group_index_to_hash_bucket_indices,
 )
 from deltacat.compute.compactor_v2.constants import (
-    TOTAL_MEMORY_BUFFER_PERCENTAGE,
     PARQUET_TO_PYARROW_INFLATION,
 )

@@ -133,8 +135,10 @@ def hash_bucket_resource_options_provider(
     item: DeltaAnnotated,
     previous_inflation: float,
     average_record_size_bytes: float,
+    total_memory_buffer_percentage: int,
     primary_keys: List[str] = None,
     ray_custom_resources: Optional[Dict] = None,
+    memory_logs_enabled: Optional[bool] = None,
     **kwargs,
 ) -> Dict:
     debug_memory_params = {"hash_bucket_task_index": index}

@@ -189,10 +193,11 @@ def hash_bucket_resource_options_provider(
     debug_memory_params["average_record_size_bytes"] = average_record_size_bytes

     # Consider buffer
-    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+    total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
     debug_memory_params["total_memory_with_buffer"] = total_memory
-    logger.
-        f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
+    logger.debug_conditional(
+        f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}",
+        memory_logs_enabled,
     )

     return get_task_options(0.01, total_memory, ray_custom_resources)

@@ -204,12 +209,14 @@ def merge_resource_options_provider(
     num_hash_groups: int,
     hash_group_size_bytes: Dict[int, int],
     hash_group_num_rows: Dict[int, int],
+    total_memory_buffer_percentage: int,
     round_completion_info: Optional[RoundCompletionInfo] = None,
     compacted_delta_manifest: Optional[Manifest] = None,
     ray_custom_resources: Optional[Dict] = None,
     primary_keys: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict] = {},
+    memory_logs_enabled: Optional[bool] = None,
     **kwargs,
 ) -> Dict:
     debug_memory_params = {"merge_task_index": index}

@@ -224,6 +231,84 @@ def merge_resource_options_provider(
     pk_size_bytes = data_size
     incremental_index_array_size = num_rows * 4

+    return get_merge_task_options(
+        index,
+        hb_group_idx,
+        data_size,
+        pk_size_bytes,
+        num_rows,
+        num_hash_groups,
+        total_memory_buffer_percentage,
+        incremental_index_array_size,
+        debug_memory_params,
+        ray_custom_resources,
+        round_completion_info=round_completion_info,
+        compacted_delta_manifest=compacted_delta_manifest,
+        primary_keys=primary_keys,
+        deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        memory_logs_enabled=memory_logs_enabled,
+    )
+
+
+def local_merge_resource_options_provider(
+    estimated_da_size: float,
+    estimated_num_rows: int,
+    total_memory_buffer_percentage: int,
+    round_completion_info: Optional[RoundCompletionInfo] = None,
+    compacted_delta_manifest: Optional[Manifest] = None,
+    ray_custom_resources: Optional[Dict] = None,
+    primary_keys: Optional[List[str]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict] = {},
+    memory_logs_enabled: Optional[bool] = None,
+    **kwargs,
+) -> Dict:
+    index = hb_group_idx = LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX
+    debug_memory_params = {"merge_task_index": index}
+
+    # upper bound for pk size of incremental
+    pk_size_bytes = estimated_da_size
+    incremental_index_array_size = estimated_num_rows * 4
+
+    return get_merge_task_options(
+        index=index,
+        hb_group_idx=hb_group_idx,
+        data_size=estimated_da_size,
+        pk_size_bytes=pk_size_bytes,
+        num_rows=estimated_num_rows,
+        num_hash_groups=1,
+        incremental_index_array_size=incremental_index_array_size,
+        total_memory_buffer_percentage=total_memory_buffer_percentage,
+        debug_memory_params=debug_memory_params,
+        ray_custom_resources=ray_custom_resources,
+        round_completion_info=round_completion_info,
+        compacted_delta_manifest=compacted_delta_manifest,
+        primary_keys=primary_keys,
+        deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        memory_logs_enabled=memory_logs_enabled,
+    )
+
+
+def get_merge_task_options(
+    index: int,
+    hb_group_idx: int,
+    data_size: float,
+    pk_size_bytes: float,
+    num_rows: int,
+    num_hash_groups: int,
+    total_memory_buffer_percentage: int,
+    incremental_index_array_size: int,
+    debug_memory_params: Dict[str, Any],
+    ray_custom_resources: Optional[Dict],
+    round_completion_info: Optional[RoundCompletionInfo] = None,
+    compacted_delta_manifest: Optional[Manifest] = None,
+    primary_keys: Optional[List[str]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict] = {},
+    memory_logs_enabled: Optional[bool] = None,
+) -> Dict[str, Any]:
     if (
         round_completion_info
         and compacted_delta_manifest

@@ -296,10 +381,11 @@ def merge_resource_options_provider(
     debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
     debug_memory_params["total_memory"] = total_memory

-    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+    total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
     debug_memory_params["total_memory_with_buffer"] = total_memory
-    logger.
-        f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
+    logger.debug_conditional(
+        f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
+        memory_logs_enabled,
     )

     return get_task_options(0.01, total_memory, ray_custom_resources)
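Both resource-option providers now receive total_memory_buffer_percentage as a parameter instead of reading the removed TOTAL_MEMORY_BUFFER_PERCENTAGE constant, and apply it as a plain percentage markup on the estimated task memory. The arithmetic in isolation (the 8 GiB figure is made up for the example):

GiB = 1024**3

def with_memory_buffer(total_memory_bytes: float, buffer_percentage: int) -> float:
    # Same buffering arithmetic as the task-options providers above.
    return total_memory_bytes * (1 + buffer_percentage / 100.0)

# A 25% buffer turns an 8 GiB estimate into a 10 GiB task memory request.
assert with_memory_buffer(8 * GiB, 25) == 10 * GiB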
deltacat/io/memcached_object_store.py

@@ -181,15 +181,35 @@ class MemcachedObjectStore(IObjectStore):
         for chunk_index in range(chunk_count):
             ref = self._create_ref(uid, ip, chunk_index)
             chunk = client.get(ref)
+            if chunk is None:
+                raise ValueError(
+                    f"Expected uid: {uid}, chunk index: {chunk_index} from client ip: {ip}"
+                    f" to be non-empty."
+                )
             serialized.extend(chunk)

         return cloudpickle.loads(serialized)

+    def clear(self) -> bool:
+        flushed = all(
+            [
+                self._get_client_by_ip(ip).flush_all(noreply=False)
+                for ip in self.storage_node_ips
+            ]
+        )
+        self.client_cache.clear()
+
+        if flushed:
+            logger.info("Successfully cleared cache contents.")
+
+        return flushed
+
     def close(self) -> None:
         for client in self.client_cache.values():
             client.close()

         self.client_cache.clear()
+        logger.info("Successfully closed object store clients.")

     def _create_ref(self, uid, ip, chunk_index) -> str:
         return f"{uid}{self.SEPARATOR}{ip}{self.SEPARATOR}{chunk_index}"
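The new MemcachedObjectStore.clear() reports success only when every storage node acknowledges its flush, and it drops the cached client connections either way. The all-or-nothing flush pattern on its own, with a fake client standing in for pymemcache so the snippet runs without a memcached cluster:

from typing import Iterable

class FakeClient:
    # Minimal stand-in for a pymemcache client; flush_all(noreply=False)
    # returns whether the node acknowledged the flush.
    def __init__(self, ok: bool = True):
        self.ok = ok

    def flush_all(self, noreply: bool = False) -> bool:
        return self.ok

def clear_all(clients: Iterable[FakeClient]) -> bool:
    # Mirrors clear(): success only if every node flushed successfully.
    return all(client.flush_all(noreply=False) for client in clients)

assert clear_all([FakeClient(), FakeClient()])
assert not clear_all([FakeClient(), FakeClient(ok=False)])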
deltacat/io/ray_plasma_object_store.py

@@ -2,6 +2,7 @@ import ray
 from ray import cloudpickle
 from deltacat.io.object_store import IObjectStore
 from typing import Any, List
+from ray.types import ObjectRef


 class RayPlasmaObjectStore(IObjectStore):

@@ -21,3 +22,8 @@ class RayPlasmaObjectStore(IObjectStore):
     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
         loaded_refs = [cloudpickle.loads(obj_id) for obj_id in refs]
         return ray.get(loaded_refs)
+
+    def deserialize_references(
+        self, refs: List[Any], *args, **kwargs
+    ) -> List[ObjectRef]:
+        return [cloudpickle.loads(obj_id) for obj_id in refs]
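get_many unpickles each stored reference and immediately materializes it with ray.get, while the new deserialize_references returns the unpickled ObjectRef objects without fetching any data, leaving it to the caller to resolve them later. A usage sketch under the assumption that refs are cloudpickled ObjectRefs (which is what get_many implies) and that a local Ray runtime is available:

import ray
from ray import cloudpickle

ray.init(ignore_reinit_error=True)

# Store a value and serialize its ObjectRef the way the object store appears to.
ref_bytes = cloudpickle.dumps(ray.put({"hello": "world"}))

object_ref = cloudpickle.loads(ref_bytes)  # deserialize_references path: no data fetch
value = ray.get(object_ref)                # get_many path: materializes the object
assert value == {"hello": "world"}

ray.shutdown()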
|