deltacat 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +25 -0
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
  4. deltacat/compute/compactor/model/delta_file_envelope.py +21 -3
  5. deltacat/compute/compactor/model/table_object_store.py +51 -0
  6. deltacat/compute/compactor/utils/io.py +1 -1
  7. deltacat/compute/compactor_v2/compaction_session.py +80 -14
  8. deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  9. deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +83 -0
  10. deltacat/compute/compactor_v2/deletes/delete_strategy.py +82 -0
  11. deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +161 -0
  12. deltacat/compute/compactor_v2/deletes/model.py +23 -0
  13. deltacat/compute/compactor_v2/deletes/utils.py +164 -0
  14. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  15. deltacat/compute/compactor_v2/model/merge_input.py +24 -1
  16. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  17. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -6
  18. deltacat/compute/compactor_v2/steps/merge.py +221 -50
  19. deltacat/compute/compactor_v2/utils/delta.py +11 -1
  20. deltacat/compute/compactor_v2/utils/merge.py +10 -0
  21. deltacat/compute/compactor_v2/utils/task_options.py +94 -8
  22. deltacat/io/memcached_object_store.py +20 -0
  23. deltacat/io/ray_plasma_object_store.py +6 -0
  24. deltacat/logs.py +29 -2
  25. deltacat/storage/__init__.py +3 -0
  26. deltacat/storage/interface.py +2 -0
  27. deltacat/storage/model/delete_parameters.py +40 -0
  28. deltacat/storage/model/delta.py +25 -1
  29. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +1930 -0
  30. deltacat/tests/compute/compact_partition_test_cases.py +16 -822
  31. deltacat/tests/compute/compactor/utils/test_io.py +4 -4
  32. deltacat/tests/compute/test_compact_partition_incremental.py +4 -0
  33. deltacat/tests/compute/test_compact_partition_params.py +5 -0
  34. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +32 -20
  35. deltacat/tests/compute/test_util_create_table_deltas_repo.py +28 -10
  36. deltacat/tests/io/test_memcached_object_store.py +19 -0
  37. deltacat/tests/local_deltacat_storage/__init__.py +3 -0
  38. deltacat/tests/test_utils/constants.py +1 -2
  39. deltacat/tests/test_utils/pyarrow.py +27 -10
  40. deltacat/utils/pandas.py +1 -1
  41. deltacat/utils/ray_utils/runtime.py +3 -3
  42. deltacat/utils/resources.py +7 -5
  43. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/METADATA +1 -1
  44. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/RECORD +47 -38
  45. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/LICENSE +0 -0
  46. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/WHEEL +0 -0
  47. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/top_level.txt +0 -0
@@ -4,13 +4,15 @@ from deltacat.compute.compactor_v2.model.merge_input import MergeInput
  import numpy as np
  import pyarrow as pa
  import ray
+ import itertools
  import time
  import pyarrow.compute as pc
  import deltacat.compute.compactor_v2.utils.merge as merge_utils
  from uuid import uuid4
  from deltacat import logs
- from typing import List, Optional, Tuple
+ from typing import Callable, Iterator, List, Optional, Tuple
  from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+ from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
  from deltacat.compute.compactor import RoundCompletionInfo, DeltaFileEnvelope
@@ -69,17 +71,11 @@ def _drop_delta_type_rows(table: pa.Table, delta_type: DeltaType) -> pa.Table:


  def _build_incremental_table(
-     df_envelopes_list: List[List[DeltaFileEnvelope]],
+     df_envelopes: List[DeltaFileEnvelope],
  ) -> pa.Table:

      hb_tables = []
      # sort by delta file stream position now instead of sorting every row later
-     df_envelopes = [d for dfe_list in df_envelopes_list for d in dfe_list]
-     df_envelopes = sorted(
-         df_envelopes,
-         key=lambda df: (df.stream_position, df.file_index),
-         reverse=False,  # ascending
-     )
      is_delete = False
      for df_envelope in df_envelopes:
          assert (
@@ -96,9 +92,7 @@ def _build_incremental_table(
          )

          hb_tables.append(table)
-
      result = pa.concat_tables(hb_tables)
-
      return result


@@ -111,7 +105,7 @@ def _merge_tables(
      """
      Merges the table with compacted table dropping duplicates where necessary.

-     This method ensures the appropriate deltas of types DELETE/UPSERT are correctly
+     This method ensures the appropriate deltas of types [UPSERT] are correctly
      appended to the table.
      """

@@ -214,7 +208,7 @@ def _copy_all_manifest_files_from_old_hash_buckets(
      hb_index_to_indices = round_completion_info.hb_index_to_entry_range

      if hb_index_to_indices is None:
-         logger.info(f"Nothing to copy by reference. Skipping...")
+         logger.info("Nothing to copy by reference. Skipping...")
          return []

      for hb_index in hb_index_copy_by_reference:
@@ -257,62 +251,209 @@ def _copy_all_manifest_files_from_old_hash_buckets(
      return materialize_result_list


+ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
+     """
+     Checks if the given hash bucket index has a compacted table available from the previous compaction round.
+
+     Args:
+         input (MergeInput): The input for the merge operation.
+         hb_idx (int): The hash bucket index to check.
+
+     Returns:
+         bool: True if the hash bucket index has a compacted table available, False otherwise.
+     """
+     return (
+         input.round_completion_info
+         and input.round_completion_info.hb_index_to_entry_range
+         and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
+         is not None
+     )
+
+
+ def _can_copy_by_reference(
+     has_delete: bool, merge_file_group: MergeFileGroup, input: MergeInput
+ ) -> bool:
+     """
+     Can copy by reference only if there are no deletes to merge in
+     and the previous compacted stream id matches that of the new stream.
+     """
+     return (
+         not has_delete
+         and not merge_file_group.dfe_groups
+         and input.round_completion_info is not None
+         and (
+             input.write_to_partition.stream_id
+             == input.round_completion_info.compacted_delta_locator.stream_id
+         )
+     )
+
+
+ def _flatten_dfe_list(
+     df_envelopes_list: List[List[DeltaFileEnvelope]],
+ ) -> List[DeltaFileEnvelope]:
+     """
+     Flattens a list of lists of DeltaFileEnvelope objects into a single list of DeltaFileEnvelope objects.
+
+     Args:
+         df_envelopes_list (List[List[DeltaFileEnvelope]]): A list of lists of DeltaFileEnvelope objects.
+
+     Returns:
+         List[DeltaFileEnvelope]: A flattened list of DeltaFileEnvelope objects.
+     """
+     if not df_envelopes_list:
+         return []
+     return [d for dfe_list in df_envelopes_list for d in dfe_list]
+
+
+ def _sort_df_envelopes(
+     df_envelopes: List[DeltaFileEnvelope],
+     key: Callable = lambda df: (df.stream_position, df.file_index),
+ ) -> List[DeltaFileEnvelope]:
+     """
+     Sorts a list of DeltaFileEnvelope objects based on a specified key function.
+
+     Args:
+         df_envelopes (List[DeltaFileEnvelope]): A list of DeltaFileEnvelope objects.
+         key (Callable, optional): A function that takes a DeltaFileEnvelope object and returns a key for sorting.
+             Defaults to lambda df: (df.stream_position, df.file_index).
+
+     Returns:
+         List[DeltaFileEnvelope]: A sorted list of DeltaFileEnvelope objects.
+     """
+     if not df_envelopes:
+         return []
+     return sorted(
+         df_envelopes,
+         key=key,
+         reverse=False,  # ascending
+     )
+
+
+ def _group_sequence_by_delta_type(
+     df_envelopes: List[DeltaFileEnvelope],
+ ) -> Iterator[Tuple[List, List]]:
+     """
+     Groups a list of DeltaFileEnvelope objects by their delta_type.
+
+     Args:
+         df_envelopes (List[DeltaFileEnvelope]): A list of DeltaFileEnvelope objects.
+
+     Yields:
+         Iterator[Tuple[DeltaType, List[DeltaFileEnvelope]]]: A tuple containing the delta_type
+             and a list of DeltaFileEnvelope objects that share the same delta_type.
+     """
+     iter_df_envelopes = iter(df_envelopes)
+     for delta_type, delta_type_sequence in itertools.groupby(
+         iter_df_envelopes, lambda x: x.delta_type
+     ):
+         yield delta_type, list(delta_type_sequence)
+
+
  def _compact_tables(
-     input: MergeInput, dfe_list: List[List[DeltaFileEnvelope]], hb_idx: int
- ) -> Tuple[pa.Table, int, int]:
+     input: MergeInput,
+     dfe_list: Optional[List[List[DeltaFileEnvelope]]],
+     hb_idx: int,
+     compacted_table: Optional[pa.Table] = None,
+ ) -> Tuple[pa.Table, int, int, int]:
+     """
+     Compacts a list of DeltaFileEnvelope objects into a single PyArrow table.
+
+     Args:
+         input (MergeInput): The input for the merge operation.
+         dfe_list (List[List[DeltaFileEnvelope]]): A list of lists of DeltaFileEnvelope objects.
+         hb_idx (int): The hash bucket index for the compaction.
+
+     Returns:
+         Tuple[pa.Table, int, int, int]: A tuple containing:
+             1. The compacted PyArrow table.
+             2. The total number of records in the incremental data.
+             3. The total number of deduplicated records.
+             4. The total number of deleted records due to DELETE operations.
+     """
+     df_envelopes: List[DeltaFileEnvelope] = _flatten_dfe_list(dfe_list)
+     delete_file_envelopes = input.delete_file_envelopes or []
+     reordered_all_dfes: List[DeltaFileEnvelope] = _sort_df_envelopes(
+         delete_file_envelopes + df_envelopes
+     )
+     assert all(
+         dfe.delta_type in (DeltaType.UPSERT, DeltaType.DELETE)
+         for dfe in reordered_all_dfes
+     ), "All reordered delta file envelopes must be of type UPSERT or DELETE"
+     table = compacted_table
+     aggregated_incremental_len = 0
+     aggregated_deduped_records = 0
+     aggregated_dropped_records = 0
+     for i, (delta_type, delta_type_sequence) in enumerate(
+         _group_sequence_by_delta_type(reordered_all_dfes)
+     ):
+         if delta_type is DeltaType.UPSERT:
+             (
+                 table,
+                 incremental_len,
+                 deduped_records,
+                 merge_time,
+             ) = _apply_upserts(input, delta_type_sequence, hb_idx, table)
+             logger.info(
+                 f" [Merge task index {input.merge_task_index}] Merged"
+                 f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
+             )
+             aggregated_incremental_len += incremental_len
+             aggregated_deduped_records += deduped_records
+         elif delta_type is DeltaType.DELETE:
+             table_size_before_delete = len(table) if table else 0
+             (table, dropped_rows), delete_time = timed_invocation(
+                 func=input.delete_strategy.apply_many_deletes,
+                 table=table,
+                 delete_file_envelopes=delta_type_sequence,
+             )
+             logger.info(
+                 f" [Merge task index {input.merge_task_index}]"
+                 + f" Dropped record count: {dropped_rows} from table"
+                 + f" of record count {table_size_before_delete} took: {delete_time}s"
+             )
+             aggregated_dropped_records += dropped_rows
+     return (
+         table,
+         aggregated_incremental_len,
+         aggregated_deduped_records,
+         aggregated_dropped_records,
+     )
+
+
+ def _apply_upserts(
+     input: MergeInput,
+     dfe_list: List[DeltaFileEnvelope],
+     hb_idx,
+     prev_table=None,
+ ) -> Tuple[pa.Table, int, int, int]:
+     assert all(
+         dfe.delta_type is DeltaType.UPSERT for dfe in dfe_list
+     ), "All incoming delta file envelopes must be of DeltaType.UPSERT"
      logger.info(
          f"[Hash bucket index {hb_idx}] Reading dedupe input for "
          f"{len(dfe_list)} delta file envelope lists..."
      )
      table = _build_incremental_table(dfe_list)
-
      incremental_len = len(table)
      logger.info(
          f"[Hash bucket index {hb_idx}] Got the incremental table of length {incremental_len}"
      )
-
      if input.sort_keys:
          # Incremental is sorted and merged, as sorting
          # on non event based sort key does not produce consistent
          # compaction results. E.g., compaction(delta1, delta2, delta3)
          # will not be equal to compaction(compaction(delta1, delta2), delta3).
          table = table.sort_by(input.sort_keys)
-
-     compacted_table = None
-
-     if (
-         input.round_completion_info
-         and input.round_completion_info.hb_index_to_entry_range
-         and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
-         is not None
-     ):
-         compacted_table = _download_compacted_table(
-             hb_index=hb_idx,
-             rcf=input.round_completion_info,
-             read_kwargs_provider=input.read_kwargs_provider,
-             deltacat_storage=input.deltacat_storage,
-             deltacat_storage_kwargs=input.deltacat_storage_kwargs,
-         )
-
-     hb_table_record_count = len(table) + (
-         len(compacted_table) if compacted_table else 0
-     )
-
+     hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
      table, merge_time = timed_invocation(
          func=_merge_tables,
          table=table,
          primary_keys=input.primary_keys,
          can_drop_duplicates=input.drop_duplicates,
-         compacted_table=compacted_table,
+         compacted_table=prev_table,
      )
-     total_deduped_records = hb_table_record_count - len(table)
-
-     logger.info(
-         f"[Merge task index {input.merge_task_index}] Merged "
-         f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
-     )
-
-     return table, incremental_len, total_deduped_records
+     deduped_records = hb_table_record_count - len(table)
+     return table, incremental_len, deduped_records, merge_time


  def _copy_manifests_from_hash_bucketing(
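Note: the rewritten _compact_tables above first sorts the combined delete and upsert envelopes by (stream_position, file_index) and then applies each contiguous run of a single delta type in order. A minimal sketch of that sort-then-group behavior, using a hypothetical FakeEnvelope namedtuple in place of the real DeltaFileEnvelope:

# Illustrative sketch only; FakeEnvelope stands in for DeltaFileEnvelope.
import itertools
from collections import namedtuple

FakeEnvelope = namedtuple("FakeEnvelope", ["stream_position", "file_index", "delta_type"])

envelopes = [
    FakeEnvelope(3, 0, "DELETE"),
    FakeEnvelope(1, 0, "UPSERT"),
    FakeEnvelope(2, 0, "UPSERT"),
    FakeEnvelope(4, 0, "UPSERT"),
]

# Sort by (stream_position, file_index), mirroring _sort_df_envelopes.
ordered = sorted(envelopes, key=lambda e: (e.stream_position, e.file_index))

# Group contiguous runs by delta_type, mirroring _group_sequence_by_delta_type.
for delta_type, run in itertools.groupby(ordered, key=lambda e: e.delta_type):
    print(delta_type, [e.stream_position for e in run])
# UPSERT [1, 2]
# DELETE [3]
# UPSERT [4]

Grouping contiguous runs, rather than collecting all deletes into one batch, is what keeps a delete ordered between the upserts that precede and follow it in stream order.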
@@ -345,20 +486,47 @@ def _timed_merge(input: MergeInput) -> MergeResult:
          f"merge_{worker_id}_{task_id}.bin"
      ) if input.enable_profiler else nullcontext():
          total_input_records, total_deduped_records = 0, 0
+         total_dropped_records = 0
          materialized_results: List[MaterializeResult] = []
          merge_file_groups = input.merge_file_groups_provider.create()
          hb_index_copy_by_ref_ids = []

          for merge_file_group in merge_file_groups:
-             if not merge_file_group.dfe_groups:
+             compacted_table = None
+             has_delete = input.delete_file_envelopes is not None
+             if has_delete:
+                 assert (
+                     input.delete_strategy is not None
+                 ), "Merge input missing delete_strategy"
+             if _can_copy_by_reference(
+                 has_delete=has_delete, merge_file_group=merge_file_group, input=input
+             ):
                  hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
                  continue

-             table, input_records, deduped_records = _compact_tables(
-                 input, merge_file_group.dfe_groups, merge_file_group.hb_index
+             if _has_previous_compacted_table(input, merge_file_group.hb_index):
+                 compacted_table = _download_compacted_table(
+                     hb_index=merge_file_group.hb_index,
+                     rcf=input.round_completion_info,
+                     read_kwargs_provider=input.read_kwargs_provider,
+                     deltacat_storage=input.deltacat_storage,
+                     deltacat_storage_kwargs=input.deltacat_storage_kwargs,
+                 )
+             if not merge_file_group.dfe_groups and compacted_table is None:
+                 logger.warning(
+                     f" [Hash bucket index {merge_file_group.hb_index}]"
+                     + f" No new deltas and no compacted table found. Skipping compaction for {merge_file_group.hb_index}"
+                 )
+                 continue
+             table, input_records, deduped_records, dropped_records = _compact_tables(
+                 input,
+                 merge_file_group.dfe_groups,
+                 merge_file_group.hb_index,
+                 compacted_table,
              )
              total_input_records += input_records
              total_deduped_records += deduped_records
+             total_dropped_records += dropped_records
              materialized_results.append(
                  merge_utils.materialize(input, merge_file_group.hb_index, [table])
              )
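Note: the loop above decides, per hash bucket, between copying the previous compacted files by reference, skipping the bucket, or running a full merge. A rough sketch of that decision order, with plain booleans and a hypothetical plan_bucket helper standing in for the real MergeInput/MergeFileGroup checks:

def plan_bucket(
    has_delete: bool,
    has_new_deltas: bool,
    stream_ids_match: bool,
    has_prev_compacted: bool,
) -> str:
    # No deletes, no new deltas, same destination stream: reuse old files as-is.
    if not has_delete and not has_new_deltas and stream_ids_match:
        return "copy previous compacted files by reference"
    # Nothing new and nothing previously compacted for this bucket: nothing to do.
    if not has_new_deltas and not has_prev_compacted:
        return "skip: nothing to merge for this bucket"
    # Otherwise download the previous table (if any) and run _compact_tables.
    return "download previous compacted table (if any) and run _compact_tables"

print(plan_bucket(False, False, True, True))  # copy by reference
print(plan_bucket(True, True, True, True))    # full merge with deletes applied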
@@ -382,6 +550,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
          materialized_results,
          np.int64(total_input_records),
          np.int64(total_deduped_records),
+         np.int64(total_dropped_records),
          np.double(peak_memory_usage_bytes),
          np.double(0.0),
          np.double(time.time()),
@@ -400,7 +569,8 @@ def merge(input: MergeInput) -> MergeResult:
          f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
      )

-     process_util.schedule_callback(log_peak_memory, 10)
+     if input.memory_logs_enabled:
+         process_util.schedule_callback(log_peak_memory, 10)

      merge_result, duration = timed_invocation(func=_timed_merge, input=input)

@@ -420,6 +590,7 @@
          merge_result[1],
          merge_result[2],
          merge_result[3],
+         merge_result[4],
          np.double(emit_metrics_time),
          merge_result[4],
      )
@@ -5,7 +5,10 @@ from deltacat.compute.compactor import (
      DeltaAnnotated,
      DeltaFileEnvelope,
  )
-
+ from deltacat.storage import (
+     Delta,
+ )
+ from deltacat.storage.model.delta import DeltaType
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.types.media import StorageType
  from deltacat.utils.common import ReadKwargsProvider
@@ -18,6 +21,13 @@ import logging
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+ def contains_delete_deltas(deltas: List[Delta]) -> bool:
+     for delta in deltas:
+         if delta.type is DeltaType.DELETE:
+             return True
+     return False
+
+
  def read_delta_file_envelopes(
      annotated_delta: DeltaAnnotated,
      read_kwargs_provider: Optional[ReadKwargsProvider],
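Note: contains_delete_deltas is a plain linear scan over the deltas; the loop is equivalent to any(d.type is DeltaType.DELETE for d in deltas). A minimal sketch with hypothetical stand-ins for Delta and DeltaType:

from enum import Enum
from types import SimpleNamespace

class FakeDeltaType(Enum):  # hypothetical stand-in for deltacat's DeltaType
    UPSERT = "upsert"
    DELETE = "delete"

deltas = [
    SimpleNamespace(type=FakeDeltaType.UPSERT),  # stand-in for a Delta
    SimpleNamespace(type=FakeDeltaType.DELETE),
]

# Same result the helper above would produce for this input.
print(any(d.type is FakeDeltaType.DELETE for d in deltas))  # True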
@@ -25,6 +25,12 @@ from deltacat.utils.performance import timed_invocation
  from deltacat.storage import (
      Partition,
  )
+ from deltacat.compute.compactor_v2.deletes.delete_strategy import (
+     DeleteStrategy,
+ )
+ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+     DeleteFileEnvelope,
+ )


  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -86,6 +92,8 @@ def generate_local_merge_input(
      annotated_deltas: List[DeltaAnnotated],
      compacted_partition: Partition,
      round_completion_info: Optional[RoundCompletionInfo],
+     delete_strategy: Optional[DeleteStrategy] = None,
+     delete_file_envelopes: Optional[DeleteFileEnvelope] = None,
  ):
      """
      Generates a merge input for local deltas that do not reside in the Ray object store and
@@ -123,4 +131,6 @@
          object_store=params.object_store,
          deltacat_storage=params.deltacat_storage,
          deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+         delete_strategy=delete_strategy,
+         delete_file_envelopes=delete_file_envelopes,
      )
@@ -1,7 +1,10 @@
  import botocore
  import logging
- from typing import Dict, Optional, List, Tuple
+ from typing import Dict, Optional, List, Tuple, Any
  from deltacat import logs
+ from deltacat.compute.compactor_v2.model.merge_file_group import (
+     LocalMergeFileGroupsProvider,
+ )
  from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.partial_download import PartialParquetParameters
  from deltacat.storage import (
@@ -15,7 +18,6 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
      hash_group_index_to_hash_bucket_indices,
  )
  from deltacat.compute.compactor_v2.constants import (
-     TOTAL_MEMORY_BUFFER_PERCENTAGE,
      PARQUET_TO_PYARROW_INFLATION,
  )

@@ -133,8 +135,10 @@ def hash_bucket_resource_options_provider(
      item: DeltaAnnotated,
      previous_inflation: float,
      average_record_size_bytes: float,
+     total_memory_buffer_percentage: int,
      primary_keys: List[str] = None,
      ray_custom_resources: Optional[Dict] = None,
+     memory_logs_enabled: Optional[bool] = None,
      **kwargs,
  ) -> Dict:
      debug_memory_params = {"hash_bucket_task_index": index}
@@ -189,10 +193,11 @@ def hash_bucket_resource_options_provider(
      debug_memory_params["average_record_size_bytes"] = average_record_size_bytes

      # Consider buffer
-     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+     total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
      debug_memory_params["total_memory_with_buffer"] = total_memory
-     logger.debug(
-         f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
+     logger.debug_conditional(
+         f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}",
+         memory_logs_enabled,
      )

      return get_task_options(0.01, total_memory, ray_custom_resources)
@@ -204,12 +209,14 @@ def merge_resource_options_provider(
      num_hash_groups: int,
      hash_group_size_bytes: Dict[int, int],
      hash_group_num_rows: Dict[int, int],
+     total_memory_buffer_percentage: int,
      round_completion_info: Optional[RoundCompletionInfo] = None,
      compacted_delta_manifest: Optional[Manifest] = None,
      ray_custom_resources: Optional[Dict] = None,
      primary_keys: Optional[List[str]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
      deltacat_storage_kwargs: Optional[Dict] = {},
+     memory_logs_enabled: Optional[bool] = None,
      **kwargs,
  ) -> Dict:
      debug_memory_params = {"merge_task_index": index}
@@ -224,6 +231,84 @@ def merge_resource_options_provider(
      pk_size_bytes = data_size
      incremental_index_array_size = num_rows * 4

+     return get_merge_task_options(
+         index,
+         hb_group_idx,
+         data_size,
+         pk_size_bytes,
+         num_rows,
+         num_hash_groups,
+         total_memory_buffer_percentage,
+         incremental_index_array_size,
+         debug_memory_params,
+         ray_custom_resources,
+         round_completion_info=round_completion_info,
+         compacted_delta_manifest=compacted_delta_manifest,
+         primary_keys=primary_keys,
+         deltacat_storage=deltacat_storage,
+         deltacat_storage_kwargs=deltacat_storage_kwargs,
+         memory_logs_enabled=memory_logs_enabled,
+     )
+
+
+ def local_merge_resource_options_provider(
+     estimated_da_size: float,
+     estimated_num_rows: int,
+     total_memory_buffer_percentage: int,
+     round_completion_info: Optional[RoundCompletionInfo] = None,
+     compacted_delta_manifest: Optional[Manifest] = None,
+     ray_custom_resources: Optional[Dict] = None,
+     primary_keys: Optional[List[str]] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict] = {},
+     memory_logs_enabled: Optional[bool] = None,
+     **kwargs,
+ ) -> Dict:
+     index = hb_group_idx = LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX
+     debug_memory_params = {"merge_task_index": index}
+
+     # upper bound for pk size of incremental
+     pk_size_bytes = estimated_da_size
+     incremental_index_array_size = estimated_num_rows * 4
+
+     return get_merge_task_options(
+         index=index,
+         hb_group_idx=hb_group_idx,
+         data_size=estimated_da_size,
+         pk_size_bytes=pk_size_bytes,
+         num_rows=estimated_num_rows,
+         num_hash_groups=1,
+         incremental_index_array_size=incremental_index_array_size,
+         total_memory_buffer_percentage=total_memory_buffer_percentage,
+         debug_memory_params=debug_memory_params,
+         ray_custom_resources=ray_custom_resources,
+         round_completion_info=round_completion_info,
+         compacted_delta_manifest=compacted_delta_manifest,
+         primary_keys=primary_keys,
+         deltacat_storage=deltacat_storage,
+         deltacat_storage_kwargs=deltacat_storage_kwargs,
+         memory_logs_enabled=memory_logs_enabled,
+     )
+
+
+ def get_merge_task_options(
+     index: int,
+     hb_group_idx: int,
+     data_size: float,
+     pk_size_bytes: float,
+     num_rows: int,
+     num_hash_groups: int,
+     total_memory_buffer_percentage: int,
+     incremental_index_array_size: int,
+     debug_memory_params: Dict[str, Any],
+     ray_custom_resources: Optional[Dict],
+     round_completion_info: Optional[RoundCompletionInfo] = None,
+     compacted_delta_manifest: Optional[Manifest] = None,
+     primary_keys: Optional[List[str]] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict] = {},
+     memory_logs_enabled: Optional[bool] = None,
+ ) -> Dict[str, Any]:
      if (
          round_completion_info
          and compacted_delta_manifest
@@ -296,10 +381,11 @@ def merge_resource_options_provider(
      debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
      debug_memory_params["total_memory"] = total_memory

-     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+     total_memory = total_memory * (1 + total_memory_buffer_percentage / 100.0)
      debug_memory_params["total_memory_with_buffer"] = total_memory
-     logger.debug(
-         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
+     logger.debug_conditional(
+         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
+         memory_logs_enabled,
      )

      return get_task_options(0.01, total_memory, ray_custom_resources)
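Note: both resource options providers now take the buffer as a caller-supplied total_memory_buffer_percentage argument rather than reading the module constant TOTAL_MEMORY_BUFFER_PERCENTAGE. A worked example of the buffer formula used above, with made-up values:

# Illustration only; the numbers are not from the package.
total_memory = 2_000_000_000          # estimated task memory in bytes
total_memory_buffer_percentage = 30   # caller-supplied buffer percentage

total_memory_with_buffer = total_memory * (1 + total_memory_buffer_percentage / 100.0)
print(total_memory_with_buffer)  # 2600000000.0, i.e. a 30% headroom on the estimate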
@@ -181,15 +181,35 @@ class MemcachedObjectStore(IObjectStore):
          for chunk_index in range(chunk_count):
              ref = self._create_ref(uid, ip, chunk_index)
              chunk = client.get(ref)
+             if chunk is None:
+                 raise ValueError(
+                     f"Expected uid: {uid}, chunk index: {chunk_index} from client ip: {ip}"
+                     f" to be non-empty."
+                 )
              serialized.extend(chunk)

          return cloudpickle.loads(serialized)

+     def clear(self) -> bool:
+         flushed = all(
+             [
+                 self._get_client_by_ip(ip).flush_all(noreply=False)
+                 for ip in self.storage_node_ips
+             ]
+         )
+         self.client_cache.clear()
+
+         if flushed:
+             logger.info("Successfully cleared cache contents.")
+
+         return flushed
+
      def close(self) -> None:
          for client in self.client_cache.values():
              client.close()

          self.client_cache.clear()
+         logger.info("Successfully closed object store clients.")

      def _create_ref(self, uid, ip, chunk_index) -> str:
          return f"{uid}{self.SEPARATOR}{ip}{self.SEPARATOR}{chunk_index}"
@@ -2,6 +2,7 @@ import ray
  from ray import cloudpickle
  from deltacat.io.object_store import IObjectStore
  from typing import Any, List
+ from ray.types import ObjectRef


  class RayPlasmaObjectStore(IObjectStore):
@@ -21,3 +22,8 @@ class RayPlasmaObjectStore(IObjectStore):
      def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
          loaded_refs = [cloudpickle.loads(obj_id) for obj_id in refs]
          return ray.get(loaded_refs)
+
+     def deserialize_references(
+         self, refs: List[Any], *args, **kwargs
+     ) -> List[ObjectRef]:
+         return [cloudpickle.loads(obj_id) for obj_id in refs]
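Note: deserialize_references recovers the pickled ObjectRef handles without fetching their values, whereas get_many additionally calls ray.get on them. A small usage sketch (assumes a local Ray runtime is available; the serialized ref mirrors what the store's put path produces):

import ray
from ray import cloudpickle

ray.init(ignore_reinit_error=True)

obj_ref = ray.put({"hello": "world"})
serialized_ref = cloudpickle.dumps(obj_ref)   # what the store keeps for each entry

ref_only = cloudpickle.loads(serialized_ref)  # deserialize_references path: handle only
value = ray.get(ref_only)                     # get_many path additionally fetches the value
print(type(ref_only).__name__, value)         # ObjectRef {'hello': 'world'}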