deltacat 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +25 -0
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
  4. deltacat/compute/compactor/model/delta_file_envelope.py +21 -3
  5. deltacat/compute/compactor/model/table_object_store.py +51 -0
  6. deltacat/compute/compactor/utils/io.py +1 -1
  7. deltacat/compute/compactor_v2/compaction_session.py +80 -14
  8. deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  9. deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +83 -0
  10. deltacat/compute/compactor_v2/deletes/delete_strategy.py +82 -0
  11. deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +161 -0
  12. deltacat/compute/compactor_v2/deletes/model.py +23 -0
  13. deltacat/compute/compactor_v2/deletes/utils.py +164 -0
  14. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  15. deltacat/compute/compactor_v2/model/merge_input.py +24 -1
  16. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  17. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -6
  18. deltacat/compute/compactor_v2/steps/merge.py +221 -50
  19. deltacat/compute/compactor_v2/utils/delta.py +11 -1
  20. deltacat/compute/compactor_v2/utils/merge.py +10 -0
  21. deltacat/compute/compactor_v2/utils/task_options.py +94 -8
  22. deltacat/io/memcached_object_store.py +20 -0
  23. deltacat/io/ray_plasma_object_store.py +6 -0
  24. deltacat/logs.py +29 -2
  25. deltacat/storage/__init__.py +3 -0
  26. deltacat/storage/interface.py +2 -0
  27. deltacat/storage/model/delete_parameters.py +40 -0
  28. deltacat/storage/model/delta.py +25 -1
  29. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +1930 -0
  30. deltacat/tests/compute/compact_partition_test_cases.py +16 -822
  31. deltacat/tests/compute/compactor/utils/test_io.py +4 -4
  32. deltacat/tests/compute/test_compact_partition_incremental.py +4 -0
  33. deltacat/tests/compute/test_compact_partition_params.py +5 -0
  34. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +32 -20
  35. deltacat/tests/compute/test_util_create_table_deltas_repo.py +28 -10
  36. deltacat/tests/io/test_memcached_object_store.py +19 -0
  37. deltacat/tests/local_deltacat_storage/__init__.py +3 -0
  38. deltacat/tests/test_utils/constants.py +1 -2
  39. deltacat/tests/test_utils/pyarrow.py +27 -10
  40. deltacat/utils/pandas.py +1 -1
  41. deltacat/utils/ray_utils/runtime.py +3 -3
  42. deltacat/utils/resources.py +7 -5
  43. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/METADATA +1 -1
  44. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/RECORD +47 -38
  45. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/LICENSE +0 -0
  46. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/WHEEL +0 -0
  47. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.0.2"
+__version__ = "1.1.1"
 
 
 __all__ = [
deltacat/compute/compactor/model/compact_partition_params.py CHANGED
@@ -20,6 +20,7 @@ from deltacat.compute.compactor_v2.constants import (
     AVERAGE_RECORD_SIZE_BYTES,
     TASK_MAX_PARALLELISM,
     DROP_DUPLICATES,
+    TOTAL_MEMORY_BUFFER_PERCENTAGE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -57,6 +58,7 @@ class CompactPartitionParams(dict):
             "compacted_file_content_type", ContentType.PARQUET
         )
         result.object_store = params.get("object_store", RayPlasmaObjectStore())
+
        result.enable_profiler = params.get("enable_profiler", False)
         result.deltacat_storage = params.get(
             "deltacat_storage", unimplemented_deltacat_storage
@@ -84,12 +86,17 @@ class CompactPartitionParams(dict):
         result.average_record_size_bytes = params.get(
             "average_record_size_bytes", AVERAGE_RECORD_SIZE_BYTES
         )
+        result.total_memory_buffer_percentage = params.get(
+            "total_memory_buffer_percentage", TOTAL_MEMORY_BUFFER_PERCENTAGE
+        )
         result.hash_group_count = params.get(
             "hash_group_count", result.hash_bucket_count
         )
         result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
         result.ray_custom_resources = params.get("ray_custom_resources")
 
+        result.memory_logs_enabled = params.get("memory_logs_enabled", False)
+
         result.metrics_config = params.get("metrics_config")
 
         if not importlib.util.find_spec("memray"):
@@ -189,6 +196,16 @@ class CompactPartitionParams(dict):
     def average_record_size_bytes(self, average_record_size_bytes: float) -> None:
         self["average_record_size_bytes"] = average_record_size_bytes
 
+    @property
+    def total_memory_buffer_percentage(self) -> int:
+        return self["total_memory_buffer_percentage"]
+
+    @total_memory_buffer_percentage.setter
+    def total_memory_buffer_percentage(
+        self, total_memory_buffer_percentage: int
+    ) -> None:
+        self["total_memory_buffer_percentage"] = total_memory_buffer_percentage
+
     @property
     def min_files_in_batch(self) -> float:
         return self["min_files_in_batch"]
@@ -354,6 +371,14 @@ class CompactPartitionParams(dict):
     def sort_keys(self, keys: List[SortKey]) -> None:
         self["sort_keys"] = keys
 
+    @property
+    def memory_logs_enabled(self) -> bool:
+        return self.get("memory_logs_enabled")
+
+    @memory_logs_enabled.setter
+    def memory_logs_enabled(self, value: bool) -> None:
+        self["memory_logs_enabled"] = value
+
     @property
     def metrics_config(self) -> Optional[MetricsConfig]:
         return self.get("metrics_config")
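
Since CompactPartitionParams subclasses dict, the two knobs added above round-trip through ordinary property access. A minimal sketch, assuming the default dict constructor is available and omitting the locators and storage handles a real compaction session would also carry:

    from deltacat.compute.compactor.model.compact_partition_params import (
        CompactPartitionParams,
    )

    # Empty dict-backed instance; enough to exercise the new accessors in isolation.
    params = CompactPartitionParams()
    params.total_memory_buffer_percentage = 30  # reserve 30% task-memory headroom
    params.memory_logs_enabled = True  # opt in to the new per-task memory logs
    assert params["total_memory_buffer_percentage"] == 30
    assert params.memory_logs_enabled is True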
deltacat/compute/compactor/model/compaction_session_audit_info.py CHANGED
@@ -84,6 +84,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("recordsDeduped")
 
+    @property
+    def records_deleted(self) -> int:
+        """
+        The total count of deleted records in a compaction session if delete deltas are present.
+        """
+        return self.get("recordsDeleted")
+
     @property
     def input_size_bytes(self) -> float:
         """
@@ -461,6 +468,10 @@ class CompactionSessionAuditInfo(dict):
         self["recordsDeduped"] = records_deduped
         return self
 
+    def set_records_deleted(self, records_deleted: int) -> CompactionSessionAuditInfo:
+        self["recordsDeleted"] = records_deleted
+        return self
+
     def set_input_size_bytes(
         self, input_size_bytes: float
     ) -> CompactionSessionAuditInfo:
deltacat/compute/compactor/model/delta_file_envelope.py CHANGED
@@ -5,6 +5,9 @@ import numpy as np
 import pyarrow as pa
 
 from deltacat.storage import DeltaType, LocalTable
+from deltacat.compute.compactor.model.table_object_store import (
+    LocalTableStorageStrategy,
+)
 
 from typing import Optional
 
@@ -20,18 +23,21 @@ class DeltaFileEnvelope(dict):
         file_index: int = None,
         is_src_delta: np.bool_ = True,
         file_record_count: Optional[int] = None,
+        table_storage_strategy: [LocalTableStorageStrategy] = None,
     ) -> DeltaFileEnvelope:
-        """Static factory builder for a Delta File Envelope
+        """
+        Static factory builder for a Delta File Envelope
         `
         Args:
             stream_position: Stream position of a delta.
-            file_index: Manifest file index number of a delta.
             delta_type: A delta type.
             table: The table object that represents the delta file.
+            file_index: Manifest file index number of a delta.
             is_src_delta: True if this Delta File Locator is
                 pointing to a file from the uncompacted source table, False if
                 this Locator is pointing to a file in the compacted destination
                 table.
+            table_storage_strategy: The way the table object is stored in the delta file envelope. If None just stores the table normally
         Returns:
             A delta file envelope.
 
@@ -46,7 +52,11 @@ class DeltaFileEnvelope(dict):
         delta_file_envelope["streamPosition"] = stream_position
         delta_file_envelope["fileIndex"] = file_index
         delta_file_envelope["deltaType"] = delta_type.value
-        delta_file_envelope["table"] = table
+        if table_storage_strategy is None:
+            delta_file_envelope["table"] = table
+        else:
+            delta_file_envelope["table"] = table_storage_strategy.store_table(table)
+            delta_file_envelope["table_storage_strategy"] = table_storage_strategy
         delta_file_envelope["is_src_delta"] = is_src_delta
         delta_file_envelope["file_record_count"] = file_record_count
         return delta_file_envelope
@@ -63,8 +73,16 @@ class DeltaFileEnvelope(dict):
     def delta_type(self) -> DeltaType:
         return DeltaType(self["deltaType"])
 
+    @property
+    def table_storage_strategy(self) -> Optional[LocalTableStorageStrategy]:
+        return self["table_storage_strategy"]
+
     @property
     def table(self) -> LocalTable:
+        val = self.table_storage_strategy
+        if val is not None:
+            table_storage_strategy = val
+            return table_storage_strategy.get_table(self["table"])
         return self["table"]
 
     @property
deltacat/compute/compactor/model/table_object_store.py ADDED
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from ray.types import ObjectRef
+
+from typing import Any, Union
+
+from abc import ABC, abstractmethod, abstractproperty
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from deltacat.storage import (
+    LocalTable,
+)
+from deltacat.io.object_store import IObjectStore
+
+LocalTableReference = Union[ObjectRef, LocalTable]
+
+
+class LocalTableStorageStrategy(ABC):
+    @abstractproperty
+    def object_store(cls) -> IObjectStore:
+        pass
+
+    @abstractmethod
+    def store_table(self, table: LocalTable) -> LocalTableReference:
+        pass
+
+    @abstractmethod
+    def get_table(self, table_like: LocalTableReference) -> LocalTable:
+        pass
+
+
+class LocalTableRayObjectStoreReferenceStorageStrategy(LocalTableStorageStrategy):
+    """
+    Stores the table in the RayPlasmaObjectStore - see deltacat/io/ray_plasma_object_store.py
+    """
+
+    _object_store: IObjectStore = RayPlasmaObjectStore()
+
+    @property
+    def object_store(cls) -> IObjectStore:
+        return cls._object_store
+
+    def store_table(self, table: LocalTable) -> LocalTableReference:
+        obj_ref: ObjectRef = self.object_store.put(table)
+        return obj_ref
+
+    def get_table(self, table_like: LocalTableReference) -> LocalTable:
+        table = self.object_store.get(table_like)
+        return table
+
+    def get_table_reference(self, table_ref: Any) -> LocalTableReference:
+        return self.object_store.deserialize_references([table_ref])[0]
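
Taken together with the DeltaFileEnvelope changes above, the new strategy swaps the in-envelope table for a Ray object-store reference on write, and the envelope's `table` property dereferences it on read. A minimal round-trip sketch, assuming a live Ray runtime and a toy pyarrow table (not taken from the package's tests):

    import pyarrow as pa
    import ray

    from deltacat.compute.compactor import DeltaFileEnvelope
    from deltacat.compute.compactor.model.table_object_store import (
        LocalTableRayObjectStoreReferenceStorageStrategy,
    )
    from deltacat.storage import DeltaType

    ray.init()  # the plasma-backed strategy needs a running Ray instance

    table = pa.table({"pk": [1, 2, 3]})
    strategy = LocalTableRayObjectStoreReferenceStorageStrategy()

    # store_table replaces the in-envelope table with an object-store reference...
    dfe = DeltaFileEnvelope.of(
        stream_position=1,
        delta_type=DeltaType.UPSERT,
        table=table,
        table_storage_strategy=strategy,
    )
    # ...and the `table` property transparently fetches it back on access.
    assert dfe.table.num_rows == 3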
deltacat/compute/compactor/utils/io.py CHANGED
@@ -378,7 +378,7 @@ def _discover_deltas(
     )
     deltas = deltas_list_result.all_items()
     if not deltas:
-        logger.warn(
+        logger.warning(
             f"Couldn't find any deltas to "
             f"compact in delta stream position range "
             f"('{start_position_exclusive}', "
deltacat/compute/compactor_v2/compaction_session.py CHANGED
@@ -24,6 +24,16 @@ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.compactor_v2.utils.merge import (
     generate_local_merge_input,
 )
+from deltacat.compute.compactor import DeltaAnnotated
+from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
+from deltacat.compute.compactor_v2.deletes.delete_strategy import (
+    DeleteStrategy,
+)
+from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+    DeleteFileEnvelope,
+)
+from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
+
 from deltacat.storage import (
     Delta,
     DeltaLocator,
@@ -52,6 +62,7 @@ from deltacat.utils.resources import (
 from deltacat.compute.compactor_v2.utils.task_options import (
     hash_bucket_resource_options_provider,
     merge_resource_options_provider,
+    local_merge_resource_options_provider,
 )
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
 
@@ -95,7 +106,7 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
                 **params.s3_client_kwargs,
             )
         else:
-            logger.warn("No new partition was committed during compaction.")
+            logger.warning("No new partition was committed during compaction.")
 
         logger.info(
             f"Completed compaction session for: {params.source_partition_locator}"
@@ -149,7 +160,7 @@ def _execute_compaction(
     )
     if not round_completion_info:
         logger.info(
-            f"Both rebase partition and round completion file not found. Performing an entire backfill on source."
+            "Both rebase partition and round completion file not found. Performing an entire backfill on source."
         )
     else:
         compacted_delta_locator = round_completion_info.compacted_delta_locator
@@ -175,7 +186,7 @@ def _execute_compaction(
 
     delta_discovery_start = time.monotonic()
 
-    input_deltas = io.discover_deltas(
+    input_deltas: List[Delta] = io.discover_deltas(
         params.source_partition_locator,
         params.last_stream_position_to_compact,
         params.rebase_source_partition_locator,
@@ -185,8 +196,24 @@ def _execute_compaction(
         params.deltacat_storage_kwargs,
         params.list_deltas_kwargs,
     )
+    if not input_deltas:
+        logger.info("No input deltas found to compact.")
+        return None, None, None
 
-    uniform_deltas = io.create_uniform_input_deltas(
+    delete_strategy: Optional[DeleteStrategy] = None
+    delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
+    delete_file_size_bytes: int = 0
+    if contains_delete_deltas(input_deltas):
+        input_deltas, delete_file_envelopes, delete_strategy = prepare_deletes(
+            params, input_deltas
+        )
+        for delete_file_envelope in delete_file_envelopes:
+            delete_file_size_bytes += delete_file_envelope.table_size_bytes
+        logger.info(
+            f" Input deltas contain DELETE-type deltas. Total delete file size={delete_file_size_bytes}."
+            f" Total length of delete file envelopes={len(delete_file_envelopes)}"
+        )
+    uniform_deltas: List[DeltaAnnotated] = io.create_uniform_input_deltas(
         input_deltas=input_deltas,
         hash_bucket_count=params.hash_bucket_count,
         compaction_audit=compaction_audit,
@@ -212,10 +239,6 @@ def _execute_compaction(
         **params.s3_client_kwargs,
     )
 
-    if not input_deltas:
-        logger.info("No input deltas found to compact.")
-        return None, None, None
-
     # create a new stream for this round
     compacted_stream_locator = params.destination_partition_locator.stream_locator
     compacted_stream = params.deltacat_storage.get_stream(
@@ -236,8 +259,10 @@ def _execute_compaction(
         resource_amount_provider=hash_bucket_resource_options_provider,
         previous_inflation=params.previous_inflation,
         average_record_size_bytes=params.average_record_size_bytes,
+        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
         primary_keys=params.primary_keys,
         ray_custom_resources=params.ray_custom_resources,
+        memory_logs_enabled=params.memory_logs_enabled,
     )
 
     total_input_records_count = np.int64(0)
@@ -246,9 +271,36 @@ def _execute_compaction(
     if params.hash_bucket_count == 1:
         merge_start = time.monotonic()
         local_merge_input = generate_local_merge_input(
-            params, uniform_deltas, compacted_partition, round_completion_info
+            params,
+            uniform_deltas,
+            compacted_partition,
+            round_completion_info,
+            delete_strategy,
+            delete_file_envelopes,
+        )
+        estimated_da_bytes = (
+            compaction_audit.estimated_in_memory_size_bytes_during_discovery
+        )
+        estimated_num_records = sum(
+            [
+                entry.meta.record_count
+                for delta in uniform_deltas
+                for entry in delta.manifest.entries
+            ]
+        )
+        local_merge_options = local_merge_resource_options_provider(
+            estimated_da_size=estimated_da_bytes,
+            estimated_num_rows=estimated_num_records,
+            total_memory_buffer_percentage=params.total_memory_buffer_percentage,
+            round_completion_info=round_completion_info,
+            compacted_delta_manifest=previous_compacted_delta_manifest,
+            ray_custom_resources=params.ray_custom_resources,
+            primary_keys=params.primary_keys,
+            memory_logs_enabled=params.memory_logs_enabled,
+        )
+        local_merge_result = ray.get(
+            mg.merge.options(**local_merge_options).remote(local_merge_input)
         )
-        local_merge_result = ray.get(mg.merge.remote(local_merge_input))
         total_input_records_count += local_merge_result.input_record_count
         merge_results = [local_merge_result]
         merge_invoke_end = time.monotonic()
@@ -269,6 +321,7 @@ def _execute_compaction(
                 object_store=params.object_store,
                 deltacat_storage=params.deltacat_storage,
                 deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                memory_logs_enabled=params.memory_logs_enabled,
             )
         }
 
@@ -345,6 +398,9 @@ def _execute_compaction(
     )
 
     # BSP Step 2: Merge
+    # NOTE: DELETE-type deltas are stored in Plasma object store
+    # in prepare_deletes and therefore don't need to included
+    # in merge task resource estimation
     merge_options_provider = functools.partial(
         task_resource_options_provider,
         pg_config=params.pg_config,
@@ -352,12 +408,14 @@ def _execute_compaction(
         num_hash_groups=params.hash_group_count,
         hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
         hash_group_num_rows=all_hash_group_idx_to_num_rows,
+        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
         round_completion_info=round_completion_info,
         compacted_delta_manifest=previous_compacted_delta_manifest,
         primary_keys=params.primary_keys,
         deltacat_storage=params.deltacat_storage,
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
         ray_custom_resources=params.ray_custom_resources,
+        memory_logs_enabled=params.memory_logs_enabled,
     )
 
     def merge_input_provider(index, item):
@@ -385,6 +443,9 @@ def _execute_compaction(
                 object_store=params.object_store,
                 deltacat_storage=params.deltacat_storage,
                 deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                delete_strategy=delete_strategy,
+                delete_file_envelopes=delete_file_envelopes,
+                memory_logs_enabled=params.memory_logs_enabled,
             )
         }
 
@@ -406,7 +467,12 @@ def _execute_compaction(
     merge_end = time.monotonic()
 
     total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
-    logger.info(f"Deduped {total_dd_record_count} records...")
+    total_deleted_record_count = sum(
+        [ddr.deleted_record_count for ddr in merge_results]
+    )
+    logger.info(
+        f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
+    )
 
     compaction_audit.set_input_records(total_input_records_count.item())
 
@@ -419,7 +485,7 @@ def _execute_compaction(
     )
 
     compaction_audit.set_records_deduped(total_dd_record_count.item())
-
+    compaction_audit.set_records_deleted(total_deleted_record_count.item())
     mat_results = []
     for merge_result in merge_results:
         mat_results.extend(merge_result.materialize_results)
@@ -466,6 +532,7 @@ def _execute_compaction(
     record_info_msg = (
         f"Hash bucket records: {total_hb_record_count},"
         f" Deduped records: {total_dd_record_count}, "
+        f" Deleted records: {total_deleted_record_count}, "
         f" Materialized records: {merged_delta.meta.record_count}"
     )
     logger.info(record_info_msg)
@@ -526,7 +593,7 @@ def _execute_compaction(
     )
 
     # After all incremental delta related calculations, we update
-    # the input sizes to accomodate the compacted table
+    # the input sizes to accommodate the compacted table
     if round_completion_info:
         compaction_audit.set_input_file_count(
             (compaction_audit.input_file_count or 0)
@@ -565,7 +632,6 @@ def _execute_compaction(
         f"partition-{params.source_partition_locator.partition_values},"
        f"compacted at: {params.last_stream_position_to_compact},"
     )
-
     return (
         compacted_partition,
        new_round_completion_info,
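
The delete handling added to _execute_compaction boils down to: peel DELETE-type deltas off the discovered input, park their tables in the object store, and carry the resulting envelopes and strategy into merge. A condensed sketch of that flow, using the real contains_delete_deltas and prepare_deletes helpers imported above, but with a hypothetical wrapper name and a stand-in discovered_deltas argument (a real caller would pass the output of io.discover_deltas):

    from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
    from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas

    def split_out_deletes(params, discovered_deltas):
        """Mirror of the session logic: separate DELETE deltas from the input list."""
        delete_strategy = None
        delete_file_envelopes = None
        if contains_delete_deltas(discovered_deltas):
            # prepare_deletes stores the delete tables in the Ray object store and
            # returns the remaining (non-delete) deltas plus the delete metadata.
            discovered_deltas, delete_file_envelopes, delete_strategy = prepare_deletes(
                params, discovered_deltas
            )
        return discovered_deltas, delete_file_envelopes, delete_strategy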
deltacat/compute/compactor_v2/deletes/__init__.py ADDED
File without changes (new empty package marker)
deltacat/compute/compactor_v2/deletes/delete_file_envelope.py ADDED
@@ -0,0 +1,83 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+from typing import Any, List, Optional
+from deltacat.storage import DeltaType, LocalTable
+from deltacat.compute.compactor import (
+    DeltaFileEnvelope,
+)
+import numpy as np
+import pyarrow as pa
+
+from deltacat.compute.compactor.model.table_object_store import (
+    LocalTableStorageStrategy,
+    LocalTableRayObjectStoreReferenceStorageStrategy,
+)
+
+
+class DeleteFileEnvelope(DeltaFileEnvelope):
+    @staticmethod
+    def of(
+        stream_position: int,
+        delta_type: DeltaType,
+        table: LocalTable,
+        delete_columns: List[str],
+        file_index: int = None,
+        is_src_delta: np.bool_ = True,
+        file_record_count: Optional[int] = None,
+        table_storage_strategy: LocalTableStorageStrategy = LocalTableRayObjectStoreReferenceStorageStrategy(),
+    ) -> DeleteFileEnvelope:
+        """
+        Static factory builder for a DeleteFileEnvelope. Subclasses from DeltaFileEnvelope
+        `
+        Args:
+            stream_position: Stream position of a delta.
+            delta_type: A delta type.
+            table: The table object that represents the delta file.
+            delete_columns: delete column_names needed for equality-based deletes,
+            file_index: Manifest file index number of a delta.
+            is_src_delta: True if this Delta File Locator is
+                pointing to a file from the uncompacted source table, False if
+                this Locator is pointing to a file in the compacted destination
+                table.
+            table_storage_strategy: The way the table object is stored in the delta file envelope. Defaults to LocalTableRayObjectStoreReferenceStorageStrategy
+        Returns:
+            A delete file envelope.
+
+        """
+        delete_file_envelope = DeltaFileEnvelope.of(
+            stream_position,
+            delta_type,
+            table,
+            file_index,
+            is_src_delta,
+            file_record_count,
+            table_storage_strategy,
+        )
+        assert len(delete_columns) > 0, "At least 1 delete column is expected"
+        delete_file_envelope["delete_columns"] = delete_columns
+        if isinstance(table, pa.Table):
+            delete_file_envelope["table_size_bytes"] = table.nbytes
+        return DeleteFileEnvelope(**delete_file_envelope)
+
+    @property
+    def table_size_bytes(self) -> int:
+        val = self.get("table_size_bytes")
+        if val is not None:
+            return val
+        else:
+            raise ValueError(
+                f"Table type: {type(self.table)} not for supported for size method."
+            )
+
+    @property
+    def delete_columns(self) -> List[str]:
+        return self["delete_columns"]
+
+    @property
+    def table_reference(self) -> Optional[Any]:
+        if self.table_storage_strategy is not None and isinstance(
+            self.table_storage_strategy,
+            LocalTableRayObjectStoreReferenceStorageStrategy,
+        ):
+            return self.table_storage_strategy.get_table_reference(self["table"])
+        return None
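
A usage sketch for the new envelope, assuming a live Ray runtime (the default storage strategy writes the delete table to Ray's object store); the table contents and column name are illustrative:

    import pyarrow as pa
    import ray

    from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
        DeleteFileEnvelope,
    )
    from deltacat.storage import DeltaType

    ray.init()  # required by the default table storage strategy

    delete_table = pa.table({"pk": [7, 9]})  # rows to delete, keyed by "pk"
    dfe = DeleteFileEnvelope.of(
        stream_position=5,
        delta_type=DeltaType.DELETE,
        table=delete_table,
        delete_columns=["pk"],  # equality-delete match columns
    )
    assert dfe.delete_columns == ["pk"]
    assert dfe.table_size_bytes == delete_table.nbytes
    # With the Ray-backed strategy, a raw object-store reference is exposed too.
    assert dfe.table_reference is not None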
deltacat/compute/compactor_v2/deletes/delete_strategy.py ADDED
@@ -0,0 +1,82 @@
+from typing import List, Optional
+
+import pyarrow as pa
+from abc import ABC, abstractmethod
+
+from typing import Tuple
+from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+    DeleteFileEnvelope,
+)
+
+
+class DeleteStrategy(ABC):
+    """
+    Encapsulates a strategy for applying row-level deletes on tables during compaction
+
+    This abstract base class defines the interface for applying delete operations
+    on intermediate in-memory pyarrow tables during compaction. Concrete subclasses must implement the `apply_deletes` and
+    `apply_many_deletes` methods, as well as the `name` property.
+
+    Example:
+        >>> class MyDeleteStrategy(DeleteStrategy):
+        ...     @property
+        ...     def name(self) -> str:
+        ...         return "MyDeleteStrategy"
+        ...
+        ...     def apply_deletes(self, table: Optional[pa.Table], delete_file_envelope: DeleteFileEnvelope) -> ReturnTuple[pa.Table, int]:
+        ...         # Implement delete logic here
+        ...         pass
+        ...
+        ...     def apply_many_deletes(self, table: Optional[pa.Table], delete_file_envelopes: List[DeleteFileEnvelope]) -> ReturnTuple[pa.Table, int]:
+        ...         # Implement delete logic here
+        ...         pass
+    """
+
+    @property
+    def name(self) -> str:
+        """
+        The name of the delete strategy.
+        """
+        pass
+
+    @abstractmethod
+    def apply_deletes(
+        self,
+        table: Optional[pa.Table],
+        delete_file_envelope: DeleteFileEnvelope,
+        *args,
+        **kwargs,
+    ) -> Tuple[pa.Table, int]:
+        """
+        Apply delete operations on the given table using the provided delete file envelope.
+
+        Args:
+            table (Optional[pa.Table]): The pyarrow table to apply deletes on.
+            delete_file_envelope (DeleteFileEnvelope): The delete file envelope containing delete parameters.
+
+        Returns:
+            Tuple[pa.Table, int]: A tuple containing the updated Arrow table after applying deletes,
+                and the number of rows deleted.
+        """
+        pass
+
+    @abstractmethod
+    def apply_many_deletes(
+        self,
+        table: Optional[pa.Table],
+        delete_file_envelopes: List[DeleteFileEnvelope],
+        *args,
+        **kwargs,
+    ) -> Tuple[pa.Table, int]:
+        """
+        Apply delete operations on the given table using all provided delete file envelopes.
+
+        Args:
+            table (Optional[pa.Table]): The Arrow table to apply deletes on.
+            delete_file_envelopes (List[DeleteFileEnvelope]): A list of delete file envelopes containing delete parameters.
+
+        Returns:
+            Tuple[pa.Table, int]: A tuple containing the updated Arrow table after applying all deletes,
+                and the total number of rows deleted.
+        """
+        pass
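
To make the contract concrete, a toy single-column equality-delete implementation of the interface above. This is a from-scratch sketch, not the package's EqualityDeleteStrategy from delete_strategy_equality_delete.py (which this diff lists but does not show); it assumes envelopes whose `table` property is readable, e.g. inside a Ray session:

    from typing import List, Optional, Tuple

    import pyarrow as pa
    import pyarrow.compute as pc

    from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
        DeleteFileEnvelope,
    )
    from deltacat.compute.compactor_v2.deletes.delete_strategy import DeleteStrategy

    class NaiveEqualityDeleteStrategy(DeleteStrategy):
        """Toy sketch: single-column equality deletes via pyarrow set membership."""

        @property
        def name(self) -> str:
            return "NaiveEqualityDeleteStrategy"

        def apply_deletes(
            self,
            table: Optional[pa.Table],
            delete_file_envelope: DeleteFileEnvelope,
            *args,
            **kwargs,
        ) -> Tuple[pa.Table, int]:
            if table is None:
                return table, 0
            # Drop every row whose key appears in the delete envelope's table.
            col = delete_file_envelope.delete_columns[0]
            delete_keys = pa.array(delete_file_envelope.table.column(col).to_pylist())
            mask = pc.is_in(table.column(col), value_set=delete_keys)
            kept = table.filter(pc.invert(mask))
            return kept, table.num_rows - kept.num_rows

        def apply_many_deletes(
            self,
            table: Optional[pa.Table],
            delete_file_envelopes: List[DeleteFileEnvelope],
            *args,
            **kwargs,
        ) -> Tuple[pa.Table, int]:
            # Fold apply_deletes over every envelope, accumulating the delete count.
            total_deleted = 0
            for dfe in delete_file_envelopes:
                table, deleted = self.apply_deletes(table, dfe)
                total_deleted += deleted
            return table, total_deleted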