deltacat 1.1.13__py3-none-any.whl → 1.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +3 -2
  3. deltacat/compute/compactor/model/compact_partition_params.py +11 -1
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +2 -2
  5. deltacat/compute/compactor/model/delta_annotated.py +2 -4
  6. deltacat/compute/compactor/steps/hash_bucket.py +2 -3
  7. deltacat/compute/compactor_v2/compaction_session.py +26 -27
  8. deltacat/compute/compactor_v2/constants.py +4 -0
  9. deltacat/compute/compactor_v2/private/__init__.py +0 -0
  10. deltacat/compute/compactor_v2/private/compaction_utils.py +753 -0
  11. deltacat/compute/compactor_v2/steps/merge.py +0 -3
  12. deltacat/compute/compactor_v2/utils/delta.py +2 -3
  13. deltacat/compute/compactor_v2/utils/io.py +0 -2
  14. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  15. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +855 -0
  16. deltacat/tests/compute/compactor_v2/test_compaction_session.py +1 -1
  17. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +330 -0
  18. deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
  19. deltacat/tests/compute/test_util_create_table_deltas_repo.py +118 -0
  20. deltacat/tests/local_deltacat_storage/__init__.py +8 -5
  21. {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/METADATA +1 -1
  22. {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/RECORD +25 -21
  23. {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/LICENSE +0 -0
  24. {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/WHEEL +0 -0
  25. {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.13"
47
+ __version__ = "1.1.15"
48
48
 
49
49
 
50
50
  __all__ = [
deltacat/compute/compactor/compaction_session.py CHANGED
@@ -688,8 +688,9 @@ def _execute_compaction_round(
688
688
  session_peak_memory
689
689
  )
690
690
 
691
- compaction_audit.save_round_completion_stats(
692
- mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
691
+ compaction_audit.save_round_completion_stats(mat_results)
692
+ compaction_audit.set_telemetry_time_in_seconds(
693
+ telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
693
694
  )
694
695
 
695
696
  s3_utils.upload(
deltacat/compute/compactor/model/compact_partition_params.py CHANGED
@@ -22,6 +22,7 @@ from deltacat.compute.compactor_v2.constants import (
22
22
  DROP_DUPLICATES,
23
23
  TOTAL_MEMORY_BUFFER_PERCENTAGE,
24
24
  DEFAULT_DISABLE_COPY_BY_REFERENCE,
25
+ DEFAULT_NUM_ROUNDS,
25
26
  )
26
27
  from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
27
28
  from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -102,6 +103,8 @@ class CompactPartitionParams(dict):
102
103
 
103
104
  result.metrics_config = params.get("metrics_config")
104
105
 
106
+ result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
107
+
105
108
  if not importlib.util.find_spec("memray"):
106
109
  result.enable_profiler = False
107
110
 
@@ -189,7 +192,6 @@ class CompactPartitionParams(dict):
189
192
  cluster_resources = self.pg_config.resource
190
193
  cluster_cpus = cluster_resources["CPU"]
191
194
  self.task_max_parallelism = cluster_cpus
192
- self["task_max_parallelism"] = self.task_max_parallelism
193
195
  return self["task_max_parallelism"]
194
196
 
195
197
  @task_max_parallelism.setter
@@ -403,6 +405,14 @@ class CompactPartitionParams(dict):
403
405
  def metrics_config(self, config: MetricsConfig) -> None:
404
406
  self["metrics_config"] = config
405
407
 
408
+ @property
409
+ def num_rounds(self) -> int:
410
+ return self["num_rounds"]
411
+
412
+ @num_rounds.setter
413
+ def num_rounds(self, num_rounds: int) -> None:
414
+ self["num_rounds"] = num_rounds
415
+
406
416
  @staticmethod
407
417
  def json_handler_for_compact_partition_params(obj):
408
418
  """
deltacat/compute/compactor/model/compaction_session_audit_info.py CHANGED
@@ -818,7 +818,8 @@ class CompactionSessionAuditInfo(dict):
818
818
  return cluster_util_after_task_latency + telemetry_time
819
819
 
820
820
  def save_round_completion_stats(
821
- self, mat_results: List[MaterializeResult], total_telemetry_time: float
821
+ self,
822
+ mat_results: List[MaterializeResult],
822
823
  ) -> None:
823
824
  """
824
825
  This method saves all the relevant stats after all the steps are completed.
@@ -888,4 +889,3 @@ class CompactionSessionAuditInfo(dict):
888
889
  )
889
890
 
890
891
  self.set_pyarrow_version(pa.__version__)
891
- self.set_telemetry_time_in_seconds(total_telemetry_time)
deltacat/compute/compactor/model/delta_annotated.py CHANGED
@@ -97,8 +97,7 @@ class DeltaAnnotated(Delta):
97
97
  for src_da in split_annotated_deltas:
98
98
  src_da_annotations = src_da.annotations
99
99
  src_da_entries = src_da.manifest.entries
100
- assert (
101
- len(src_da_annotations) == len(src_da_entries),
100
+ assert len(src_da_annotations) == len(src_da_entries), (
102
101
  f"Unexpected Error: Length of delta annotations "
103
102
  f"({len(src_da_annotations)}) doesn't mach the length of "
104
103
  f"delta manifest entries ({len(src_da_entries)}).",
@@ -152,8 +151,7 @@ class DeltaAnnotated(Delta):
152
151
  da_group_entry_count = 0
153
152
  src_da_annotations = src_da.annotations
154
153
  src_da_entries = src_da.manifest.entries
155
- assert (
156
- len(src_da_annotations) == len(src_da_entries),
154
+ assert len(src_da_annotations) == len(src_da_entries), (
157
155
  f"Unexpected Error: Length of delta annotations "
158
156
  f"({len(src_da_annotations)}) doesn't mach the length of "
159
157
  f"delta manifest entries ({len(src_da_entries)}).",
deltacat/compute/compactor/steps/hash_bucket.py CHANGED
@@ -157,11 +157,10 @@ def _read_delta_file_envelopes(
157
157
  **deltacat_storage_kwargs,
158
158
  )
159
159
  annotations = annotated_delta.annotations
160
- assert (
161
- len(tables) == len(annotations),
160
+ assert len(tables) == len(annotations), (
162
161
  f"Unexpected Error: Length of downloaded delta manifest tables "
163
162
  f"({len(tables)}) doesn't match the length of delta manifest "
164
- f"annotations ({len(annotations)}).",
163
+ f"annotations ({len(annotations)})."
165
164
  )
166
165
  if not tables:
167
166
  return None, 0
deltacat/compute/compactor_v2/compaction_session.py CHANGED
@@ -1,4 +1,3 @@
1
- import numpy as np
2
1
  import importlib
3
2
  from contextlib import nullcontext
4
3
  import logging
@@ -40,6 +39,8 @@ from deltacat.utils.resources import (
40
39
  from deltacat.compute.compactor_v2.private.compaction_utils import (
41
40
  _fetch_compaction_metadata,
42
41
  _build_uniform_deltas,
42
+ _group_uniform_deltas,
43
+ _stage_new_partition,
43
44
  _run_hash_and_merge,
44
45
  _process_merge_results,
45
46
  _upload_compaction_audit,
@@ -69,6 +70,10 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
69
70
  assert (
70
71
  params.hash_bucket_count is not None and params.hash_bucket_count >= 1
71
72
  ), "hash_bucket_count is a required arg for compactor v2"
73
+ if params.num_rounds > 1:
74
+ assert (
75
+ not params.drop_duplicates
76
+ ), "num_rounds > 1, drop_duplicates must be False but is True"
72
77
 
73
78
  with memray.Tracker(
74
79
  "compaction_partition.bin"
@@ -144,32 +149,28 @@ def _execute_compaction(
144
149
  delete_strategy,
145
150
  delete_file_envelopes,
146
151
  ) = build_uniform_deltas_result
147
-
148
- # run merge
149
- _run_hash_and_merge_result: tuple[
150
- Optional[List[MergeResult]],
151
- np.float64,
152
- np.float64,
153
- Partition,
154
- ] = _run_hash_and_merge(
155
- params,
156
- uniform_deltas,
157
- round_completion_info,
158
- delete_strategy,
159
- delete_file_envelopes,
160
- compaction_audit,
161
- previous_compacted_delta_manifest,
162
- )
163
- (
164
- merge_results,
165
- telemetry_time_hb,
166
- telemetry_time_merge,
167
- compacted_partition,
168
- ) = _run_hash_and_merge_result
152
+ logger.info(f"Number of rounds parameter is set to: {params.num_rounds}")
153
+ uniform_deltas_grouped = _group_uniform_deltas(params, uniform_deltas)
154
+ logger.info(f"Length of grouped uniform deltas is: {len(uniform_deltas_grouped)}")
155
+ merge_result_list: List[MergeResult] = []
156
+ compacted_partition = _stage_new_partition(params)
157
+ for uniform_deltas in uniform_deltas_grouped:
158
+ # run hash and merge
159
+ _run_hash_and_merge_result: List[MergeResult] = _run_hash_and_merge(
160
+ params,
161
+ uniform_deltas,
162
+ round_completion_info,
163
+ delete_strategy,
164
+ delete_file_envelopes,
165
+ compaction_audit,
166
+ previous_compacted_delta_manifest,
167
+ compacted_partition,
168
+ )
169
+ merge_result_list.extend(_run_hash_and_merge_result)
169
170
  # process merge results
170
171
  process_merge_results: tuple[
171
172
  Delta, list[MaterializeResult], dict
172
- ] = _process_merge_results(params, merge_results, compaction_audit)
173
+ ] = _process_merge_results(params, merge_result_list, compaction_audit)
173
174
  merged_delta, mat_results, hb_id_to_entry_indices_range = process_merge_results
174
175
  # Record information, logging, and return ExecutionCompactionResult
175
176
  record_info_msg: str = f" Materialized records: {merged_delta.meta.record_count}"
@@ -198,9 +199,7 @@ def _execute_compaction(
198
199
  session_peak_memory
199
200
  )
200
201
 
201
- compaction_audit.save_round_completion_stats(
202
- mat_results, telemetry_time_hb + telemetry_time_merge
203
- )
202
+ compaction_audit.save_round_completion_stats(mat_results)
204
203
 
205
204
  _upload_compaction_audit(
206
205
  params,
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -71,3 +71,7 @@ PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
71
71
 
72
72
  # Metric prefix for compact partition method
73
73
  COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
74
+
75
+ # Number of rounds to run hash/merge for a single
76
+ # partition. (For large table support)
77
+ DEFAULT_NUM_ROUNDS = 1
File without changes