deltacat 1.1.13__py3-none-any.whl → 1.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +3 -2
- deltacat/compute/compactor/model/compact_partition_params.py +11 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +2 -2
- deltacat/compute/compactor/model/delta_annotated.py +2 -4
- deltacat/compute/compactor/steps/hash_bucket.py +2 -3
- deltacat/compute/compactor_v2/compaction_session.py +26 -27
- deltacat/compute/compactor_v2/constants.py +4 -0
- deltacat/compute/compactor_v2/private/__init__.py +0 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +753 -0
- deltacat/compute/compactor_v2/steps/merge.py +0 -3
- deltacat/compute/compactor_v2/utils/delta.py +2 -3
- deltacat/compute/compactor_v2/utils/io.py +0 -2
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +855 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +1 -1
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +330 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +118 -0
- deltacat/tests/local_deltacat_storage/__init__.py +8 -5
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/METADATA +1 -1
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/RECORD +25 -21
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/LICENSE +0 -0
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/WHEEL +0 -0
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -688,8 +688,9 @@ def _execute_compaction_round(
|
|
688
688
|
session_peak_memory
|
689
689
|
)
|
690
690
|
|
691
|
-
compaction_audit.save_round_completion_stats(
|
692
|
-
|
691
|
+
compaction_audit.save_round_completion_stats(mat_results)
|
692
|
+
compaction_audit.set_telemetry_time_in_seconds(
|
693
|
+
telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
|
693
694
|
)
|
694
695
|
|
695
696
|
s3_utils.upload(
|
@@ -22,6 +22,7 @@ from deltacat.compute.compactor_v2.constants import (
|
|
22
22
|
DROP_DUPLICATES,
|
23
23
|
TOTAL_MEMORY_BUFFER_PERCENTAGE,
|
24
24
|
DEFAULT_DISABLE_COPY_BY_REFERENCE,
|
25
|
+
DEFAULT_NUM_ROUNDS,
|
25
26
|
)
|
26
27
|
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
27
28
|
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
|
@@ -102,6 +103,8 @@ class CompactPartitionParams(dict):
|
|
102
103
|
|
103
104
|
result.metrics_config = params.get("metrics_config")
|
104
105
|
|
106
|
+
result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
|
107
|
+
|
105
108
|
if not importlib.util.find_spec("memray"):
|
106
109
|
result.enable_profiler = False
|
107
110
|
|
@@ -189,7 +192,6 @@ class CompactPartitionParams(dict):
|
|
189
192
|
cluster_resources = self.pg_config.resource
|
190
193
|
cluster_cpus = cluster_resources["CPU"]
|
191
194
|
self.task_max_parallelism = cluster_cpus
|
192
|
-
self["task_max_parallelism"] = self.task_max_parallelism
|
193
195
|
return self["task_max_parallelism"]
|
194
196
|
|
195
197
|
@task_max_parallelism.setter
|
@@ -403,6 +405,14 @@ class CompactPartitionParams(dict):
|
|
403
405
|
def metrics_config(self, config: MetricsConfig) -> None:
|
404
406
|
self["metrics_config"] = config
|
405
407
|
|
408
|
+
@property
|
409
|
+
def num_rounds(self) -> int:
|
410
|
+
return self["num_rounds"]
|
411
|
+
|
412
|
+
@num_rounds.setter
|
413
|
+
def num_rounds(self, num_rounds: int) -> None:
|
414
|
+
self["num_rounds"] = num_rounds
|
415
|
+
|
406
416
|
@staticmethod
|
407
417
|
def json_handler_for_compact_partition_params(obj):
|
408
418
|
"""
|
@@ -818,7 +818,8 @@ class CompactionSessionAuditInfo(dict):
|
|
818
818
|
return cluster_util_after_task_latency + telemetry_time
|
819
819
|
|
820
820
|
def save_round_completion_stats(
|
821
|
-
self,
|
821
|
+
self,
|
822
|
+
mat_results: List[MaterializeResult],
|
822
823
|
) -> None:
|
823
824
|
"""
|
824
825
|
This method saves all the relevant stats after all the steps are completed.
|
@@ -888,4 +889,3 @@ class CompactionSessionAuditInfo(dict):
|
|
888
889
|
)
|
889
890
|
|
890
891
|
self.set_pyarrow_version(pa.__version__)
|
891
|
-
self.set_telemetry_time_in_seconds(total_telemetry_time)
|
@@ -97,8 +97,7 @@ class DeltaAnnotated(Delta):
|
|
97
97
|
for src_da in split_annotated_deltas:
|
98
98
|
src_da_annotations = src_da.annotations
|
99
99
|
src_da_entries = src_da.manifest.entries
|
100
|
-
assert (
|
101
|
-
len(src_da_annotations) == len(src_da_entries),
|
100
|
+
assert len(src_da_annotations) == len(src_da_entries), (
|
102
101
|
f"Unexpected Error: Length of delta annotations "
|
103
102
|
f"({len(src_da_annotations)}) doesn't mach the length of "
|
104
103
|
f"delta manifest entries ({len(src_da_entries)}).",
|
@@ -152,8 +151,7 @@ class DeltaAnnotated(Delta):
|
|
152
151
|
da_group_entry_count = 0
|
153
152
|
src_da_annotations = src_da.annotations
|
154
153
|
src_da_entries = src_da.manifest.entries
|
155
|
-
assert (
|
156
|
-
len(src_da_annotations) == len(src_da_entries),
|
154
|
+
assert len(src_da_annotations) == len(src_da_entries), (
|
157
155
|
f"Unexpected Error: Length of delta annotations "
|
158
156
|
f"({len(src_da_annotations)}) doesn't mach the length of "
|
159
157
|
f"delta manifest entries ({len(src_da_entries)}).",
|
@@ -157,11 +157,10 @@ def _read_delta_file_envelopes(
|
|
157
157
|
**deltacat_storage_kwargs,
|
158
158
|
)
|
159
159
|
annotations = annotated_delta.annotations
|
160
|
-
assert (
|
161
|
-
len(tables) == len(annotations),
|
160
|
+
assert len(tables) == len(annotations), (
|
162
161
|
f"Unexpected Error: Length of downloaded delta manifest tables "
|
163
162
|
f"({len(tables)}) doesn't match the length of delta manifest "
|
164
|
-
f"annotations ({len(annotations)})."
|
163
|
+
f"annotations ({len(annotations)})."
|
165
164
|
)
|
166
165
|
if not tables:
|
167
166
|
return None, 0
|
@@ -1,4 +1,3 @@
|
|
1
|
-
import numpy as np
|
2
1
|
import importlib
|
3
2
|
from contextlib import nullcontext
|
4
3
|
import logging
|
@@ -40,6 +39,8 @@ from deltacat.utils.resources import (
|
|
40
39
|
from deltacat.compute.compactor_v2.private.compaction_utils import (
|
41
40
|
_fetch_compaction_metadata,
|
42
41
|
_build_uniform_deltas,
|
42
|
+
_group_uniform_deltas,
|
43
|
+
_stage_new_partition,
|
43
44
|
_run_hash_and_merge,
|
44
45
|
_process_merge_results,
|
45
46
|
_upload_compaction_audit,
|
@@ -69,6 +70,10 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
|
|
69
70
|
assert (
|
70
71
|
params.hash_bucket_count is not None and params.hash_bucket_count >= 1
|
71
72
|
), "hash_bucket_count is a required arg for compactor v2"
|
73
|
+
if params.num_rounds > 1:
|
74
|
+
assert (
|
75
|
+
not params.drop_duplicates
|
76
|
+
), "num_rounds > 1, drop_duplicates must be False but is True"
|
72
77
|
|
73
78
|
with memray.Tracker(
|
74
79
|
"compaction_partition.bin"
|
@@ -144,32 +149,28 @@ def _execute_compaction(
|
|
144
149
|
delete_strategy,
|
145
150
|
delete_file_envelopes,
|
146
151
|
) = build_uniform_deltas_result
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
telemetry_time_hb,
|
166
|
-
telemetry_time_merge,
|
167
|
-
compacted_partition,
|
168
|
-
) = _run_hash_and_merge_result
|
152
|
+
logger.info(f"Number of rounds parameter is set to: {params.num_rounds}")
|
153
|
+
uniform_deltas_grouped = _group_uniform_deltas(params, uniform_deltas)
|
154
|
+
logger.info(f"Length of grouped uniform deltas is: {len(uniform_deltas_grouped)}")
|
155
|
+
merge_result_list: List[MergeResult] = []
|
156
|
+
compacted_partition = _stage_new_partition(params)
|
157
|
+
for uniform_deltas in uniform_deltas_grouped:
|
158
|
+
# run hash and merge
|
159
|
+
_run_hash_and_merge_result: List[MergeResult] = _run_hash_and_merge(
|
160
|
+
params,
|
161
|
+
uniform_deltas,
|
162
|
+
round_completion_info,
|
163
|
+
delete_strategy,
|
164
|
+
delete_file_envelopes,
|
165
|
+
compaction_audit,
|
166
|
+
previous_compacted_delta_manifest,
|
167
|
+
compacted_partition,
|
168
|
+
)
|
169
|
+
merge_result_list.extend(_run_hash_and_merge_result)
|
169
170
|
# process merge results
|
170
171
|
process_merge_results: tuple[
|
171
172
|
Delta, list[MaterializeResult], dict
|
172
|
-
] = _process_merge_results(params,
|
173
|
+
] = _process_merge_results(params, merge_result_list, compaction_audit)
|
173
174
|
merged_delta, mat_results, hb_id_to_entry_indices_range = process_merge_results
|
174
175
|
# Record information, logging, and return ExecutionCompactionResult
|
175
176
|
record_info_msg: str = f" Materialized records: {merged_delta.meta.record_count}"
|
@@ -198,9 +199,7 @@ def _execute_compaction(
|
|
198
199
|
session_peak_memory
|
199
200
|
)
|
200
201
|
|
201
|
-
compaction_audit.save_round_completion_stats(
|
202
|
-
mat_results, telemetry_time_hb + telemetry_time_merge
|
203
|
-
)
|
202
|
+
compaction_audit.save_round_completion_stats(mat_results)
|
204
203
|
|
205
204
|
_upload_compaction_audit(
|
206
205
|
params,
|
@@ -71,3 +71,7 @@ PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
|
|
71
71
|
|
72
72
|
# Metric prefix for compact partition method
|
73
73
|
COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
|
74
|
+
|
75
|
+
# Number of rounds to run hash/merge for a single
|
76
|
+
# partition. (For large table support)
|
77
|
+
DEFAULT_NUM_ROUNDS = 1
|
File without changes
|