deltacat 1.1.14__py3-none-any.whl → 1.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +3 -2
- deltacat/compute/compactor/model/compact_partition_params.py +11 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +2 -2
- deltacat/compute/compactor/model/delta_annotated.py +2 -4
- deltacat/compute/compactor/steps/hash_bucket.py +2 -3
- deltacat/compute/compactor_v2/compaction_session.py +27 -33
- deltacat/compute/compactor_v2/constants.py +4 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +112 -67
- deltacat/compute/compactor_v2/steps/merge.py +0 -3
- deltacat/compute/compactor_v2/utils/delta.py +2 -3
- deltacat/compute/compactor_v2/utils/io.py +0 -2
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +855 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +147 -1
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +330 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +118 -0
- deltacat/tests/local_deltacat_storage/__init__.py +8 -5
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/METADATA +1 -1
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/RECORD +24 -22
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/LICENSE +0 -0
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/WHEEL +0 -0
- {deltacat-1.1.14.dist-info → deltacat-1.1.16.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/compute/compactor/compaction_session.py
CHANGED
@@ -688,8 +688,9 @@ def _execute_compaction_round(
            session_peak_memory
        )

-        compaction_audit.save_round_completion_stats(
-            mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
+        compaction_audit.save_round_completion_stats(mat_results)
+        compaction_audit.set_telemetry_time_in_seconds(
+            telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
        )

        s3_utils.upload(
deltacat/compute/compactor/model/compact_partition_params.py
CHANGED
@@ -22,6 +22,7 @@ from deltacat.compute.compactor_v2.constants import (
     DROP_DUPLICATES,
     TOTAL_MEMORY_BUFFER_PERCENTAGE,
     DEFAULT_DISABLE_COPY_BY_REFERENCE,
+    DEFAULT_NUM_ROUNDS,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -102,6 +103,8 @@ class CompactPartitionParams(dict):

         result.metrics_config = params.get("metrics_config")

+        result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
+
         if not importlib.util.find_spec("memray"):
             result.enable_profiler = False

@@ -189,7 +192,6 @@ class CompactPartitionParams(dict):
             cluster_resources = self.pg_config.resource
             cluster_cpus = cluster_resources["CPU"]
             self.task_max_parallelism = cluster_cpus
-            self["task_max_parallelism"] = self.task_max_parallelism
         return self["task_max_parallelism"]

     @task_max_parallelism.setter
@@ -403,6 +405,14 @@ class CompactPartitionParams(dict):
     def metrics_config(self, config: MetricsConfig) -> None:
         self["metrics_config"] = config

+    @property
+    def num_rounds(self) -> int:
+        return self["num_rounds"]
+
+    @num_rounds.setter
+    def num_rounds(self, num_rounds: int) -> None:
+        self["num_rounds"] = num_rounds
+
     @staticmethod
     def json_handler_for_compact_partition_params(obj):
         """
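The new num_rounds accessor follows the same dict-backed property pattern as the other CompactPartitionParams fields: the value is stored under a dict key and read back through a property, with a default applied when the caller omits it. A minimal self-contained sketch of that pattern (ParamsDict and its of() builder are hypothetical stand-ins for illustration, not deltacat classes):

# Illustrative sketch of the dict-backed property pattern; names here
# are hypothetical, not deltacat's actual API.
class ParamsDict(dict):
    DEFAULT_NUM_ROUNDS = 1

    @classmethod
    def of(cls, params: dict) -> "ParamsDict":
        result = cls(params)
        # fall back to the default when the caller omits the key
        result.num_rounds = params.get("num_rounds", cls.DEFAULT_NUM_ROUNDS)
        return result

    @property
    def num_rounds(self) -> int:
        return self["num_rounds"]

    @num_rounds.setter
    def num_rounds(self, num_rounds: int) -> None:
        self["num_rounds"] = num_rounds


params = ParamsDict.of({"some_other_param": "x"})
assert params.num_rounds == 1
params.num_rounds = 4
assert params["num_rounds"] == 4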
deltacat/compute/compactor/model/compaction_session_audit_info.py
CHANGED
@@ -818,7 +818,8 @@ class CompactionSessionAuditInfo(dict):
         return cluster_util_after_task_latency + telemetry_time

     def save_round_completion_stats(
-        self,
+        self,
+        mat_results: List[MaterializeResult],
     ) -> None:
         """
         This method saves all the relevant stats after all the steps are completed.
@@ -888,4 +889,3 @@ class CompactionSessionAuditInfo(dict):
         )

         self.set_pyarrow_version(pa.__version__)
-        self.set_telemetry_time_in_seconds(total_telemetry_time)
deltacat/compute/compactor/model/delta_annotated.py
CHANGED
@@ -97,8 +97,7 @@ class DeltaAnnotated(Delta):
         for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
-            assert (
-                len(src_da_annotations) == len(src_da_entries),
+            assert len(src_da_annotations) == len(src_da_entries), (
                 f"Unexpected Error: Length of delta annotations "
                 f"({len(src_da_annotations)}) doesn't mach the length of "
                 f"delta manifest entries ({len(src_da_entries)}).",
@@ -152,8 +151,7 @@ class DeltaAnnotated(Delta):
             da_group_entry_count = 0
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
-            assert (
-                len(src_da_annotations) == len(src_da_entries),
+            assert len(src_da_annotations) == len(src_da_entries), (
                 f"Unexpected Error: Length of delta annotations "
                 f"({len(src_da_annotations)}) doesn't mach the length of "
                 f"delta manifest entries ({len(src_da_entries)}).",
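Both hunks above (and the matching ones in hash_bucket.py and utils/delta.py below) fix the same Python pitfall: parenthesizing an assert's condition and message together creates a two-element tuple, and a non-empty tuple is always truthy, so the old assertions could never fail. A small standalone illustration (not deltacat code):

# A parenthesized (condition, message) pair is a tuple; any non-empty
# tuple is truthy, so this "assertion" never fires (CPython even emits
# a SyntaxWarning: "assertion is always true, perhaps remove parentheses?").
assert (
    1 == 2,
    "this message is never shown and the assert never fails",
)

# The corrected form keeps the condition as the assert's operand and
# only parenthesizes the message, so it fails as intended:
try:
    assert 1 == 2, ("lengths do not match")
except AssertionError as e:
    print("assertion fired:", e)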
deltacat/compute/compactor/steps/hash_bucket.py
CHANGED
@@ -157,11 +157,10 @@ def _read_delta_file_envelopes(
             **deltacat_storage_kwargs,
         )
         annotations = annotated_delta.annotations
-        assert (
-            len(tables) == len(annotations),
+        assert len(tables) == len(annotations), (
             f"Unexpected Error: Length of downloaded delta manifest tables "
             f"({len(tables)}) doesn't match the length of delta manifest "
-            f"annotations ({len(annotations)})."
+            f"annotations ({len(annotations)})."
         )
         if not tables:
             return None, 0
deltacat/compute/compactor_v2/compaction_session.py
CHANGED
@@ -1,4 +1,3 @@
-import numpy as np
 import importlib
 from contextlib import nullcontext
 import logging
@@ -40,9 +39,10 @@ from deltacat.utils.resources import (
 from deltacat.compute.compactor_v2.private.compaction_utils import (
     _fetch_compaction_metadata,
     _build_uniform_deltas,
+    _group_uniform_deltas,
+    _stage_new_partition,
     _run_hash_and_merge,
     _process_merge_results,
-    _upload_compaction_audit,
     _write_new_round_completion_file,
     _commit_compaction_result,
 )
@@ -69,6 +69,10 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
     assert (
         params.hash_bucket_count is not None and params.hash_bucket_count >= 1
     ), "hash_bucket_count is a required arg for compactor v2"
+    if params.num_rounds > 1:
+        assert (
+            not params.drop_duplicates
+        ), "num_rounds > 1, drop_duplicates must be False but is True"

     with memray.Tracker(
         "compaction_partition.bin"
@@ -144,32 +148,28 @@ def _execute_compaction(
         delete_strategy,
         delete_file_envelopes,
     ) = build_uniform_deltas_result
-        telemetry_time_hb,
-        telemetry_time_merge,
-        compacted_partition,
-    ) = _run_hash_and_merge_result
+    logger.info(f"Number of rounds parameter is set to: {params.num_rounds}")
+    uniform_deltas_grouped = _group_uniform_deltas(params, uniform_deltas)
+    logger.info(f"Length of grouped uniform deltas is: {len(uniform_deltas_grouped)}")
+    merge_result_list: List[MergeResult] = []
+    compacted_partition = _stage_new_partition(params)
+    for uniform_deltas in uniform_deltas_grouped:
+        # run hash and merge
+        _run_hash_and_merge_result: List[MergeResult] = _run_hash_and_merge(
+            params,
+            uniform_deltas,
+            round_completion_info,
+            delete_strategy,
+            delete_file_envelopes,
+            compaction_audit,
+            previous_compacted_delta_manifest,
+            compacted_partition,
+        )
+        merge_result_list.extend(_run_hash_and_merge_result)
     # process merge results
     process_merge_results: tuple[
         Delta, list[MaterializeResult], dict
-    ] = _process_merge_results(params,
+    ] = _process_merge_results(params, merge_result_list, compaction_audit)
     merged_delta, mat_results, hb_id_to_entry_indices_range = process_merge_results
     # Record information, logging, and return ExecutionCompactionResult
     record_info_msg: str = f" Materialized records: {merged_delta.meta.record_count}"
@@ -198,15 +198,8 @@ def _execute_compaction(
         session_peak_memory
     )

-    compaction_audit.save_round_completion_stats(
-        mat_results, telemetry_time_hb + telemetry_time_merge
-    )
+    compaction_audit.save_round_completion_stats(mat_results)

-    _upload_compaction_audit(
-        params,
-        compaction_audit,
-        round_completion_info,
-    )
     compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
         params,
         compaction_audit,
@@ -216,5 +209,6 @@ def _execute_compaction(
         rcf_source_partition_locator,
         new_compacted_delta_locator,
         pyarrow_write_result,
+        round_completion_info,
     )
     return compaction_result
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -71,3 +71,7 @@ PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"

 # Metric prefix for compact partition method
 COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
+
+# Number of rounds to run hash/merge for a single
+# partition. (For large table support)
+DEFAULT_NUM_ROUNDS = 1
deltacat/compute/compactor_v2/private/compaction_utils.py
CHANGED
@@ -4,8 +4,10 @@ import logging
 import ray
 import time
 import json
+from math import ceil

 from deltacat.compute.compactor import (
+    PyArrowWriteResult,
     HighWatermark,
     RoundCompletionInfo,
 )
@@ -44,10 +46,11 @@ from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
 from deltacat.storage import (
     Delta,
     DeltaType,
-    StreamLocator,
+    DeltaLocator,
     Partition,
     Manifest,
+    Stream,
+    StreamLocator,
 )
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
@@ -60,7 +63,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
 from deltacat.compute.compactor_v2.steps import hash_bucket as hb
 from deltacat.compute.compactor_v2.utils import io

-from typing import
+from typing import List, Optional
 from collections import defaultdict
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
@@ -123,9 +126,9 @@ def _fetch_compaction_metadata(

 def _build_uniform_deltas(
     params: CompactPartitionParams,
-    mutable_compaction_audit,
-    input_deltas,
-    delta_discovery_start,
+    mutable_compaction_audit: CompactionSessionAuditInfo,
+    input_deltas: List[Delta],
+    delta_discovery_start: float,
 ) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition]:

     delete_strategy: Optional[DeleteStrategy] = None
@@ -173,18 +176,34 @@ def _build_uniform_deltas(
     )


-def
-    params: CompactPartitionParams,
+def _group_uniform_deltas(
+    params: CompactPartitionParams, uniform_deltas: List[DeltaAnnotated]
+) -> List[List[DeltaAnnotated]]:
+    num_deltas = len(uniform_deltas)
+    num_rounds = params.num_rounds
+    if num_rounds == 1:
+        return [uniform_deltas]
+    assert (
+        num_rounds > 0
+    ), f"num_rounds parameter should be greater than zero but is {params.num_rounds}"
+    assert (
+        num_rounds <= num_deltas
+    ), f"{params.num_rounds} rounds should be less than the number of uniform deltas, which is {len(uniform_deltas)}"
+    size = ceil(num_deltas / num_rounds)
+    uniform_deltas_grouped = list(
+        map(
+            lambda x: uniform_deltas[x * size : x * size + size],
+            list(range(num_rounds)),
+        )
+    )
+    num_deltas_after_grouping = sum(len(sublist) for sublist in uniform_deltas_grouped)
+    assert (
+        num_deltas_after_grouping == num_deltas
+    ), f"uniform_deltas_grouped expected to have {num_deltas} deltas, but has {num_deltas_after_grouping}"
+    return uniform_deltas_grouped
+
+
+def _stage_new_partition(params: CompactPartitionParams) -> Partition:
     compacted_stream_locator: Optional[
         StreamLocator
     ] = params.destination_partition_locator.stream_locator
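The grouping above simply chunks the uniform deltas into num_rounds contiguous slices of ceil(num_deltas / num_rounds) deltas each. A standalone sketch of that slicing math with a small worked example (plain strings stand in for the DeltaAnnotated objects; this is illustrative only):

from math import ceil

def group_into_rounds(items: list, num_rounds: int) -> list:
    # contiguous slices of ceil(len(items) / num_rounds) items each,
    # mirroring the slicing in _group_uniform_deltas above
    size = ceil(len(items) / num_rounds)
    return [items[i * size : i * size + size] for i in range(num_rounds)]

# 7 deltas over 3 rounds -> slices of size 3, 3, 1
print(group_into_rounds(["d1", "d2", "d3", "d4", "d5", "d6", "d7"], 3))
# [['d1', 'd2', 'd3'], ['d4', 'd5', 'd6'], ['d7']]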
@@ -199,7 +218,19 @@ def _run_hash_and_merge(
         params.destination_partition_locator.partition_values,
         **params.deltacat_storage_kwargs,
     )
+    return compacted_partition
+

+def _run_hash_and_merge(
+    params: CompactPartitionParams,
+    uniform_deltas: List[DeltaAnnotated],
+    round_completion_info: RoundCompletionInfo,
+    delete_strategy: Optional[DeleteStrategy],
+    delete_file_envelopes: Optional[DeleteFileEnvelope],
+    mutable_compaction_audit: CompactionSessionAuditInfo,
+    previous_compacted_delta_manifest: Optional[Manifest],
+    compacted_partition: Partition,
+) -> List[MergeResult]:
     telemetry_time_hb = 0
     total_input_records_count = np.int64(0)
     total_hb_record_count = np.int64(0)
@@ -257,7 +288,6 @@ def _run_hash_and_merge(
     for hb_result in hb_results:
         hb_data_processed_size_bytes += hb_result.hb_size_bytes
         total_input_records_count += hb_result.hb_record_count
-
         for hash_group_index, object_id_size_tuple in enumerate(
             hb_result.hash_bucket_group_to_obj_id_tuple
         ):
@@ -271,7 +301,6 @@ def _run_hash_and_merge(
                 all_hash_group_idx_to_num_rows[
                     hash_group_index
                 ] += object_id_size_tuple[2].item()
-
     logger.info(
         f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
     )
@@ -330,26 +359,31 @@ _run_hash_and_merge(
         f" Deleted records: {total_deleted_record_count}, "
     )
     logger.info(record_info_msg)
+    telemetry_this_round = telemetry_time_hb + telemetry_time_merge
+    previous_telemetry = (
+        mutable_compaction_audit.telemetry_time_in_seconds
+        if mutable_compaction_audit.telemetry_time_in_seconds
+        else 0.0
     )

+    mutable_compaction_audit.set_telemetry_time_in_seconds(
+        telemetry_this_round + previous_telemetry
+    )
+    return merge_results
+

 def _merge(
     params: CompactPartitionParams,
-    task_resource_options_provider,
-    merge_resource_options_provider,
-    all_hash_group_idx_to_size_bytes,
-    all_hash_group_idx_to_num_rows,
-    round_completion_info,
-    previous_compacted_delta_manifest,
-    all_hash_group_idx_to_obj_id,
-    compacted_partition,
-    delete_strategy,
-    delete_file_envelopes,
+    task_resource_options_provider: callable,
+    merge_resource_options_provider: callable,
+    all_hash_group_idx_to_size_bytes: dict,
+    all_hash_group_idx_to_num_rows: dict,
+    round_completion_info: RoundCompletionInfo,
+    previous_compacted_delta_manifest: Manifest,
+    all_hash_group_idx_to_obj_id: dict,
+    compacted_partition: Partition,
+    delete_strategy: DeleteStrategy,
+    delete_file_envelopes: DeleteFileEnvelope,
 ) -> tuple[List[MergeResult], float]:
     merge_options_provider = functools.partial(
         task_resource_options_provider,
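Because _run_hash_and_merge now executes once per round, the hunk above accumulates telemetry into the audit rather than returning it: each round adds its own hash-bucket and merge telemetry to whatever earlier rounds recorded, treating an unset value as 0.0. A minimal standalone sketch of that running-total pattern (a plain dict stands in for the mutable compaction audit; this is an illustration, not deltacat code):

# Per-round telemetry accumulation sketch; the dict is a stand-in for
# the audit object used above.
audit = {}

def record_round_telemetry(audit: dict, telemetry_time_hb: float, telemetry_time_merge: float) -> None:
    telemetry_this_round = telemetry_time_hb + telemetry_time_merge
    previous_telemetry = audit.get("telemetry_time_in_seconds") or 0.0
    audit["telemetry_time_in_seconds"] = telemetry_this_round + previous_telemetry

record_round_telemetry(audit, 1.5, 2.0)    # round 1
record_round_telemetry(audit, 0.25, 0.25)  # round 2
assert audit["telemetry_time_in_seconds"] == 4.0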
@@ -416,8 +450,9 @@ def _merge(

 def _hash_bucket(
     params: CompactPartitionParams,
-    uniform_deltas,
-):
+    uniform_deltas: List[DeltaAnnotated],
+) -> tuple[List[HashBucketResult], float]:
+
     hb_options_provider = functools.partial(
         task_resource_options_provider,
         pg_config=params.pg_config,
@@ -455,7 +490,6 @@ def _hash_bucket(
         options_provider=hb_options_provider,
         kwargs_provider=hash_bucket_input_provider,
     )
-
     hb_invoke_end = time.monotonic()

     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
@@ -467,15 +501,15 @@

 def _run_local_merge(
     params: CompactPartitionParams,
-    uniform_deltas,
-    compacted_partition,
-    round_completion_info,
-    delete_strategy,
-    delete_file_envelopes,
-    mutable_compaction_audit,
-    previous_compacted_delta_manifest,
-    total_input_records_count,
-) -> tuple[
+    uniform_deltas: List[DeltaAnnotated],
+    compacted_partition: Partition,
+    round_completion_info: RoundCompletionInfo,
+    delete_strategy: Optional[DeleteStrategy],
+    delete_file_envelopes: Optional[DeleteFileEnvelope],
+    mutable_compaction_audit: CompactionSessionAuditInfo,
+    previous_compacted_delta_manifest: Optional[Manifest],
+    total_input_records_count: np.int64,
+) -> tuple[List[MergeResult], np.int64]:
     local_merge_input: MergeInput = generate_local_merge_input(
         params,
         uniform_deltas,
@@ -513,8 +547,10 @@ def _run_local_merge(


 def _process_merge_results(
-    params: CompactPartitionParams,
+    params: CompactPartitionParams,
+    merge_results: List[MergeResult],
+    mutable_compaction_audit: CompactionSessionAuditInfo,
+) -> tuple[Delta, List[MaterializeResult], dict]:
     mat_results = []
     for merge_result in merge_results:
         mat_results.extend(merge_result.materialize_results)
@@ -522,19 +558,23 @@ def _process_merge_results(
     mat_results: List[MaterializeResult] = sorted(
         mat_results, key=lambda m: m.task_index
     )
-
     hb_id_to_entry_indices_range = {}
     file_index = 0
     previous_task_index = -1

+    duplicate_hash_bucket_mat_results = 0
     for mat_result in mat_results:
         assert (
             mat_result.pyarrow_write_result.files >= 1
-        ), "
+        ), "At least one file must be materialized"
+        if mat_result.task_index == previous_task_index:
+            duplicate_hash_bucket_mat_results += 1
+        else:
+            duplicate_hash_bucket_mat_results = 0
+        assert duplicate_hash_bucket_mat_results < params.num_rounds, (
+            f"Duplicate record count ({duplicate_hash_bucket_mat_results}) is as large "
+            f"as or greater than params.num_rounds, which is {params.num_rounds}"
+        )
         hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
             file_index,
             file_index + mat_result.pyarrow_write_result.files,
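With multiple rounds, the same hash-bucket task index can legitimately appear once per round in the sorted materialize results, so the guard above only fails when one index repeats more than num_rounds times. A standalone sketch of that counting rule over plain integers (illustrative only; the update of previous_task_index is assumed here, since the hunk only shows the counting branch):

def check_duplicates(task_indices: list, num_rounds: int) -> None:
    # task_indices is assumed to be sorted, mirroring the sorted mat_results
    previous_task_index = -1
    duplicate_count = 0
    for task_index in task_indices:
        if task_index == previous_task_index:
            duplicate_count += 1
        else:
            duplicate_count = 0
        assert duplicate_count < num_rounds, (
            f"task index {task_index} appeared more than {num_rounds} times"
        )
        previous_task_index = task_index

# two rounds: each hash bucket index may appear at most twice
check_duplicates([0, 0, 1, 1, 2], num_rounds=2)   # passes
# check_duplicates([0, 0, 0], num_rounds=2)       # would raise AssertionError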
@@ -548,9 +588,7 @@ def _process_merge_results(
         str(json.dumps(mutable_compaction_audit)),
         **params.s3_client_kwargs,
     )
-
     deltas: List[Delta] = [m.delta for m in mat_results]
-
     # Note: An appropriate last stream position must be set
     # to avoid correctness issue.
     merged_delta: Delta = Delta.merge_deltas(
@@ -561,10 +599,10 @@ def _process_merge_results(
     return merged_delta, mat_results, hb_id_to_entry_indices_range


-def _upload_compaction_audit(
+def _update_and_upload_compaction_audit(
     params: CompactPartitionParams,
-    mutable_compaction_audit,
-    round_completion_info,
+    mutable_compaction_audit: CompactionSessionAuditInfo,
+    round_completion_info: Optional[RoundCompletionInfo] = None,
 ) -> None:

     # After all incremental delta related calculations, we update
@@ -593,13 +631,14 @@ def _upload_compaction_audit(

 def _write_new_round_completion_file(
     params: CompactPartitionParams,
-    mutable_compaction_audit,
-    compacted_partition,
-    audit_url,
-    hb_id_to_entry_indices_range,
-    rcf_source_partition_locator,
-    new_compacted_delta_locator,
-    pyarrow_write_result,
+    mutable_compaction_audit: CompactionSessionAuditInfo,
+    compacted_partition: Partition,
+    audit_url: str,
+    hb_id_to_entry_indices_range: dict,
+    rcf_source_partition_locator: rcf.PartitionLocator,
+    new_compacted_delta_locator: DeltaLocator,
+    pyarrow_write_result: PyArrowWriteResult,
+    prev_round_completion_info: Optional[RoundCompletionInfo] = None,
 ) -> ExecutionCompactionResult:
     input_inflation = None
     input_average_record_size_bytes = None
@@ -627,6 +666,12 @@ def _write_new_round_completion_file(
         f" and average record size={input_average_record_size_bytes}"
     )

+    _update_and_upload_compaction_audit(
+        params,
+        mutable_compaction_audit,
+        prev_round_completion_info,
+    )
+
     new_round_completion_info = RoundCompletionInfo.of(
         high_watermark=params.last_stream_position_to_compact,
         compacted_delta_locator=new_compacted_delta_locator,
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -177,13 +177,10 @@ def _download_compacted_table(

     if str(hb_index) not in hb_index_to_indices:
         return None
-
     indices = hb_index_to_indices[str(hb_index)]
-
     assert (
         indices is not None and len(indices) == 2
     ), "indices should not be none and contains exactly two elements"
-
     for offset in range(indices[1] - indices[0]):
         table = deltacat_storage.download_delta_manifest_entry(
             rcf.compacted_delta_locator,
deltacat/compute/compactor_v2/utils/delta.py
CHANGED
@@ -42,11 +42,10 @@ def read_delta_file_envelopes(
         **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
-    assert (
-        len(tables) == len(annotations),
+    assert len(tables) == len(annotations), (
         f"Unexpected Error: Length of downloaded delta manifest tables "
         f"({len(tables)}) doesn't match the length of delta manifest "
-        f"annotations ({len(annotations)})."
+        f"annotations ({len(annotations)})."
     )
     if not tables:
         return None, 0, 0
deltacat/compute/compactor_v2/utils/io.py
CHANGED
@@ -61,7 +61,6 @@ def discover_deltas(
     )

     result.extend(delta_source_incremental_deltas)
-
     logger.info(
         f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
        f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
@@ -132,7 +131,6 @@ def create_uniform_input_deltas(
     size_estimation_function = functools.partial(
         estimate_manifest_entry_size_bytes, previous_inflation=previous_inflation
     )
-
     rebatched_da_list = DeltaAnnotated.rebatch(
         input_da_list,
         min_delta_bytes=min_delta_bytes,
|