deltacat 1.1.14__py3-none-any.whl → 1.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +3 -2
- deltacat/compute/compactor/model/compact_partition_params.py +11 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +2 -2
- deltacat/compute/compactor/model/delta_annotated.py +2 -4
- deltacat/compute/compactor/steps/hash_bucket.py +2 -3
- deltacat/compute/compactor_v2/compaction_session.py +26 -27
- deltacat/compute/compactor_v2/constants.py +4 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +103 -66
- deltacat/compute/compactor_v2/steps/merge.py +0 -3
- deltacat/compute/compactor_v2/utils/delta.py +2 -3
- deltacat/compute/compactor_v2/utils/io.py +0 -2
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +855 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +1 -1
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +330 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +118 -0
- deltacat/tests/local_deltacat_storage/__init__.py +8 -5
- {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/METADATA +1 -1
- {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/RECORD +24 -22
- {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/LICENSE +0 -0
- {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/WHEEL +0 -0
- {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -688,8 +688,9 @@ def _execute_compaction_round(
|
|
688
688
|
session_peak_memory
|
689
689
|
)
|
690
690
|
|
691
|
-
compaction_audit.save_round_completion_stats(
|
692
|
-
|
691
|
+
compaction_audit.save_round_completion_stats(mat_results)
|
692
|
+
compaction_audit.set_telemetry_time_in_seconds(
|
693
|
+
telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
|
693
694
|
)
|
694
695
|
|
695
696
|
s3_utils.upload(
|
@@ -22,6 +22,7 @@ from deltacat.compute.compactor_v2.constants import (
|
|
22
22
|
DROP_DUPLICATES,
|
23
23
|
TOTAL_MEMORY_BUFFER_PERCENTAGE,
|
24
24
|
DEFAULT_DISABLE_COPY_BY_REFERENCE,
|
25
|
+
DEFAULT_NUM_ROUNDS,
|
25
26
|
)
|
26
27
|
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
27
28
|
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
|
@@ -102,6 +103,8 @@ class CompactPartitionParams(dict):
|
|
102
103
|
|
103
104
|
result.metrics_config = params.get("metrics_config")
|
104
105
|
|
106
|
+
result.num_rounds = params.get("num_rounds", DEFAULT_NUM_ROUNDS)
|
107
|
+
|
105
108
|
if not importlib.util.find_spec("memray"):
|
106
109
|
result.enable_profiler = False
|
107
110
|
|
@@ -189,7 +192,6 @@ class CompactPartitionParams(dict):
|
|
189
192
|
cluster_resources = self.pg_config.resource
|
190
193
|
cluster_cpus = cluster_resources["CPU"]
|
191
194
|
self.task_max_parallelism = cluster_cpus
|
192
|
-
self["task_max_parallelism"] = self.task_max_parallelism
|
193
195
|
return self["task_max_parallelism"]
|
194
196
|
|
195
197
|
@task_max_parallelism.setter
|
@@ -403,6 +405,14 @@ class CompactPartitionParams(dict):
|
|
403
405
|
def metrics_config(self, config: MetricsConfig) -> None:
|
404
406
|
self["metrics_config"] = config
|
405
407
|
|
408
|
+
@property
|
409
|
+
def num_rounds(self) -> int:
|
410
|
+
return self["num_rounds"]
|
411
|
+
|
412
|
+
@num_rounds.setter
|
413
|
+
def num_rounds(self, num_rounds: int) -> None:
|
414
|
+
self["num_rounds"] = num_rounds
|
415
|
+
|
406
416
|
@staticmethod
|
407
417
|
def json_handler_for_compact_partition_params(obj):
|
408
418
|
"""
|
@@ -818,7 +818,8 @@ class CompactionSessionAuditInfo(dict):
|
|
818
818
|
return cluster_util_after_task_latency + telemetry_time
|
819
819
|
|
820
820
|
def save_round_completion_stats(
|
821
|
-
self,
|
821
|
+
self,
|
822
|
+
mat_results: List[MaterializeResult],
|
822
823
|
) -> None:
|
823
824
|
"""
|
824
825
|
This method saves all the relevant stats after all the steps are completed.
|
@@ -888,4 +889,3 @@ class CompactionSessionAuditInfo(dict):
|
|
888
889
|
)
|
889
890
|
|
890
891
|
self.set_pyarrow_version(pa.__version__)
|
891
|
-
self.set_telemetry_time_in_seconds(total_telemetry_time)
|
@@ -97,8 +97,7 @@ class DeltaAnnotated(Delta):
|
|
97
97
|
for src_da in split_annotated_deltas:
|
98
98
|
src_da_annotations = src_da.annotations
|
99
99
|
src_da_entries = src_da.manifest.entries
|
100
|
-
assert (
|
101
|
-
len(src_da_annotations) == len(src_da_entries),
|
100
|
+
assert len(src_da_annotations) == len(src_da_entries), (
|
102
101
|
f"Unexpected Error: Length of delta annotations "
|
103
102
|
f"({len(src_da_annotations)}) doesn't mach the length of "
|
104
103
|
f"delta manifest entries ({len(src_da_entries)}).",
|
@@ -152,8 +151,7 @@ class DeltaAnnotated(Delta):
|
|
152
151
|
da_group_entry_count = 0
|
153
152
|
src_da_annotations = src_da.annotations
|
154
153
|
src_da_entries = src_da.manifest.entries
|
155
|
-
assert (
|
156
|
-
len(src_da_annotations) == len(src_da_entries),
|
154
|
+
assert len(src_da_annotations) == len(src_da_entries), (
|
157
155
|
f"Unexpected Error: Length of delta annotations "
|
158
156
|
f"({len(src_da_annotations)}) doesn't mach the length of "
|
159
157
|
f"delta manifest entries ({len(src_da_entries)}).",
|
@@ -157,11 +157,10 @@ def _read_delta_file_envelopes(
|
|
157
157
|
**deltacat_storage_kwargs,
|
158
158
|
)
|
159
159
|
annotations = annotated_delta.annotations
|
160
|
-
assert (
|
161
|
-
len(tables) == len(annotations),
|
160
|
+
assert len(tables) == len(annotations), (
|
162
161
|
f"Unexpected Error: Length of downloaded delta manifest tables "
|
163
162
|
f"({len(tables)}) doesn't match the length of delta manifest "
|
164
|
-
f"annotations ({len(annotations)})."
|
163
|
+
f"annotations ({len(annotations)})."
|
165
164
|
)
|
166
165
|
if not tables:
|
167
166
|
return None, 0
|
@@ -1,4 +1,3 @@
|
|
1
|
-
import numpy as np
|
2
1
|
import importlib
|
3
2
|
from contextlib import nullcontext
|
4
3
|
import logging
|
@@ -40,6 +39,8 @@ from deltacat.utils.resources import (
|
|
40
39
|
from deltacat.compute.compactor_v2.private.compaction_utils import (
|
41
40
|
_fetch_compaction_metadata,
|
42
41
|
_build_uniform_deltas,
|
42
|
+
_group_uniform_deltas,
|
43
|
+
_stage_new_partition,
|
43
44
|
_run_hash_and_merge,
|
44
45
|
_process_merge_results,
|
45
46
|
_upload_compaction_audit,
|
@@ -69,6 +70,10 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
|
|
69
70
|
assert (
|
70
71
|
params.hash_bucket_count is not None and params.hash_bucket_count >= 1
|
71
72
|
), "hash_bucket_count is a required arg for compactor v2"
|
73
|
+
if params.num_rounds > 1:
|
74
|
+
assert (
|
75
|
+
not params.drop_duplicates
|
76
|
+
), "num_rounds > 1, drop_duplicates must be False but is True"
|
72
77
|
|
73
78
|
with memray.Tracker(
|
74
79
|
"compaction_partition.bin"
|
@@ -144,32 +149,28 @@ def _execute_compaction(
|
|
144
149
|
delete_strategy,
|
145
150
|
delete_file_envelopes,
|
146
151
|
) = build_uniform_deltas_result
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
telemetry_time_hb,
|
166
|
-
telemetry_time_merge,
|
167
|
-
compacted_partition,
|
168
|
-
) = _run_hash_and_merge_result
|
152
|
+
logger.info(f"Number of rounds parameter is set to: {params.num_rounds}")
|
153
|
+
uniform_deltas_grouped = _group_uniform_deltas(params, uniform_deltas)
|
154
|
+
logger.info(f"Length of grouped uniform deltas is: {len(uniform_deltas_grouped)}")
|
155
|
+
merge_result_list: List[MergeResult] = []
|
156
|
+
compacted_partition = _stage_new_partition(params)
|
157
|
+
for uniform_deltas in uniform_deltas_grouped:
|
158
|
+
# run hash and merge
|
159
|
+
_run_hash_and_merge_result: List[MergeResult] = _run_hash_and_merge(
|
160
|
+
params,
|
161
|
+
uniform_deltas,
|
162
|
+
round_completion_info,
|
163
|
+
delete_strategy,
|
164
|
+
delete_file_envelopes,
|
165
|
+
compaction_audit,
|
166
|
+
previous_compacted_delta_manifest,
|
167
|
+
compacted_partition,
|
168
|
+
)
|
169
|
+
merge_result_list.extend(_run_hash_and_merge_result)
|
169
170
|
# process merge results
|
170
171
|
process_merge_results: tuple[
|
171
172
|
Delta, list[MaterializeResult], dict
|
172
|
-
] = _process_merge_results(params,
|
173
|
+
] = _process_merge_results(params, merge_result_list, compaction_audit)
|
173
174
|
merged_delta, mat_results, hb_id_to_entry_indices_range = process_merge_results
|
174
175
|
# Record information, logging, and return ExecutionCompactionResult
|
175
176
|
record_info_msg: str = f" Materialized records: {merged_delta.meta.record_count}"
|
@@ -198,9 +199,7 @@ def _execute_compaction(
|
|
198
199
|
session_peak_memory
|
199
200
|
)
|
200
201
|
|
201
|
-
compaction_audit.save_round_completion_stats(
|
202
|
-
mat_results, telemetry_time_hb + telemetry_time_merge
|
203
|
-
)
|
202
|
+
compaction_audit.save_round_completion_stats(mat_results)
|
204
203
|
|
205
204
|
_upload_compaction_audit(
|
206
205
|
params,
|
@@ -71,3 +71,7 @@ PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
|
|
71
71
|
|
72
72
|
# Metric prefix for compact partition method
|
73
73
|
COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
|
74
|
+
|
75
|
+
# Number of rounds to run hash/merge for a single
|
76
|
+
# partition. (For large table support)
|
77
|
+
DEFAULT_NUM_ROUNDS = 1
|
@@ -4,8 +4,10 @@ import logging
|
|
4
4
|
import ray
|
5
5
|
import time
|
6
6
|
import json
|
7
|
+
from math import ceil
|
7
8
|
|
8
9
|
from deltacat.compute.compactor import (
|
10
|
+
PyArrowWriteResult,
|
9
11
|
HighWatermark,
|
10
12
|
RoundCompletionInfo,
|
11
13
|
)
|
@@ -44,10 +46,11 @@ from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
|
|
44
46
|
from deltacat.storage import (
|
45
47
|
Delta,
|
46
48
|
DeltaType,
|
47
|
-
|
48
|
-
StreamLocator,
|
49
|
+
DeltaLocator,
|
49
50
|
Partition,
|
50
51
|
Manifest,
|
52
|
+
Stream,
|
53
|
+
StreamLocator,
|
51
54
|
)
|
52
55
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
53
56
|
CompactPartitionParams,
|
@@ -60,7 +63,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
|
|
60
63
|
from deltacat.compute.compactor_v2.steps import hash_bucket as hb
|
61
64
|
from deltacat.compute.compactor_v2.utils import io
|
62
65
|
|
63
|
-
from typing import
|
66
|
+
from typing import List, Optional
|
64
67
|
from collections import defaultdict
|
65
68
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
66
69
|
CompactionSessionAuditInfo,
|
@@ -123,9 +126,9 @@ def _fetch_compaction_metadata(
|
|
123
126
|
|
124
127
|
def _build_uniform_deltas(
|
125
128
|
params: CompactPartitionParams,
|
126
|
-
mutable_compaction_audit,
|
127
|
-
input_deltas,
|
128
|
-
delta_discovery_start,
|
129
|
+
mutable_compaction_audit: CompactionSessionAuditInfo,
|
130
|
+
input_deltas: List[Delta],
|
131
|
+
delta_discovery_start: float,
|
129
132
|
) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition]:
|
130
133
|
|
131
134
|
delete_strategy: Optional[DeleteStrategy] = None
|
@@ -173,18 +176,34 @@ def _build_uniform_deltas(
|
|
173
176
|
)
|
174
177
|
|
175
178
|
|
176
|
-
def
|
177
|
-
params: CompactPartitionParams,
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
179
|
+
def _group_uniform_deltas(
|
180
|
+
params: CompactPartitionParams, uniform_deltas: List[DeltaAnnotated]
|
181
|
+
) -> List[List[DeltaAnnotated]]:
|
182
|
+
num_deltas = len(uniform_deltas)
|
183
|
+
num_rounds = params.num_rounds
|
184
|
+
if num_rounds == 1:
|
185
|
+
return [uniform_deltas]
|
186
|
+
assert (
|
187
|
+
num_rounds > 0
|
188
|
+
), f"num_rounds parameter should be greater than zero but is {params.num_rounds}"
|
189
|
+
assert (
|
190
|
+
num_rounds <= num_deltas
|
191
|
+
), f"{params.num_rounds} rounds should be less than the number of uniform deltas, which is {len(uniform_deltas)}"
|
192
|
+
size = ceil(num_deltas / num_rounds)
|
193
|
+
uniform_deltas_grouped = list(
|
194
|
+
map(
|
195
|
+
lambda x: uniform_deltas[x * size : x * size + size],
|
196
|
+
list(range(num_rounds)),
|
197
|
+
)
|
198
|
+
)
|
199
|
+
num_deltas_after_grouping = sum(len(sublist) for sublist in uniform_deltas_grouped)
|
200
|
+
assert (
|
201
|
+
num_deltas_after_grouping == num_deltas
|
202
|
+
), f"uniform_deltas_grouped expected to have {num_deltas} deltas, but has {num_deltas_after_grouping}"
|
203
|
+
return uniform_deltas_grouped
|
204
|
+
|
205
|
+
|
206
|
+
def _stage_new_partition(params: CompactPartitionParams) -> Partition:
|
188
207
|
compacted_stream_locator: Optional[
|
189
208
|
StreamLocator
|
190
209
|
] = params.destination_partition_locator.stream_locator
|
@@ -199,7 +218,19 @@ def _run_hash_and_merge(
|
|
199
218
|
params.destination_partition_locator.partition_values,
|
200
219
|
**params.deltacat_storage_kwargs,
|
201
220
|
)
|
221
|
+
return compacted_partition
|
202
222
|
|
223
|
+
|
224
|
+
def _run_hash_and_merge(
|
225
|
+
params: CompactPartitionParams,
|
226
|
+
uniform_deltas: List[DeltaAnnotated],
|
227
|
+
round_completion_info: RoundCompletionInfo,
|
228
|
+
delete_strategy: Optional[DeleteStrategy],
|
229
|
+
delete_file_envelopes: Optional[DeleteFileEnvelope],
|
230
|
+
mutable_compaction_audit: CompactionSessionAuditInfo,
|
231
|
+
previous_compacted_delta_manifest: Optional[Manifest],
|
232
|
+
compacted_partition: Partition,
|
233
|
+
) -> List[MergeResult]:
|
203
234
|
telemetry_time_hb = 0
|
204
235
|
total_input_records_count = np.int64(0)
|
205
236
|
total_hb_record_count = np.int64(0)
|
@@ -257,7 +288,6 @@ def _run_hash_and_merge(
|
|
257
288
|
for hb_result in hb_results:
|
258
289
|
hb_data_processed_size_bytes += hb_result.hb_size_bytes
|
259
290
|
total_input_records_count += hb_result.hb_record_count
|
260
|
-
|
261
291
|
for hash_group_index, object_id_size_tuple in enumerate(
|
262
292
|
hb_result.hash_bucket_group_to_obj_id_tuple
|
263
293
|
):
|
@@ -271,7 +301,6 @@ def _run_hash_and_merge(
|
|
271
301
|
all_hash_group_idx_to_num_rows[
|
272
302
|
hash_group_index
|
273
303
|
] += object_id_size_tuple[2].item()
|
274
|
-
|
275
304
|
logger.info(
|
276
305
|
f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
|
277
306
|
)
|
@@ -330,26 +359,30 @@ def _run_hash_and_merge(
|
|
330
359
|
f" Deleted records: {total_deleted_record_count}, "
|
331
360
|
)
|
332
361
|
logger.info(record_info_msg)
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
362
|
+
telemetry_this_round = telemetry_time_hb + telemetry_time_merge
|
363
|
+
previous_telemetry = (
|
364
|
+
mutable_compaction_audit.telemetry_time_in_seconds
|
365
|
+
if mutable_compaction_audit.telemetry_time_in_seconds
|
366
|
+
else 0.0
|
338
367
|
)
|
368
|
+
mutable_compaction_audit.set_telemetry_time_in_seconds(
|
369
|
+
telemetry_this_round + previous_telemetry
|
370
|
+
)
|
371
|
+
return merge_results
|
339
372
|
|
340
373
|
|
341
374
|
def _merge(
|
342
375
|
params: CompactPartitionParams,
|
343
|
-
task_resource_options_provider,
|
344
|
-
merge_resource_options_provider,
|
345
|
-
all_hash_group_idx_to_size_bytes,
|
346
|
-
all_hash_group_idx_to_num_rows,
|
347
|
-
round_completion_info,
|
348
|
-
previous_compacted_delta_manifest,
|
349
|
-
all_hash_group_idx_to_obj_id,
|
350
|
-
compacted_partition,
|
351
|
-
delete_strategy,
|
352
|
-
delete_file_envelopes,
|
376
|
+
task_resource_options_provider: callable,
|
377
|
+
merge_resource_options_provider: callable,
|
378
|
+
all_hash_group_idx_to_size_bytes: dict,
|
379
|
+
all_hash_group_idx_to_num_rows: dict,
|
380
|
+
round_completion_info: RoundCompletionInfo,
|
381
|
+
previous_compacted_delta_manifest: Manifest,
|
382
|
+
all_hash_group_idx_to_obj_id: dict,
|
383
|
+
compacted_partition: Partition,
|
384
|
+
delete_strategy: DeleteStrategy,
|
385
|
+
delete_file_envelopes: DeleteFileEnvelope,
|
353
386
|
) -> tuple[List[MergeResult], float]:
|
354
387
|
merge_options_provider = functools.partial(
|
355
388
|
task_resource_options_provider,
|
@@ -416,8 +449,9 @@ def _merge(
|
|
416
449
|
|
417
450
|
def _hash_bucket(
|
418
451
|
params: CompactPartitionParams,
|
419
|
-
uniform_deltas,
|
420
|
-
):
|
452
|
+
uniform_deltas: List[DeltaAnnotated],
|
453
|
+
) -> tuple[List[HashBucketResult], float]:
|
454
|
+
|
421
455
|
hb_options_provider = functools.partial(
|
422
456
|
task_resource_options_provider,
|
423
457
|
pg_config=params.pg_config,
|
@@ -455,7 +489,6 @@ def _hash_bucket(
|
|
455
489
|
options_provider=hb_options_provider,
|
456
490
|
kwargs_provider=hash_bucket_input_provider,
|
457
491
|
)
|
458
|
-
|
459
492
|
hb_invoke_end = time.monotonic()
|
460
493
|
|
461
494
|
logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
|
@@ -467,15 +500,15 @@ def _hash_bucket(
|
|
467
500
|
|
468
501
|
def _run_local_merge(
|
469
502
|
params: CompactPartitionParams,
|
470
|
-
uniform_deltas,
|
471
|
-
compacted_partition,
|
472
|
-
round_completion_info,
|
473
|
-
delete_strategy,
|
474
|
-
delete_file_envelopes,
|
475
|
-
mutable_compaction_audit,
|
476
|
-
previous_compacted_delta_manifest,
|
477
|
-
total_input_records_count,
|
478
|
-
) -> tuple[
|
503
|
+
uniform_deltas: List[DeltaAnnotated],
|
504
|
+
compacted_partition: Partition,
|
505
|
+
round_completion_info: RoundCompletionInfo,
|
506
|
+
delete_strategy: Optional[DeleteStrategy],
|
507
|
+
delete_file_envelopes: Optional[DeleteFileEnvelope],
|
508
|
+
mutable_compaction_audit: CompactionSessionAuditInfo,
|
509
|
+
previous_compacted_delta_manifest: Optional[Manifest],
|
510
|
+
total_input_records_count: np.int64,
|
511
|
+
) -> tuple[List[MergeResult], np.int64]:
|
479
512
|
local_merge_input: MergeInput = generate_local_merge_input(
|
480
513
|
params,
|
481
514
|
uniform_deltas,
|
@@ -513,8 +546,10 @@ def _run_local_merge(
|
|
513
546
|
|
514
547
|
|
515
548
|
def _process_merge_results(
|
516
|
-
params: CompactPartitionParams,
|
517
|
-
|
549
|
+
params: CompactPartitionParams,
|
550
|
+
merge_results: List[MergeResult],
|
551
|
+
mutable_compaction_audit: CompactionSessionAuditInfo,
|
552
|
+
) -> tuple[Delta, List[MaterializeResult], dict]:
|
518
553
|
mat_results = []
|
519
554
|
for merge_result in merge_results:
|
520
555
|
mat_results.extend(merge_result.materialize_results)
|
@@ -522,19 +557,23 @@ def _process_merge_results(
|
|
522
557
|
mat_results: List[MaterializeResult] = sorted(
|
523
558
|
mat_results, key=lambda m: m.task_index
|
524
559
|
)
|
525
|
-
|
526
560
|
hb_id_to_entry_indices_range = {}
|
527
561
|
file_index = 0
|
528
562
|
previous_task_index = -1
|
529
563
|
|
564
|
+
duplicate_hash_bucket_mat_results = 0
|
530
565
|
for mat_result in mat_results:
|
531
566
|
assert (
|
532
567
|
mat_result.pyarrow_write_result.files >= 1
|
533
|
-
), "
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
568
|
+
), "At least one file must be materialized"
|
569
|
+
if mat_result.task_index == previous_task_index:
|
570
|
+
duplicate_hash_bucket_mat_results += 1
|
571
|
+
else:
|
572
|
+
duplicate_hash_bucket_mat_results = 0
|
573
|
+
assert duplicate_hash_bucket_mat_results < params.num_rounds, (
|
574
|
+
f"Duplicate record count ({duplicate_hash_bucket_mat_results}) is as large "
|
575
|
+
f"as or greater than params.num_rounds, which is {params.num_rounds}"
|
576
|
+
)
|
538
577
|
hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
|
539
578
|
file_index,
|
540
579
|
file_index + mat_result.pyarrow_write_result.files,
|
@@ -548,9 +587,7 @@ def _process_merge_results(
|
|
548
587
|
str(json.dumps(mutable_compaction_audit)),
|
549
588
|
**params.s3_client_kwargs,
|
550
589
|
)
|
551
|
-
|
552
590
|
deltas: List[Delta] = [m.delta for m in mat_results]
|
553
|
-
|
554
591
|
# Note: An appropriate last stream position must be set
|
555
592
|
# to avoid correctness issue.
|
556
593
|
merged_delta: Delta = Delta.merge_deltas(
|
@@ -563,8 +600,8 @@ def _process_merge_results(
|
|
563
600
|
|
564
601
|
def _upload_compaction_audit(
|
565
602
|
params: CompactPartitionParams,
|
566
|
-
mutable_compaction_audit,
|
567
|
-
round_completion_info,
|
603
|
+
mutable_compaction_audit: CompactionSessionAuditInfo,
|
604
|
+
round_completion_info: RoundCompletionInfo,
|
568
605
|
) -> None:
|
569
606
|
|
570
607
|
# After all incremental delta related calculations, we update
|
@@ -593,13 +630,13 @@ def _upload_compaction_audit(
|
|
593
630
|
|
594
631
|
def _write_new_round_completion_file(
|
595
632
|
params: CompactPartitionParams,
|
596
|
-
mutable_compaction_audit,
|
597
|
-
compacted_partition,
|
598
|
-
audit_url,
|
599
|
-
hb_id_to_entry_indices_range,
|
600
|
-
rcf_source_partition_locator,
|
601
|
-
new_compacted_delta_locator,
|
602
|
-
pyarrow_write_result,
|
633
|
+
mutable_compaction_audit: CompactionSessionAuditInfo,
|
634
|
+
compacted_partition: Partition,
|
635
|
+
audit_url: str,
|
636
|
+
hb_id_to_entry_indices_range: dict,
|
637
|
+
rcf_source_partition_locator: rcf.PartitionLocator,
|
638
|
+
new_compacted_delta_locator: DeltaLocator,
|
639
|
+
pyarrow_write_result: PyArrowWriteResult,
|
603
640
|
) -> ExecutionCompactionResult:
|
604
641
|
input_inflation = None
|
605
642
|
input_average_record_size_bytes = None
|
@@ -177,13 +177,10 @@ def _download_compacted_table(
|
|
177
177
|
|
178
178
|
if str(hb_index) not in hb_index_to_indices:
|
179
179
|
return None
|
180
|
-
|
181
180
|
indices = hb_index_to_indices[str(hb_index)]
|
182
|
-
|
183
181
|
assert (
|
184
182
|
indices is not None and len(indices) == 2
|
185
183
|
), "indices should not be none and contains exactly two elements"
|
186
|
-
|
187
184
|
for offset in range(indices[1] - indices[0]):
|
188
185
|
table = deltacat_storage.download_delta_manifest_entry(
|
189
186
|
rcf.compacted_delta_locator,
|
@@ -42,11 +42,10 @@ def read_delta_file_envelopes(
|
|
42
42
|
**deltacat_storage_kwargs,
|
43
43
|
)
|
44
44
|
annotations = annotated_delta.annotations
|
45
|
-
assert (
|
46
|
-
len(tables) == len(annotations),
|
45
|
+
assert len(tables) == len(annotations), (
|
47
46
|
f"Unexpected Error: Length of downloaded delta manifest tables "
|
48
47
|
f"({len(tables)}) doesn't match the length of delta manifest "
|
49
|
-
f"annotations ({len(annotations)})."
|
48
|
+
f"annotations ({len(annotations)})."
|
50
49
|
)
|
51
50
|
if not tables:
|
52
51
|
return None, 0, 0
|
@@ -61,7 +61,6 @@ def discover_deltas(
|
|
61
61
|
)
|
62
62
|
|
63
63
|
result.extend(delta_source_incremental_deltas)
|
64
|
-
|
65
64
|
logger.info(
|
66
65
|
f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
|
67
66
|
f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
|
@@ -132,7 +131,6 @@ def create_uniform_input_deltas(
|
|
132
131
|
size_estimation_function = functools.partial(
|
133
132
|
estimate_manifest_entry_size_bytes, previous_inflation=previous_inflation
|
134
133
|
)
|
135
|
-
|
136
134
|
rebatched_da_list = DeltaAnnotated.rebatch(
|
137
135
|
input_da_list,
|
138
136
|
min_delta_bytes=min_delta_bytes,
|