deltacat 1.1.13__py3-none-any.whl → 1.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/private/__init__.py +0 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +716 -0
- {deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/METADATA +1 -1
- {deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/RECORD +8 -6
- {deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/LICENSE +0 -0
- {deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/WHEEL +0 -0
- {deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED

deltacat/compute/compactor_v2/private/compaction_utils.py ADDED
@@ -0,0 +1,716 @@
import numpy as np
import functools
import logging
import ray
import time
import json

from deltacat.compute.compactor import (
    HighWatermark,
    RoundCompletionInfo,
)
from deltacat.aws import s3u as s3_utils
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
    ExecutionCompactionResult,
)
from deltacat.compute.compactor_v2.model.merge_file_group import (
    RemoteMergeFileGroupsProvider,
)
from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput

from deltacat import logs
from deltacat.compute.compactor_v2.model.merge_input import MergeInput
from deltacat.compute.compactor_v2.model.merge_result import MergeResult
from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
from deltacat.compute.compactor.model.materialize_result import MaterializeResult
from deltacat.compute.compactor_v2.utils.merge import (
    generate_local_merge_input,
)
from deltacat.compute.compactor_v2.utils.task_options import (
    hash_bucket_resource_options_provider,
)
from deltacat.compute.compactor.utils import round_completion_file as rcf
from deltacat.compute.compactor import DeltaAnnotated
from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
    DeleteStrategy,
)
from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
    DeleteFileEnvelope,
)
from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes

from deltacat.storage import (
    Delta,
    DeltaType,
    Stream,
    StreamLocator,
    Partition,
    Manifest,
)
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)
from deltacat.utils.ray_utils.concurrency import (
    invoke_parallel,
    task_resource_options_provider,
)
from deltacat.compute.compactor_v2.steps import merge as mg
from deltacat.compute.compactor_v2.steps import hash_bucket as hb
from deltacat.compute.compactor_v2.utils import io

from typing import Any, List, Optional
from collections import defaultdict
from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)
from deltacat.compute.compactor_v2.utils.task_options import (
    merge_resource_options_provider,
    local_merge_resource_options_provider,
)


logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


def _fetch_compaction_metadata(
    params: CompactPartitionParams,
) -> tuple[Optional[Manifest], Optional[RoundCompletionInfo]]:

    # read the results from any previously completed compaction round
    round_completion_info: Optional[RoundCompletionInfo] = None
    high_watermark: Optional[HighWatermark] = None
    previous_compacted_delta_manifest: Optional[Manifest] = None

    if not params.rebase_source_partition_locator:
        round_completion_info = rcf.read_round_completion_file(
            params.compaction_artifact_s3_bucket,
            params.source_partition_locator,
            params.destination_partition_locator,
            **params.s3_client_kwargs,
        )
        if not round_completion_info:
            logger.info(
                "Both rebase partition and round completion file not found. Performing an entire backfill on source."
            )
        else:
            compacted_delta_locator = round_completion_info.compacted_delta_locator

            previous_compacted_delta_manifest = (
                params.deltacat_storage.get_delta_manifest(
                    compacted_delta_locator, **params.deltacat_storage_kwargs
                )
            )

            high_watermark = round_completion_info.high_watermark
            logger.info(f"Setting round completion high watermark: {high_watermark}")
            assert (
                params.hash_bucket_count == round_completion_info.hash_bucket_count
            ), (
                "The hash bucket count has changed. "
                "Kindly run rebase compaction and trigger incremental again. "
                f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
                f"not equal to Hash bucket count in args={params.hash_bucket_count}."
            )

        logger.info(f"Round completion file: {round_completion_info}")
    return (
        previous_compacted_delta_manifest,
        round_completion_info,
    )


def _build_uniform_deltas(
    params: CompactPartitionParams,
    mutable_compaction_audit,
    input_deltas,
    delta_discovery_start,
) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition]:

    delete_strategy: Optional[DeleteStrategy] = None
    delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
    delete_file_size_bytes: int = 0
    if contains_delete_deltas(input_deltas):
        input_deltas, delete_file_envelopes, delete_strategy = prepare_deletes(
            params, input_deltas
        )
        for delete_file_envelope in delete_file_envelopes:
            delete_file_size_bytes += delete_file_envelope.table_size_bytes
        logger.info(
            f" Input deltas contain {DeltaType.DELETE}-type deltas. Total delete file size={delete_file_size_bytes}."
            f" Total length of delete file envelopes={len(delete_file_envelopes)}"
        )
    uniform_deltas: List[DeltaAnnotated] = io.create_uniform_input_deltas(
        input_deltas=input_deltas,
        hash_bucket_count=params.hash_bucket_count,
        compaction_audit=mutable_compaction_audit,
        deltacat_storage=params.deltacat_storage,
        previous_inflation=params.previous_inflation,
        min_delta_bytes=params.min_delta_bytes_in_batch,
        min_file_counts=params.min_files_in_batch,
        # disable input split during rebase as the rebase files are already uniform
        enable_input_split=params.rebase_source_partition_locator is None,
        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
    )
    delta_discovery_end: float = time.monotonic()

    mutable_compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
    mutable_compaction_audit.set_delta_discovery_time_in_seconds(
        delta_discovery_end - delta_discovery_start
    )

    s3_utils.upload(
        mutable_compaction_audit.audit_url,
        str(json.dumps(mutable_compaction_audit)),
        **params.s3_client_kwargs,
    )

    return (
        uniform_deltas,
        delete_strategy,
        delete_file_envelopes,
    )


def _run_hash_and_merge(
    params: CompactPartitionParams,
    uniform_deltas,
    round_completion_info,
    delete_strategy,
    delete_file_envelopes,
    mutable_compaction_audit,
    previous_compacted_delta_manifest,
) -> tuple[
    list[MergeResult], np.int64, np.float64, np.int64, np.int64, np.float64, Partition
]:
    # create a new stream for this round
    compacted_stream_locator: Optional[
        StreamLocator
    ] = params.destination_partition_locator.stream_locator
    compacted_stream: Stream = params.deltacat_storage.get_stream(
        compacted_stream_locator.namespace,
        compacted_stream_locator.table_name,
        compacted_stream_locator.table_version,
        **params.deltacat_storage_kwargs,
    )
    compacted_partition: Partition = params.deltacat_storage.stage_partition(
        compacted_stream,
        params.destination_partition_locator.partition_values,
        **params.deltacat_storage_kwargs,
    )

    telemetry_time_hb = 0
    total_input_records_count = np.int64(0)
    total_hb_record_count = np.int64(0)
    if params.hash_bucket_count == 1:
        logger.info("Hash bucket count set to 1. Running local merge")
        merge_start: float = time.monotonic()
        merge_results, total_input_records_count = _run_local_merge(
            params,
            uniform_deltas,
            compacted_partition,
            round_completion_info,
            delete_strategy,
            delete_file_envelopes,
            mutable_compaction_audit,
            previous_compacted_delta_manifest,
            total_input_records_count,
        )
        merge_invoke_end = time.monotonic()
    else:
        # hash bucket
        hb_start = time.monotonic()
        all_hash_group_idx_to_obj_id = defaultdict(list)
        all_hash_group_idx_to_size_bytes = defaultdict(int)
        all_hash_group_idx_to_num_rows = defaultdict(int)
        (hb_results, hb_invoke_end) = _hash_bucket(params, uniform_deltas)
        hb_end = time.monotonic()

        # we use time.time() here because time.monotonic() has no reference point
        # whereas time.time() measures epoch seconds. Hence, it will be reasonable
        # to compare time.time()s captured in different nodes.
        hb_results_retrieved_at = time.time()

        telemetry_time_hb = mutable_compaction_audit.save_step_stats(
            CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
            hb_results,
            hb_results_retrieved_at,
            hb_invoke_end - hb_start,
            hb_end - hb_start,
        )

        s3_utils.upload(
            mutable_compaction_audit.audit_url,
            str(json.dumps(mutable_compaction_audit)),
            **params.s3_client_kwargs,
        )

        hb_data_processed_size_bytes = np.int64(0)

        # initialize all hash groups
        for hb_group in range(params.hash_group_count):
            all_hash_group_idx_to_num_rows[hb_group] = 0
            all_hash_group_idx_to_obj_id[hb_group] = []
            all_hash_group_idx_to_size_bytes[hb_group] = 0

        for hb_result in hb_results:
            hb_data_processed_size_bytes += hb_result.hb_size_bytes
            total_input_records_count += hb_result.hb_record_count

            for hash_group_index, object_id_size_tuple in enumerate(
                hb_result.hash_bucket_group_to_obj_id_tuple
            ):
                if object_id_size_tuple:
                    all_hash_group_idx_to_obj_id[hash_group_index].append(
                        object_id_size_tuple[0],
                    )
                    all_hash_group_idx_to_size_bytes[
                        hash_group_index
                    ] += object_id_size_tuple[1].item()
                    all_hash_group_idx_to_num_rows[
                        hash_group_index
                    ] += object_id_size_tuple[2].item()

        logger.info(
            f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
        )

        total_hb_record_count = total_input_records_count
        mutable_compaction_audit.set_hash_bucket_processed_size_bytes(
            hb_data_processed_size_bytes.item()
        )

        # BSP Step 2: Merge
        # NOTE: DELETE-type deltas are stored in Plasma object store
        # in prepare_deletes and therefore don't need to included
        # in merge task resource estimation
        merge_start = time.monotonic()
        merge_results, merge_invoke_end = _merge(
            params,
            task_resource_options_provider,
            merge_resource_options_provider,
            all_hash_group_idx_to_size_bytes,
            all_hash_group_idx_to_num_rows,
            round_completion_info,
            previous_compacted_delta_manifest,
            all_hash_group_idx_to_obj_id,
            compacted_partition,
            delete_strategy,
            delete_file_envelopes,
        )
        logger.info(f"Got {len(merge_results)} merge results.")

    merge_results_retrieved_at: float = time.time()
    merge_end: float = time.monotonic()

    total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
    total_deleted_record_count = sum(
        [ddr.deleted_record_count for ddr in merge_results]
    )
    logger.info(
        f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
    )

    mutable_compaction_audit.set_input_records(total_input_records_count.item())

    telemetry_time_merge = mutable_compaction_audit.save_step_stats(
        CompactionSessionAuditInfo.MERGE_STEP_NAME,
        merge_results,
        merge_results_retrieved_at,
        merge_invoke_end - merge_start,
        merge_end - merge_start,
    )

    mutable_compaction_audit.set_records_deduped(total_dd_record_count.item())
    mutable_compaction_audit.set_records_deleted(total_deleted_record_count.item())
    record_info_msg: str = (
        f"Hash bucket records: {total_hb_record_count},"
        f" Deduped records: {total_dd_record_count}, "
        f" Deleted records: {total_deleted_record_count}, "
    )
    logger.info(record_info_msg)
    return (
        merge_results,
        telemetry_time_hb,
        telemetry_time_merge,
        compacted_partition,
    )


def _merge(
    params: CompactPartitionParams,
    task_resource_options_provider,
    merge_resource_options_provider,
    all_hash_group_idx_to_size_bytes,
    all_hash_group_idx_to_num_rows,
    round_completion_info,
    previous_compacted_delta_manifest,
    all_hash_group_idx_to_obj_id,
    compacted_partition,
    delete_strategy,
    delete_file_envelopes,
) -> tuple[List[MergeResult], float]:
    merge_options_provider = functools.partial(
        task_resource_options_provider,
        pg_config=params.pg_config,
        resource_amount_provider=merge_resource_options_provider,
        num_hash_groups=params.hash_group_count,
        hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
        hash_group_num_rows=all_hash_group_idx_to_num_rows,
        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
        round_completion_info=round_completion_info,
        compacted_delta_manifest=previous_compacted_delta_manifest,
        primary_keys=params.primary_keys,
        deltacat_storage=params.deltacat_storage,
        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
        ray_custom_resources=params.ray_custom_resources,
        memory_logs_enabled=params.memory_logs_enabled,
    )

    def merge_input_provider(index, item) -> dict[str, MergeInput]:
        return {
            "input": MergeInput.of(
                merge_file_groups_provider=RemoteMergeFileGroupsProvider(
                    hash_group_index=item[0],
                    dfe_groups_refs=item[1],
                    hash_bucket_count=params.hash_bucket_count,
                    num_hash_groups=params.hash_group_count,
                    object_store=params.object_store,
                ),
                write_to_partition=compacted_partition,
                compacted_file_content_type=params.compacted_file_content_type,
                primary_keys=params.primary_keys,
                sort_keys=params.sort_keys,
                merge_task_index=index,
                drop_duplicates=params.drop_duplicates,
                max_records_per_output_file=params.records_per_compacted_file,
                enable_profiler=params.enable_profiler,
                metrics_config=params.metrics_config,
                s3_table_writer_kwargs=params.s3_table_writer_kwargs,
                read_kwargs_provider=params.read_kwargs_provider,
                round_completion_info=round_completion_info,
                object_store=params.object_store,
                deltacat_storage=params.deltacat_storage,
                deltacat_storage_kwargs=params.deltacat_storage_kwargs,
                delete_strategy=delete_strategy,
                delete_file_envelopes=delete_file_envelopes,
                memory_logs_enabled=params.memory_logs_enabled,
                disable_copy_by_reference=params.disable_copy_by_reference,
            )
        }

    merge_tasks_pending = invoke_parallel(
        items=all_hash_group_idx_to_obj_id.items(),
        ray_task=mg.merge,
        max_parallelism=params.task_max_parallelism,
        options_provider=merge_options_provider,
        kwargs_provider=merge_input_provider,
    )
    merge_invoke_end = time.monotonic()
    logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
    merge_results: List[MergeResult] = ray.get(merge_tasks_pending)

    return merge_results, merge_invoke_end


def _hash_bucket(
    params: CompactPartitionParams,
    uniform_deltas,
):
    hb_options_provider = functools.partial(
        task_resource_options_provider,
        pg_config=params.pg_config,
        resource_amount_provider=hash_bucket_resource_options_provider,
        previous_inflation=params.previous_inflation,
        average_record_size_bytes=params.average_record_size_bytes,
        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
        primary_keys=params.primary_keys,
        ray_custom_resources=params.ray_custom_resources,
        memory_logs_enabled=params.memory_logs_enabled,
    )

    def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
        return {
            "input": HashBucketInput.of(
                item,
                primary_keys=params.primary_keys,
                hb_task_index=index,
                num_hash_buckets=params.hash_bucket_count,
                num_hash_groups=params.hash_group_count,
                enable_profiler=params.enable_profiler,
                metrics_config=params.metrics_config,
                read_kwargs_provider=params.read_kwargs_provider,
                object_store=params.object_store,
                deltacat_storage=params.deltacat_storage,
                deltacat_storage_kwargs=params.deltacat_storage_kwargs,
                memory_logs_enabled=params.memory_logs_enabled,
            )
        }

    hb_tasks_pending = invoke_parallel(
        items=uniform_deltas,
        ray_task=hb.hash_bucket,
        max_parallelism=params.task_max_parallelism,
        options_provider=hb_options_provider,
        kwargs_provider=hash_bucket_input_provider,
    )

    hb_invoke_end = time.monotonic()

    logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
    hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
    logger.info(f"Got {len(hb_results)} hash bucket results.")

    return (hb_results, hb_invoke_end)


def _run_local_merge(
    params: CompactPartitionParams,
    uniform_deltas,
    compacted_partition,
    round_completion_info,
    delete_strategy,
    delete_file_envelopes,
    mutable_compaction_audit,
    previous_compacted_delta_manifest,
    total_input_records_count,
) -> tuple[list[Any], Any]:
    local_merge_input: MergeInput = generate_local_merge_input(
        params,
        uniform_deltas,
        compacted_partition,
        round_completion_info,
        delete_strategy,
        delete_file_envelopes,
    )
    estimated_da_bytes = (
        mutable_compaction_audit.estimated_in_memory_size_bytes_during_discovery
    )
    estimated_num_records: int = sum(
        [
            entry.meta.record_count
            for delta in uniform_deltas
            for entry in delta.manifest.entries
        ]
    )
    local_merge_options = local_merge_resource_options_provider(
        estimated_da_size=estimated_da_bytes,
        estimated_num_rows=estimated_num_records,
        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
        round_completion_info=round_completion_info,
        compacted_delta_manifest=previous_compacted_delta_manifest,
        ray_custom_resources=params.ray_custom_resources,
        primary_keys=params.primary_keys,
        memory_logs_enabled=params.memory_logs_enabled,
    )
    local_merge_result = ray.get(
        mg.merge.options(**local_merge_options).remote(local_merge_input)
    )
    total_input_records_count += local_merge_result.input_record_count
    merge_results = [local_merge_result]
    return merge_results, total_input_records_count


def _process_merge_results(
    params: CompactPartitionParams, merge_results, mutable_compaction_audit
) -> tuple[Delta, list[MaterializeResult], dict]:
    mat_results = []
    for merge_result in merge_results:
        mat_results.extend(merge_result.materialize_results)

    mat_results: List[MaterializeResult] = sorted(
        mat_results, key=lambda m: m.task_index
    )

    hb_id_to_entry_indices_range = {}
    file_index = 0
    previous_task_index = -1

    for mat_result in mat_results:
        assert (
            mat_result.pyarrow_write_result.files >= 1
        ), "Atleast one file must be materialized"
        assert (
            mat_result.task_index != previous_task_index
        ), f"Multiple materialize results found for a hash bucket: {mat_result.task_index}"

        hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
            file_index,
            file_index + mat_result.pyarrow_write_result.files,
        )

        file_index += mat_result.pyarrow_write_result.files
        previous_task_index = mat_result.task_index

    s3_utils.upload(
        mutable_compaction_audit.audit_url,
        str(json.dumps(mutable_compaction_audit)),
        **params.s3_client_kwargs,
    )

    deltas: List[Delta] = [m.delta for m in mat_results]

    # Note: An appropriate last stream position must be set
    # to avoid correctness issue.
    merged_delta: Delta = Delta.merge_deltas(
        deltas,
        stream_position=params.last_stream_position_to_compact,
    )

    return merged_delta, mat_results, hb_id_to_entry_indices_range


def _upload_compaction_audit(
    params: CompactPartitionParams,
    mutable_compaction_audit,
    round_completion_info,
) -> None:

    # After all incremental delta related calculations, we update
    # the input sizes to accommodate the compacted table
    if round_completion_info:
        mutable_compaction_audit.set_input_file_count(
            (mutable_compaction_audit.input_file_count or 0)
            + round_completion_info.compacted_pyarrow_write_result.files
        )
        mutable_compaction_audit.set_input_size_bytes(
            (mutable_compaction_audit.input_size_bytes or 0.0)
            + round_completion_info.compacted_pyarrow_write_result.file_bytes
        )
        mutable_compaction_audit.set_input_records(
            (mutable_compaction_audit.input_records or 0)
            + round_completion_info.compacted_pyarrow_write_result.records
        )

    s3_utils.upload(
        mutable_compaction_audit.audit_url,
        str(json.dumps(mutable_compaction_audit)),
        **params.s3_client_kwargs,
    )
    return


def _write_new_round_completion_file(
    params: CompactPartitionParams,
    mutable_compaction_audit,
    compacted_partition,
    audit_url,
    hb_id_to_entry_indices_range,
    rcf_source_partition_locator,
    new_compacted_delta_locator,
    pyarrow_write_result,
) -> ExecutionCompactionResult:
    input_inflation = None
    input_average_record_size_bytes = None
    # Note: we only consider inflation for incremental delta
    if (
        mutable_compaction_audit.input_size_bytes
        and mutable_compaction_audit.hash_bucket_processed_size_bytes
    ):
        input_inflation = (
            mutable_compaction_audit.hash_bucket_processed_size_bytes
            / mutable_compaction_audit.input_size_bytes
        )

    if (
        mutable_compaction_audit.hash_bucket_processed_size_bytes
        and mutable_compaction_audit.input_records
    ):
        input_average_record_size_bytes = (
            mutable_compaction_audit.hash_bucket_processed_size_bytes
            / mutable_compaction_audit.input_records
        )

    logger.info(
        f"The inflation of input deltas={input_inflation}"
        f" and average record size={input_average_record_size_bytes}"
    )

    new_round_completion_info = RoundCompletionInfo.of(
        high_watermark=params.last_stream_position_to_compact,
        compacted_delta_locator=new_compacted_delta_locator,
        compacted_pyarrow_write_result=pyarrow_write_result,
        sort_keys_bit_width=params.bit_width_of_sort_keys,
        manifest_entry_copied_by_reference_ratio=mutable_compaction_audit.untouched_file_ratio,
        compaction_audit_url=audit_url,
        hash_bucket_count=params.hash_bucket_count,
        hb_index_to_entry_range=hb_id_to_entry_indices_range,
        compactor_version=CompactorVersion.V2.value,
        input_inflation=input_inflation,
        input_average_record_size_bytes=input_average_record_size_bytes,
    )

    logger.info(
        f"Partition-{params.source_partition_locator.partition_values},"
        f"compacted at: {params.last_stream_position_to_compact},"
    )
    logger.info(
        f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
    )
    is_inplace_compacted: bool = (
        rcf_source_partition_locator.partition_values
        == params.destination_partition_locator.partition_values
        and rcf_source_partition_locator.stream_id
        == params.destination_partition_locator.stream_id
    )
    if is_inplace_compacted:
        logger.info(
            "Overriding round completion file source partition locator as in-place compacted. "
            + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
            f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
        )
        rcf_source_partition_locator = compacted_partition.locator

    round_completion_file_s3_url = rcf.write_round_completion_file(
        params.compaction_artifact_s3_bucket,
        rcf_source_partition_locator,
        compacted_partition.locator,
        new_round_completion_info,
        **params.s3_client_kwargs,
    )

    return ExecutionCompactionResult(
        compacted_partition,
        new_round_completion_info,
        round_completion_file_s3_url,
        is_inplace_compacted,
    )


def _commit_compaction_result(
    params: CompactPartitionParams,
    execute_compaction_result: ExecutionCompactionResult,
) -> None:
    compaction_session_type: str = (
        "INPLACE" if execute_compaction_result.is_inplace_compacted else "NON-INPLACE"
    )
    logger.info(
        f"Partition-{params.source_partition_locator} -> "
        f"{compaction_session_type} Compaction session data processing completed"
    )
    if execute_compaction_result.new_compacted_partition:
        previous_partition: Optional[Partition] = None
        if execute_compaction_result.is_inplace_compacted:
            previous_partition: Optional[
                Partition
            ] = params.deltacat_storage.get_partition(
                params.source_partition_locator.stream_locator,
                params.source_partition_locator.partition_values,
                **params.deltacat_storage_kwargs,
            )
            # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
        logger.info(
            f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
            f"using previous partition: {previous_partition.locator if previous_partition else None}"
        )
        committed_partition: Partition = params.deltacat_storage.commit_partition(
            execute_compaction_result.new_compacted_partition,
            previous_partition,
            **params.deltacat_storage_kwargs,
        )
        logger.info(f"Committed compacted partition: {committed_partition}")
    else:
        logger.warning("No new partition was committed during compaction.")

    logger.info(f"Completed compaction session for: {params.source_partition_locator}")
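The module above only defines private helpers; the public compaction session that orchestrates them is not part of this diff. As a rough, non-authoritative sketch, using hypothetical placeholder names (`audit`, `input_deltas`, `discovery_start`) for state the session layer would normally prepare, the call order implied by these signatures looks like this:

# Illustrative sketch only; not part of the wheel contents. It chains the new
# private helpers in the order implied by their signatures, with hypothetical
# placeholders for session-provided state.
def _sketch_compact_partition(params, audit, input_deltas, discovery_start):
    # 1. Load any prior round completion info and compacted delta manifest.
    prev_manifest, round_completion_info = _fetch_compaction_metadata(params)

    # 2. Normalize input deltas into uniform batches (splitting out DELETE deltas, if any).
    uniform_deltas, delete_strategy, delete_file_envelopes = _build_uniform_deltas(
        params, audit, input_deltas, discovery_start
    )

    # 3. Hash-bucket (skipped internally when hash_bucket_count == 1) and merge.
    merge_results, hb_telemetry_s, merge_telemetry_s, compacted_partition = _run_hash_and_merge(
        params,
        uniform_deltas,
        round_completion_info,
        delete_strategy,
        delete_file_envelopes,
        audit,
        prev_manifest,
    )

    # 4. Collect materialize results into a single merged delta.
    merged_delta, mat_results, hb_ranges = _process_merge_results(
        params, merge_results, audit
    )

    # 5. Finalize the audit; writing the new round completion file and
    #    committing the compacted partition would follow, using identifiers
    #    (delta locator, audit URL, write results) supplied by the session layer.
    _upload_compaction_audit(params, audit, round_completion_info)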
{deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=dvd9BOMviyQgIHFPVSN_kQV6dAuyud4WZ6kUJyuO9go,1778
 deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
 deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
@@ -65,6 +65,8 @@ deltacat/compute/compactor_v2/model/hash_bucket_result.py,sha256=EsY9BPPywhmxlcL
 deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViVO1SVljCj6f0B3MfB3hqtGm2S0s,7410
 deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
 deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
+deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=3ukQx50xH810XJu1KzKdxY95lGuZGerHhHTJ89ns-jg,27622
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
 deltacat/compute/compactor_v2/steps/merge.py,sha256=ukCn312igxq7jNiCn7a2Vzk309LKdYZ902HTcEZhjM4,21774
@@ -216,8 +218,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.14.dist-info/METADATA,sha256=orKTHhgUb74RXGlgcKhu-M36EmYd3-41AH7V1IP2jEI,1734
+deltacat-1.1.14.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+deltacat-1.1.14.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.14.dist-info/RECORD,,
{deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/LICENSE
File without changes

{deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/WHEEL
File without changes

{deltacat-1.1.13.dist-info → deltacat-1.1.14.dist-info}/top_level.txt
File without changes