deltacat 1.1.13__py3-none-any.whl → 1.1.14__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.13"
+__version__ = "1.1.14"
 
 
 __all__ = [
deltacat/compute/compactor_v2/private/compaction_utils.py ADDED
@@ -0,0 +1,716 @@
+import numpy as np
+import functools
+import logging
+import ray
+import time
+import json
+
+from deltacat.compute.compactor import (
+    HighWatermark,
+    RoundCompletionInfo,
+)
+from deltacat.aws import s3u as s3_utils
+from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
+    ExecutionCompactionResult,
+)
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    RemoteMergeFileGroupsProvider,
+)
+from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+
+from deltacat import logs
+from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor_v2.utils.merge import (
+    generate_local_merge_input,
+)
+from deltacat.compute.compactor_v2.utils.task_options import (
+    hash_bucket_resource_options_provider,
+)
+from deltacat.compute.compactor.utils import round_completion_file as rcf
+from deltacat.compute.compactor import DeltaAnnotated
+from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
+from deltacat.compute.compactor_v2.deletes.delete_strategy import (
+    DeleteStrategy,
+)
+from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+    DeleteFileEnvelope,
+)
+from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
+
+from deltacat.storage import (
+    Delta,
+    DeltaType,
+    Stream,
+    StreamLocator,
+    Partition,
+    Manifest,
+)
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
+from deltacat.utils.ray_utils.concurrency import (
+    invoke_parallel,
+    task_resource_options_provider,
+)
+from deltacat.compute.compactor_v2.steps import merge as mg
+from deltacat.compute.compactor_v2.steps import hash_bucket as hb
+from deltacat.compute.compactor_v2.utils import io
+
+from typing import Any, List, Optional
+from collections import defaultdict
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from deltacat.compute.compactor_v2.utils.task_options import (
+    merge_resource_options_provider,
+    local_merge_resource_options_provider,
+)
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def _fetch_compaction_metadata(
+    params: CompactPartitionParams,
+) -> tuple[Optional[Manifest], Optional[RoundCompletionInfo]]:
+
+    # read the results from any previously completed compaction round
+    round_completion_info: Optional[RoundCompletionInfo] = None
+    high_watermark: Optional[HighWatermark] = None
+    previous_compacted_delta_manifest: Optional[Manifest] = None
+
+    if not params.rebase_source_partition_locator:
+        round_completion_info = rcf.read_round_completion_file(
+            params.compaction_artifact_s3_bucket,
+            params.source_partition_locator,
+            params.destination_partition_locator,
+            **params.s3_client_kwargs,
+        )
+        if not round_completion_info:
+            logger.info(
+                "Both rebase partition and round completion file not found. Performing an entire backfill on source."
+            )
+        else:
+            compacted_delta_locator = round_completion_info.compacted_delta_locator
+
+            previous_compacted_delta_manifest = (
+                params.deltacat_storage.get_delta_manifest(
+                    compacted_delta_locator, **params.deltacat_storage_kwargs
+                )
+            )
+
+            high_watermark = round_completion_info.high_watermark
+            logger.info(f"Setting round completion high watermark: {high_watermark}")
+            assert (
+                params.hash_bucket_count == round_completion_info.hash_bucket_count
+            ), (
+                "The hash bucket count has changed. "
+                "Kindly run rebase compaction and trigger incremental again. "
+                f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
+                f"not equal to Hash bucket count in args={params.hash_bucket_count}."
+            )
+
+    logger.info(f"Round completion file: {round_completion_info}")
+    return (
+        previous_compacted_delta_manifest,
+        round_completion_info,
+    )
+
+
+def _build_uniform_deltas(
+    params: CompactPartitionParams,
+    mutable_compaction_audit,
+    input_deltas,
+    delta_discovery_start,
+) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition]:
+
+    delete_strategy: Optional[DeleteStrategy] = None
+    delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
+    delete_file_size_bytes: int = 0
+    if contains_delete_deltas(input_deltas):
+        input_deltas, delete_file_envelopes, delete_strategy = prepare_deletes(
+            params, input_deltas
+        )
+        for delete_file_envelope in delete_file_envelopes:
+            delete_file_size_bytes += delete_file_envelope.table_size_bytes
+        logger.info(
+            f" Input deltas contain {DeltaType.DELETE}-type deltas. Total delete file size={delete_file_size_bytes}."
+            f" Total length of delete file envelopes={len(delete_file_envelopes)}"
+        )
+    uniform_deltas: List[DeltaAnnotated] = io.create_uniform_input_deltas(
+        input_deltas=input_deltas,
+        hash_bucket_count=params.hash_bucket_count,
+        compaction_audit=mutable_compaction_audit,
+        deltacat_storage=params.deltacat_storage,
+        previous_inflation=params.previous_inflation,
+        min_delta_bytes=params.min_delta_bytes_in_batch,
+        min_file_counts=params.min_files_in_batch,
+        # disable input split during rebase as the rebase files are already uniform
+        enable_input_split=params.rebase_source_partition_locator is None,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+    )
+    delta_discovery_end: float = time.monotonic()
+
+    mutable_compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
+    mutable_compaction_audit.set_delta_discovery_time_in_seconds(
+        delta_discovery_end - delta_discovery_start
+    )
+
+    s3_utils.upload(
+        mutable_compaction_audit.audit_url,
+        str(json.dumps(mutable_compaction_audit)),
+        **params.s3_client_kwargs,
+    )
+
+    return (
+        uniform_deltas,
+        delete_strategy,
+        delete_file_envelopes,
+    )
+
+
+def _run_hash_and_merge(
+    params: CompactPartitionParams,
+    uniform_deltas,
+    round_completion_info,
+    delete_strategy,
+    delete_file_envelopes,
+    mutable_compaction_audit,
+    previous_compacted_delta_manifest,
+) -> tuple[
+    list[MergeResult], np.int64, np.float64, np.int64, np.int64, np.float64, Partition
+]:
+    # create a new stream for this round
+    compacted_stream_locator: Optional[
+        StreamLocator
+    ] = params.destination_partition_locator.stream_locator
+    compacted_stream: Stream = params.deltacat_storage.get_stream(
+        compacted_stream_locator.namespace,
+        compacted_stream_locator.table_name,
+        compacted_stream_locator.table_version,
+        **params.deltacat_storage_kwargs,
+    )
+    compacted_partition: Partition = params.deltacat_storage.stage_partition(
+        compacted_stream,
+        params.destination_partition_locator.partition_values,
+        **params.deltacat_storage_kwargs,
+    )
+
+    telemetry_time_hb = 0
+    total_input_records_count = np.int64(0)
+    total_hb_record_count = np.int64(0)
+    if params.hash_bucket_count == 1:
+        logger.info("Hash bucket count set to 1. Running local merge")
+        merge_start: float = time.monotonic()
+        merge_results, total_input_records_count = _run_local_merge(
+            params,
+            uniform_deltas,
+            compacted_partition,
+            round_completion_info,
+            delete_strategy,
+            delete_file_envelopes,
+            mutable_compaction_audit,
+            previous_compacted_delta_manifest,
+            total_input_records_count,
+        )
+        merge_invoke_end = time.monotonic()
+    else:
+        # hash bucket
+        hb_start = time.monotonic()
+        all_hash_group_idx_to_obj_id = defaultdict(list)
+        all_hash_group_idx_to_size_bytes = defaultdict(int)
+        all_hash_group_idx_to_num_rows = defaultdict(int)
+        (hb_results, hb_invoke_end) = _hash_bucket(params, uniform_deltas)
+        hb_end = time.monotonic()
+
+        # we use time.time() here because time.monotonic() has no reference point
+        # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+        # to compare time.time()s captured in different nodes.
+        hb_results_retrieved_at = time.time()
+
+        telemetry_time_hb = mutable_compaction_audit.save_step_stats(
+            CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+            hb_results,
+            hb_results_retrieved_at,
+            hb_invoke_end - hb_start,
+            hb_end - hb_start,
+        )
+
+        s3_utils.upload(
+            mutable_compaction_audit.audit_url,
+            str(json.dumps(mutable_compaction_audit)),
+            **params.s3_client_kwargs,
+        )
+
+        hb_data_processed_size_bytes = np.int64(0)
+
+        # initialize all hash groups
+        for hb_group in range(params.hash_group_count):
+            all_hash_group_idx_to_num_rows[hb_group] = 0
+            all_hash_group_idx_to_obj_id[hb_group] = []
+            all_hash_group_idx_to_size_bytes[hb_group] = 0
+
+        for hb_result in hb_results:
+            hb_data_processed_size_bytes += hb_result.hb_size_bytes
+            total_input_records_count += hb_result.hb_record_count
+
+            for hash_group_index, object_id_size_tuple in enumerate(
+                hb_result.hash_bucket_group_to_obj_id_tuple
+            ):
+                if object_id_size_tuple:
+                    all_hash_group_idx_to_obj_id[hash_group_index].append(
+                        object_id_size_tuple[0],
+                    )
+                    all_hash_group_idx_to_size_bytes[
+                        hash_group_index
+                    ] += object_id_size_tuple[1].item()
+                    all_hash_group_idx_to_num_rows[
+                        hash_group_index
+                    ] += object_id_size_tuple[2].item()
+
+        logger.info(
+            f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
+        )
+
+        total_hb_record_count = total_input_records_count
+        mutable_compaction_audit.set_hash_bucket_processed_size_bytes(
+            hb_data_processed_size_bytes.item()
+        )
+
+        # BSP Step 2: Merge
+        # NOTE: DELETE-type deltas are stored in Plasma object store
+        # in prepare_deletes and therefore don't need to included
+        # in merge task resource estimation
+        merge_start = time.monotonic()
+        merge_results, merge_invoke_end = _merge(
+            params,
+            task_resource_options_provider,
+            merge_resource_options_provider,
+            all_hash_group_idx_to_size_bytes,
+            all_hash_group_idx_to_num_rows,
+            round_completion_info,
+            previous_compacted_delta_manifest,
+            all_hash_group_idx_to_obj_id,
+            compacted_partition,
+            delete_strategy,
+            delete_file_envelopes,
+        )
+    logger.info(f"Got {len(merge_results)} merge results.")
+
+    merge_results_retrieved_at: float = time.time()
+    merge_end: float = time.monotonic()
+
+    total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
+    total_deleted_record_count = sum(
+        [ddr.deleted_record_count for ddr in merge_results]
+    )
+    logger.info(
+        f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
+    )
+
+    mutable_compaction_audit.set_input_records(total_input_records_count.item())
+
+    telemetry_time_merge = mutable_compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.MERGE_STEP_NAME,
+        merge_results,
+        merge_results_retrieved_at,
+        merge_invoke_end - merge_start,
+        merge_end - merge_start,
+    )
+
+    mutable_compaction_audit.set_records_deduped(total_dd_record_count.item())
+    mutable_compaction_audit.set_records_deleted(total_deleted_record_count.item())
+    record_info_msg: str = (
+        f"Hash bucket records: {total_hb_record_count},"
+        f" Deduped records: {total_dd_record_count}, "
+        f" Deleted records: {total_deleted_record_count}, "
+    )
+    logger.info(record_info_msg)
+    return (
+        merge_results,
+        telemetry_time_hb,
+        telemetry_time_merge,
+        compacted_partition,
+    )
+
+
+def _merge(
+    params: CompactPartitionParams,
+    task_resource_options_provider,
+    merge_resource_options_provider,
+    all_hash_group_idx_to_size_bytes,
+    all_hash_group_idx_to_num_rows,
+    round_completion_info,
+    previous_compacted_delta_manifest,
+    all_hash_group_idx_to_obj_id,
+    compacted_partition,
+    delete_strategy,
+    delete_file_envelopes,
+) -> tuple[List[MergeResult], float]:
+    merge_options_provider = functools.partial(
+        task_resource_options_provider,
+        pg_config=params.pg_config,
+        resource_amount_provider=merge_resource_options_provider,
+        num_hash_groups=params.hash_group_count,
+        hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
+        hash_group_num_rows=all_hash_group_idx_to_num_rows,
+        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
+        round_completion_info=round_completion_info,
+        compacted_delta_manifest=previous_compacted_delta_manifest,
+        primary_keys=params.primary_keys,
+        deltacat_storage=params.deltacat_storage,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        ray_custom_resources=params.ray_custom_resources,
+        memory_logs_enabled=params.memory_logs_enabled,
+    )
+
+    def merge_input_provider(index, item) -> dict[str, MergeInput]:
+        return {
+            "input": MergeInput.of(
+                merge_file_groups_provider=RemoteMergeFileGroupsProvider(
+                    hash_group_index=item[0],
+                    dfe_groups_refs=item[1],
+                    hash_bucket_count=params.hash_bucket_count,
+                    num_hash_groups=params.hash_group_count,
+                    object_store=params.object_store,
+                ),
+                write_to_partition=compacted_partition,
+                compacted_file_content_type=params.compacted_file_content_type,
+                primary_keys=params.primary_keys,
+                sort_keys=params.sort_keys,
+                merge_task_index=index,
+                drop_duplicates=params.drop_duplicates,
+                max_records_per_output_file=params.records_per_compacted_file,
+                enable_profiler=params.enable_profiler,
+                metrics_config=params.metrics_config,
+                s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+                read_kwargs_provider=params.read_kwargs_provider,
+                round_completion_info=round_completion_info,
+                object_store=params.object_store,
+                deltacat_storage=params.deltacat_storage,
+                deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                delete_strategy=delete_strategy,
+                delete_file_envelopes=delete_file_envelopes,
+                memory_logs_enabled=params.memory_logs_enabled,
+                disable_copy_by_reference=params.disable_copy_by_reference,
+            )
+        }
+
+    merge_tasks_pending = invoke_parallel(
+        items=all_hash_group_idx_to_obj_id.items(),
+        ray_task=mg.merge,
+        max_parallelism=params.task_max_parallelism,
+        options_provider=merge_options_provider,
+        kwargs_provider=merge_input_provider,
+    )
+    merge_invoke_end = time.monotonic()
+    logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
+    merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
+
+    return merge_results, merge_invoke_end
+
+
+def _hash_bucket(
+    params: CompactPartitionParams,
+    uniform_deltas,
+):
+    hb_options_provider = functools.partial(
+        task_resource_options_provider,
+        pg_config=params.pg_config,
+        resource_amount_provider=hash_bucket_resource_options_provider,
+        previous_inflation=params.previous_inflation,
+        average_record_size_bytes=params.average_record_size_bytes,
+        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
+        primary_keys=params.primary_keys,
+        ray_custom_resources=params.ray_custom_resources,
+        memory_logs_enabled=params.memory_logs_enabled,
+    )
+
+    def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
+        return {
+            "input": HashBucketInput.of(
+                item,
+                primary_keys=params.primary_keys,
+                hb_task_index=index,
+                num_hash_buckets=params.hash_bucket_count,
+                num_hash_groups=params.hash_group_count,
+                enable_profiler=params.enable_profiler,
+                metrics_config=params.metrics_config,
+                read_kwargs_provider=params.read_kwargs_provider,
+                object_store=params.object_store,
+                deltacat_storage=params.deltacat_storage,
+                deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                memory_logs_enabled=params.memory_logs_enabled,
+            )
+        }
+
+    hb_tasks_pending = invoke_parallel(
+        items=uniform_deltas,
+        ray_task=hb.hash_bucket,
+        max_parallelism=params.task_max_parallelism,
+        options_provider=hb_options_provider,
+        kwargs_provider=hash_bucket_input_provider,
+    )
+
+    hb_invoke_end = time.monotonic()
+
+    logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
+    hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
+    logger.info(f"Got {len(hb_results)} hash bucket results.")
+
+    return (hb_results, hb_invoke_end)
+
+
+def _run_local_merge(
+    params: CompactPartitionParams,
+    uniform_deltas,
+    compacted_partition,
+    round_completion_info,
+    delete_strategy,
+    delete_file_envelopes,
+    mutable_compaction_audit,
+    previous_compacted_delta_manifest,
+    total_input_records_count,
+) -> tuple[list[Any], Any]:
+    local_merge_input: MergeInput = generate_local_merge_input(
+        params,
+        uniform_deltas,
+        compacted_partition,
+        round_completion_info,
+        delete_strategy,
+        delete_file_envelopes,
+    )
+    estimated_da_bytes = (
+        mutable_compaction_audit.estimated_in_memory_size_bytes_during_discovery
+    )
+    estimated_num_records: int = sum(
+        [
+            entry.meta.record_count
+            for delta in uniform_deltas
+            for entry in delta.manifest.entries
+        ]
+    )
+    local_merge_options = local_merge_resource_options_provider(
+        estimated_da_size=estimated_da_bytes,
+        estimated_num_rows=estimated_num_records,
+        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
+        round_completion_info=round_completion_info,
+        compacted_delta_manifest=previous_compacted_delta_manifest,
+        ray_custom_resources=params.ray_custom_resources,
+        primary_keys=params.primary_keys,
+        memory_logs_enabled=params.memory_logs_enabled,
+    )
+    local_merge_result = ray.get(
+        mg.merge.options(**local_merge_options).remote(local_merge_input)
+    )
+    total_input_records_count += local_merge_result.input_record_count
+    merge_results = [local_merge_result]
+    return merge_results, total_input_records_count
+
+
+def _process_merge_results(
+    params: CompactPartitionParams, merge_results, mutable_compaction_audit
+) -> tuple[Delta, list[MaterializeResult], dict]:
+    mat_results = []
+    for merge_result in merge_results:
+        mat_results.extend(merge_result.materialize_results)
+
+    mat_results: List[MaterializeResult] = sorted(
+        mat_results, key=lambda m: m.task_index
+    )
+
+    hb_id_to_entry_indices_range = {}
+    file_index = 0
+    previous_task_index = -1
+
+    for mat_result in mat_results:
+        assert (
+            mat_result.pyarrow_write_result.files >= 1
+        ), "Atleast one file must be materialized"
+        assert (
+            mat_result.task_index != previous_task_index
+        ), f"Multiple materialize results found for a hash bucket: {mat_result.task_index}"
+
+        hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
+            file_index,
+            file_index + mat_result.pyarrow_write_result.files,
+        )
+
+        file_index += mat_result.pyarrow_write_result.files
+        previous_task_index = mat_result.task_index
+
+    s3_utils.upload(
+        mutable_compaction_audit.audit_url,
+        str(json.dumps(mutable_compaction_audit)),
+        **params.s3_client_kwargs,
+    )
+
+    deltas: List[Delta] = [m.delta for m in mat_results]
+
+    # Note: An appropriate last stream position must be set
+    # to avoid correctness issue.
+    merged_delta: Delta = Delta.merge_deltas(
+        deltas,
+        stream_position=params.last_stream_position_to_compact,
+    )
+
+    return merged_delta, mat_results, hb_id_to_entry_indices_range
+
+
+def _upload_compaction_audit(
+    params: CompactPartitionParams,
+    mutable_compaction_audit,
+    round_completion_info,
+) -> None:
+
+    # After all incremental delta related calculations, we update
+    # the input sizes to accommodate the compacted table
+    if round_completion_info:
+        mutable_compaction_audit.set_input_file_count(
+            (mutable_compaction_audit.input_file_count or 0)
+            + round_completion_info.compacted_pyarrow_write_result.files
+        )
+        mutable_compaction_audit.set_input_size_bytes(
+            (mutable_compaction_audit.input_size_bytes or 0.0)
+            + round_completion_info.compacted_pyarrow_write_result.file_bytes
+        )
+        mutable_compaction_audit.set_input_records(
+            (mutable_compaction_audit.input_records or 0)
+            + round_completion_info.compacted_pyarrow_write_result.records
+        )
+
+    s3_utils.upload(
+        mutable_compaction_audit.audit_url,
+        str(json.dumps(mutable_compaction_audit)),
+        **params.s3_client_kwargs,
+    )
+    return
+
+
+def _write_new_round_completion_file(
+    params: CompactPartitionParams,
+    mutable_compaction_audit,
+    compacted_partition,
+    audit_url,
+    hb_id_to_entry_indices_range,
+    rcf_source_partition_locator,
+    new_compacted_delta_locator,
+    pyarrow_write_result,
+) -> ExecutionCompactionResult:
+    input_inflation = None
+    input_average_record_size_bytes = None
+    # Note: we only consider inflation for incremental delta
+    if (
+        mutable_compaction_audit.input_size_bytes
+        and mutable_compaction_audit.hash_bucket_processed_size_bytes
+    ):
+        input_inflation = (
+            mutable_compaction_audit.hash_bucket_processed_size_bytes
+            / mutable_compaction_audit.input_size_bytes
+        )
+
+    if (
+        mutable_compaction_audit.hash_bucket_processed_size_bytes
+        and mutable_compaction_audit.input_records
+    ):
+        input_average_record_size_bytes = (
+            mutable_compaction_audit.hash_bucket_processed_size_bytes
+            / mutable_compaction_audit.input_records
+        )
+
+    logger.info(
+        f"The inflation of input deltas={input_inflation}"
+        f" and average record size={input_average_record_size_bytes}"
+    )
+
+    new_round_completion_info = RoundCompletionInfo.of(
+        high_watermark=params.last_stream_position_to_compact,
+        compacted_delta_locator=new_compacted_delta_locator,
+        compacted_pyarrow_write_result=pyarrow_write_result,
+        sort_keys_bit_width=params.bit_width_of_sort_keys,
+        manifest_entry_copied_by_reference_ratio=mutable_compaction_audit.untouched_file_ratio,
+        compaction_audit_url=audit_url,
+        hash_bucket_count=params.hash_bucket_count,
+        hb_index_to_entry_range=hb_id_to_entry_indices_range,
+        compactor_version=CompactorVersion.V2.value,
+        input_inflation=input_inflation,
+        input_average_record_size_bytes=input_average_record_size_bytes,
+    )
+
+    logger.info(
+        f"Partition-{params.source_partition_locator.partition_values},"
+        f"compacted at: {params.last_stream_position_to_compact},"
+    )
+    logger.info(
+        f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
+    )
+    is_inplace_compacted: bool = (
+        rcf_source_partition_locator.partition_values
+        == params.destination_partition_locator.partition_values
+        and rcf_source_partition_locator.stream_id
+        == params.destination_partition_locator.stream_id
+    )
+    if is_inplace_compacted:
+        logger.info(
+            "Overriding round completion file source partition locator as in-place compacted. "
+            + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
+            f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
+        )
+        rcf_source_partition_locator = compacted_partition.locator
+
+    round_completion_file_s3_url = rcf.write_round_completion_file(
+        params.compaction_artifact_s3_bucket,
+        rcf_source_partition_locator,
+        compacted_partition.locator,
+        new_round_completion_info,
+        **params.s3_client_kwargs,
+    )
+
+    return ExecutionCompactionResult(
+        compacted_partition,
+        new_round_completion_info,
+        round_completion_file_s3_url,
+        is_inplace_compacted,
+    )
+
+
+def _commit_compaction_result(
+    params: CompactPartitionParams,
+    execute_compaction_result: ExecutionCompactionResult,
+) -> None:
+    compaction_session_type: str = (
+        "INPLACE" if execute_compaction_result.is_inplace_compacted else "NON-INPLACE"
+    )
+    logger.info(
+        f"Partition-{params.source_partition_locator} -> "
+        f"{compaction_session_type} Compaction session data processing completed"
+    )
+    if execute_compaction_result.new_compacted_partition:
+        previous_partition: Optional[Partition] = None
+        if execute_compaction_result.is_inplace_compacted:
+            previous_partition: Optional[
+                Partition
+            ] = params.deltacat_storage.get_partition(
+                params.source_partition_locator.stream_locator,
+                params.source_partition_locator.partition_values,
+                **params.deltacat_storage_kwargs,
+            )
+        # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
+        logger.info(
+            f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
+            f"using previous partition: {previous_partition.locator if previous_partition else None}"
+        )
+        committed_partition: Partition = params.deltacat_storage.commit_partition(
+            execute_compaction_result.new_compacted_partition,
+            previous_partition,
+            **params.deltacat_storage_kwargs,
+        )
+        logger.info(f"Committed compacted partition: {committed_partition}")
+    else:
+        logger.warning("No new partition was committed during compaction.")
+
+    logger.info(f"Completed compaction session for: {params.source_partition_locator}")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.13
+Version: 1.1.14
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=v91QDtZ0JmhMu7T4rGn0ioNLAMn-0Ha7zC-0EACz2LM,1778
+deltacat/__init__.py,sha256=dvd9BOMviyQgIHFPVSN_kQV6dAuyud4WZ6kUJyuO9go,1778
 deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
 deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
@@ -65,6 +65,8 @@ deltacat/compute/compactor_v2/model/hash_bucket_result.py,sha256=EsY9BPPywhmxlcL
 deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViVO1SVljCj6f0B3MfB3hqtGm2S0s,7410
 deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
 deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
+deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=3ukQx50xH810XJu1KzKdxY95lGuZGerHhHTJ89ns-jg,27622
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
 deltacat/compute/compactor_v2/steps/merge.py,sha256=ukCn312igxq7jNiCn7a2Vzk309LKdYZ902HTcEZhjM4,21774
@@ -216,8 +218,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-1.1.13.dist-info/METADATA,sha256=vkq5iB16UJ0Ziot6trDCmsRgfKLoKmU_TC8B33M7qLs,1734
-deltacat-1.1.13.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-deltacat-1.1.13.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-1.1.13.dist-info/RECORD,,
+deltacat-1.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.14.dist-info/METADATA,sha256=orKTHhgUb74RXGlgcKhu-M36EmYd3-41AH7V1IP2jEI,1734
+deltacat-1.1.14.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+deltacat-1.1.14.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.14.dist-info/RECORD,,
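
As a side note on the RECORD hunks above, each entry has the form `path,sha256=<urlsafe base64 digest without padding>,<size in bytes>`. The following small, self-contained sketch (not part of the wheel) shows how such an entry is derived, which can be used to spot-check that a changed file's hash matches its RECORD line.

```python
import base64
import hashlib


def record_entry(path: str, data: bytes) -> str:
    # Hash the file contents, then encode the digest as urlsafe base64 with the
    # trailing "=" padding stripped, as wheel RECORD files do.
    digest = hashlib.sha256(data).digest()
    encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={encoded},{len(data)}"


if __name__ == "__main__":
    # With the real file contents, this would reproduce the line
    # "deltacat/__init__.py,sha256=dvd9BOMviyQgIHFPVSN_kQV6dAuyud4WZ6kUJyuO9go,1778".
    print(record_entry("deltacat/__init__.py", b"placeholder contents"))
```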