deltacat 1.1.12__py3-none-any.whl → 1.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
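All of the hunks below appear to come from the compactor v2 compaction session module: the inline hash-bucket, merge, audit-upload, and round-completion-file logic that previously lived in _execute_compaction (plus the partition-commit block in compact_partition) moves into private helpers imported from deltacat.compute.compactor_v2.private.compaction_utils. As an orientation aid, here is a condensed outline of the refactored _execute_compaction, reconstructed only from the added lines in this diff; audit bookkeeping, locator setup, and the argument lists marked with "..." are elided, so treat it as a sketch rather than the released source:

    def _execute_compaction(params, **kwargs):
        # round completion file info + previous compacted delta manifest, if any
        previous_manifest, round_completion_info = _fetch_compaction_metadata(params)
        input_deltas = io.discover_deltas(...)  # discovery step unchanged; args elided
        if not input_deltas:
            return ExecutionCompactionResult(None, None, None, False)
        # split/annotate input deltas and prepare DELETE handling
        uniform_deltas, delete_strategy, delete_file_envelopes = _build_uniform_deltas(
            params, compaction_audit, input_deltas, delta_discovery_start
        )
        # hash bucket + merge (returns the staged compacted partition)
        merge_results, t_hb, t_merge, compacted_partition = _run_hash_and_merge(
            params, uniform_deltas, round_completion_info, delete_strategy,
            delete_file_envelopes, compaction_audit, previous_manifest,
        )
        # fold materialize results into one merged delta plus entry-index ranges
        merged_delta, mat_results, hb_id_to_entry_indices_range = _process_merge_results(
            params, merge_results, compaction_audit
        )
        compacted_delta = params.deltacat_storage.commit_delta(merged_delta, ...)
        _upload_compaction_audit(params, compaction_audit, round_completion_info)
        return _write_new_round_completion_file(
            params, compaction_audit, compacted_partition, audit_url,
            hb_id_to_entry_indices_range, rcf_source_partition_locator,
            new_compacted_delta_locator, pyarrow_write_result,
        )

compact_partition itself now delegates its old commit-and-log block to _commit_compaction_result(params, execute_compaction_result) before returning the round completion file S3 URL.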
@@ -1,82 +1,58 @@
+import numpy as np
 import importlib
 from contextlib import nullcontext
-import numpy as np
-import functools
 import logging
-import ray
 import time
-import json
-
-from deltacat.compute.compactor_v2.model.merge_file_group import (
-    RemoteMergeFileGroupsProvider,
-)
-from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
-
-from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+import ray

-from deltacat.aws import s3u as s3_utils
 import deltacat
-from deltacat import logs
 from deltacat.compute.compactor import (
-    HighWatermark,
     PyArrowWriteResult,
     RoundCompletionInfo,
 )
-from deltacat.compute.compactor_v2.model.merge_result import MergeResult
-from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat import logs
 from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
     ExecutionCompactionResult,
 )
-from deltacat.compute.compactor.model.materialize_result import MaterializeResult
-from deltacat.compute.compactor_v2.utils.merge import (
-    generate_local_merge_input,
-)
+from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor import DeltaAnnotated
-from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
 from deltacat.compute.compactor_v2.deletes.delete_strategy import (
     DeleteStrategy,
 )
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
     DeleteFileEnvelope,
 )
-from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
-
 from deltacat.storage import (
     Delta,
     DeltaLocator,
-    DeltaType,
     Manifest,
     Partition,
-    Stream,
-    StreamLocator,
 )
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
-from deltacat.utils.ray_utils.concurrency import (
-    invoke_parallel,
-    task_resource_options_provider,
+from deltacat.utils.resources import (
+    get_current_process_peak_memory_usage_in_bytes,
+)
+from deltacat.compute.compactor_v2.private.compaction_utils import (
+    _fetch_compaction_metadata,
+    _build_uniform_deltas,
+    _run_hash_and_merge,
+    _process_merge_results,
+    _upload_compaction_audit,
+    _write_new_round_completion_file,
+    _commit_compaction_result,
 )
-from deltacat.compute.compactor_v2.steps import merge as mg
-from deltacat.compute.compactor_v2.steps import hash_bucket as hb
-from deltacat.compute.compactor_v2.utils import io
-from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.utils.metrics import metrics
-
-from typing import List, Optional
-from collections import defaultdict
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
-from deltacat.utils.resources import (
-    get_current_process_peak_memory_usage_in_bytes,
-)
-from deltacat.compute.compactor_v2.utils.task_options import (
-    hash_bucket_resource_options_provider,
-    merge_resource_options_provider,
-    local_merge_resource_options_provider,
-)
-from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+
+from typing import List, Optional
+from deltacat.compute.compactor_v2.utils import io
 from deltacat.exceptions import categorize_errors
 from deltacat.compute.compactor_v2.constants import COMPACT_PARTITION_METRIC_PREFIX

@@ -101,118 +77,50 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
             params,
             **kwargs,
         )
-        compaction_session_type: str = (
-            "INPLACE"
-            if execute_compaction_result.is_inplace_compacted
-            else "NON-INPLACE"
-        )
-        logger.info(
-            f"Partition-{params.source_partition_locator} -> "
-            f"{compaction_session_type} Compaction session data processing completed"
-        )
-        if execute_compaction_result.new_compacted_partition:
-            previous_partition: Optional[Partition] = None
-            if execute_compaction_result.is_inplace_compacted:
-                previous_partition: Optional[
-                    Partition
-                ] = params.deltacat_storage.get_partition(
-                    params.source_partition_locator.stream_locator,
-                    params.source_partition_locator.partition_values,
-                    **params.deltacat_storage_kwargs,
-                )
-            # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
-            logger.info(
-                f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
-                f"using previous partition: {previous_partition.locator if previous_partition else None}"
-            )
-            committed_partition: Partition = params.deltacat_storage.commit_partition(
-                execute_compaction_result.new_compacted_partition,
-                previous_partition,
-                **params.deltacat_storage_kwargs,
-            )
-            logger.info(f"Committed compacted partition: {committed_partition}")
-        else:
-            logger.warning("No new partition was committed during compaction.")
-
-        logger.info(
-            f"Completed compaction session for: {params.source_partition_locator}"
-        )
+        _commit_compaction_result(params, execute_compaction_result)
         return execute_compaction_result.round_completion_file_s3_url


 def _execute_compaction(
     params: CompactPartitionParams, **kwargs
 ) -> ExecutionCompactionResult:
-
-    rcf_source_partition_locator = (
+    compaction_start_time: float = time.monotonic()
+    # Fetch round completion info for previously compacted partition, if it exists
+    fetch_compaction_metadata_result: tuple[
+        Optional[Manifest], Optional[RoundCompletionInfo]
+    ] = _fetch_compaction_metadata(params)
+    (
+        previous_compacted_delta_manifest,
+        round_completion_info,
+    ) = fetch_compaction_metadata_result
+    rcf_source_partition_locator: rcf.PartitionLocator = (
         params.rebase_source_partition_locator or params.source_partition_locator
     )

-    base_audit_url = rcf_source_partition_locator.path(
+    base_audit_url: str = rcf_source_partition_locator.path(
         f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
     )
-    audit_url = f"{base_audit_url}.json"
+    audit_url: str = f"{base_audit_url}.json"
     logger.info(f"Compaction audit will be written to {audit_url}")
-    compaction_audit = (
+    compaction_audit: CompactionSessionAuditInfo = (
         CompactionSessionAuditInfo(deltacat.__version__, ray.__version__, audit_url)
         .set_hash_bucket_count(params.hash_bucket_count)
         .set_compactor_version(CompactorVersion.V2.value)
     )

-    compaction_start = time.monotonic()
-
-    task_max_parallelism: int = params.task_max_parallelism
-
     if params.pg_config:
         logger.info(
             "pg_config specified. Tasks will be scheduled in a placement group."
         )
         cluster_resources = params.pg_config.resource
-        cluster_cpus = cluster_resources["CPU"]
         cluster_memory = cluster_resources["memory"]
-        task_max_parallelism = cluster_cpus
         compaction_audit.set_total_cluster_memory_bytes(cluster_memory)
-
-    # read the results from any previously completed compaction round
-    round_completion_info: Optional[RoundCompletionInfo] = None
-    high_watermark: Optional[HighWatermark] = None
-    previous_compacted_delta_manifest: Optional[Manifest] = None
-
-    if not params.rebase_source_partition_locator:
-        round_completion_info = rcf.read_round_completion_file(
-            params.compaction_artifact_s3_bucket,
-            params.source_partition_locator,
-            params.destination_partition_locator,
-            **params.s3_client_kwargs,
-        )
-        if not round_completion_info:
-            logger.info(
-                "Both rebase partition and round completion file not found. Performing an entire backfill on source."
-            )
-        else:
-            compacted_delta_locator = round_completion_info.compacted_delta_locator
-
-            previous_compacted_delta_manifest = (
-                params.deltacat_storage.get_delta_manifest(
-                    compacted_delta_locator, **params.deltacat_storage_kwargs
-                )
-            )
-
-            high_watermark = round_completion_info.high_watermark
-            logger.info(f"Setting round completion high watermark: {high_watermark}")
-            assert (
-                params.hash_bucket_count == round_completion_info.hash_bucket_count
-            ), (
-                "The hash bucket count has changed. "
-                "Kindly run rebase compaction and trigger incremental again. "
-                f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
-                f"not equal to Hash bucket count in args={params.hash_bucket_count}."
-            )
-
-        logger.info(f"Round completion file: {round_completion_info}")
-
+    high_watermark = (
+        round_completion_info.high_watermark if round_completion_info else None
+    )
+    audit_url = compaction_audit.audit_url if compaction_audit else None
+    # discover and build uniform deltas
     delta_discovery_start = time.monotonic()
-
     input_deltas: List[Delta] = io.discover_deltas(
         params.source_partition_locator,
         params.last_stream_position_to_compact,
@@ -226,348 +134,46 @@ def _execute_compaction(
     if not input_deltas:
         logger.info("No input deltas found to compact.")
         return ExecutionCompactionResult(None, None, None, False)
-
-    delete_strategy: Optional[DeleteStrategy] = None
-    delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
-    delete_file_size_bytes: int = 0
-    if contains_delete_deltas(input_deltas):
-        input_deltas, delete_file_envelopes, delete_strategy = prepare_deletes(
-            params, input_deltas
-        )
-        for delete_file_envelope in delete_file_envelopes:
-            delete_file_size_bytes += delete_file_envelope.table_size_bytes
-        logger.info(
-            f" Input deltas contain {DeltaType.DELETE}-type deltas. Total delete file size={delete_file_size_bytes}."
-            f" Total length of delete file envelopes={len(delete_file_envelopes)}"
-        )
-    uniform_deltas: List[DeltaAnnotated] = io.create_uniform_input_deltas(
-        input_deltas=input_deltas,
-        hash_bucket_count=params.hash_bucket_count,
-        compaction_audit=compaction_audit,
-        deltacat_storage=params.deltacat_storage,
-        previous_inflation=params.previous_inflation,
-        min_delta_bytes=params.min_delta_bytes_in_batch,
-        min_file_counts=params.min_files_in_batch,
-        # disable input split during rebase as the rebase files are already uniform
-        enable_input_split=params.rebase_source_partition_locator is None,
-        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-    )
-
-    delta_discovery_end = time.monotonic()
-
-    compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
-    compaction_audit.set_delta_discovery_time_in_seconds(
-        delta_discovery_end - delta_discovery_start
-    )
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    # create a new stream for this round
-    compacted_stream_locator: Optional[
-        StreamLocator
-    ] = params.destination_partition_locator.stream_locator
-    compacted_stream: Stream = params.deltacat_storage.get_stream(
-        compacted_stream_locator.namespace,
-        compacted_stream_locator.table_name,
-        compacted_stream_locator.table_version,
-        **params.deltacat_storage_kwargs,
-    )
-    compacted_partition: Partition = params.deltacat_storage.stage_partition(
-        compacted_stream,
-        params.destination_partition_locator.partition_values,
-        **params.deltacat_storage_kwargs,
-    )
-
-    hb_options_provider = functools.partial(
-        task_resource_options_provider,
-        pg_config=params.pg_config,
-        resource_amount_provider=hash_bucket_resource_options_provider,
-        previous_inflation=params.previous_inflation,
-        average_record_size_bytes=params.average_record_size_bytes,
-        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
-        primary_keys=params.primary_keys,
-        ray_custom_resources=params.ray_custom_resources,
-        memory_logs_enabled=params.memory_logs_enabled,
-    )
-
-    total_input_records_count = np.int64(0)
-    total_hb_record_count = np.int64(0)
-    telemetry_time_hb = 0
-    if params.hash_bucket_count == 1:
-        logger.info("Hash bucket count set to 1. Running local merge")
-        merge_start = time.monotonic()
-        local_merge_input = generate_local_merge_input(
-            params,
-            uniform_deltas,
-            compacted_partition,
-            round_completion_info,
-            delete_strategy,
-            delete_file_envelopes,
-        )
-        estimated_da_bytes = (
-            compaction_audit.estimated_in_memory_size_bytes_during_discovery
-        )
-        estimated_num_records = sum(
-            [
-                entry.meta.record_count
-                for delta in uniform_deltas
-                for entry in delta.manifest.entries
-            ]
-        )
-        local_merge_options = local_merge_resource_options_provider(
-            estimated_da_size=estimated_da_bytes,
-            estimated_num_rows=estimated_num_records,
-            total_memory_buffer_percentage=params.total_memory_buffer_percentage,
-            round_completion_info=round_completion_info,
-            compacted_delta_manifest=previous_compacted_delta_manifest,
-            ray_custom_resources=params.ray_custom_resources,
-            primary_keys=params.primary_keys,
-            memory_logs_enabled=params.memory_logs_enabled,
-        )
-        local_merge_result = ray.get(
-            mg.merge.options(**local_merge_options).remote(local_merge_input)
-        )
-        total_input_records_count += local_merge_result.input_record_count
-        merge_results = [local_merge_result]
-        merge_invoke_end = time.monotonic()
-    else:
-        hb_start = time.monotonic()
-
-        def hash_bucket_input_provider(index, item):
-            return {
-                "input": HashBucketInput.of(
-                    item,
-                    primary_keys=params.primary_keys,
-                    hb_task_index=index,
-                    num_hash_buckets=params.hash_bucket_count,
-                    num_hash_groups=params.hash_group_count,
-                    enable_profiler=params.enable_profiler,
-                    metrics_config=params.metrics_config,
-                    read_kwargs_provider=params.read_kwargs_provider,
-                    object_store=params.object_store,
-                    deltacat_storage=params.deltacat_storage,
-                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-                    memory_logs_enabled=params.memory_logs_enabled,
-                )
-            }
-
-        all_hash_group_idx_to_obj_id = defaultdict(list)
-        all_hash_group_idx_to_size_bytes = defaultdict(int)
-        all_hash_group_idx_to_num_rows = defaultdict(int)
-        hb_tasks_pending = invoke_parallel(
-            items=uniform_deltas,
-            ray_task=hb.hash_bucket,
-            max_parallelism=task_max_parallelism,
-            options_provider=hb_options_provider,
-            kwargs_provider=hash_bucket_input_provider,
-        )
-
-        hb_invoke_end = time.monotonic()
-
-        logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
-        hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
-        logger.info(f"Got {len(hb_results)} hash bucket results.")
-        hb_end = time.monotonic()
-
-        # we use time.time() here because time.monotonic() has no reference point
-        # whereas time.time() measures epoch seconds. Hence, it will be reasonable
-        # to compare time.time()s captured in different nodes.
-        hb_results_retrieved_at = time.time()
-
-        telemetry_time_hb = compaction_audit.save_step_stats(
-            CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
-            hb_results,
-            hb_results_retrieved_at,
-            hb_invoke_end - hb_start,
-            hb_end - hb_start,
-        )
-
-        s3_utils.upload(
-            compaction_audit.audit_url,
-            str(json.dumps(compaction_audit)),
-            **params.s3_client_kwargs,
-        )
-
-        hb_data_processed_size_bytes = np.int64(0)
-
-        # initialize all hash groups
-        for hb_group in range(params.hash_group_count):
-            all_hash_group_idx_to_num_rows[hb_group] = 0
-            all_hash_group_idx_to_obj_id[hb_group] = []
-            all_hash_group_idx_to_size_bytes[hb_group] = 0
-
-        for hb_result in hb_results:
-            hb_data_processed_size_bytes += hb_result.hb_size_bytes
-            total_input_records_count += hb_result.hb_record_count
-
-            for hash_group_index, object_id_size_tuple in enumerate(
-                hb_result.hash_bucket_group_to_obj_id_tuple
-            ):
-                if object_id_size_tuple:
-                    all_hash_group_idx_to_obj_id[hash_group_index].append(
-                        object_id_size_tuple[0],
-                    )
-                    all_hash_group_idx_to_size_bytes[
-                        hash_group_index
-                    ] += object_id_size_tuple[1].item()
-                    all_hash_group_idx_to_num_rows[
-                        hash_group_index
-                    ] += object_id_size_tuple[2].item()
-
-        logger.info(
-            f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
-        )
-
-        total_hb_record_count = total_input_records_count
-        compaction_audit.set_hash_bucket_processed_size_bytes(
-            hb_data_processed_size_bytes.item()
-        )
-
-        # BSP Step 2: Merge
-        # NOTE: DELETE-type deltas are stored in Plasma object store
-        # in prepare_deletes and therefore don't need to included
-        # in merge task resource estimation
-        merge_options_provider = functools.partial(
-            task_resource_options_provider,
-            pg_config=params.pg_config,
-            resource_amount_provider=merge_resource_options_provider,
-            num_hash_groups=params.hash_group_count,
-            hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
-            hash_group_num_rows=all_hash_group_idx_to_num_rows,
-            total_memory_buffer_percentage=params.total_memory_buffer_percentage,
-            round_completion_info=round_completion_info,
-            compacted_delta_manifest=previous_compacted_delta_manifest,
-            primary_keys=params.primary_keys,
-            deltacat_storage=params.deltacat_storage,
-            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-            ray_custom_resources=params.ray_custom_resources,
-            memory_logs_enabled=params.memory_logs_enabled,
-        )
-
-        def merge_input_provider(index, item):
-            return {
-                "input": MergeInput.of(
-                    merge_file_groups_provider=RemoteMergeFileGroupsProvider(
-                        hash_group_index=item[0],
-                        dfe_groups_refs=item[1],
-                        hash_bucket_count=params.hash_bucket_count,
-                        num_hash_groups=params.hash_group_count,
-                        object_store=params.object_store,
-                    ),
-                    write_to_partition=compacted_partition,
-                    compacted_file_content_type=params.compacted_file_content_type,
-                    primary_keys=params.primary_keys,
-                    sort_keys=params.sort_keys,
-                    merge_task_index=index,
-                    drop_duplicates=params.drop_duplicates,
-                    max_records_per_output_file=params.records_per_compacted_file,
-                    enable_profiler=params.enable_profiler,
-                    metrics_config=params.metrics_config,
-                    s3_table_writer_kwargs=params.s3_table_writer_kwargs,
-                    read_kwargs_provider=params.read_kwargs_provider,
-                    round_completion_info=round_completion_info,
-                    object_store=params.object_store,
-                    deltacat_storage=params.deltacat_storage,
-                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-                    delete_strategy=delete_strategy,
-                    delete_file_envelopes=delete_file_envelopes,
-                    memory_logs_enabled=params.memory_logs_enabled,
-                    disable_copy_by_reference=params.disable_copy_by_reference,
-                )
-            }
-
-        merge_start = time.monotonic()
-        merge_tasks_pending = invoke_parallel(
-            items=all_hash_group_idx_to_obj_id.items(),
-            ray_task=mg.merge,
-            max_parallelism=task_max_parallelism,
-            options_provider=merge_options_provider,
-            kwargs_provider=merge_input_provider,
-        )
-        merge_invoke_end = time.monotonic()
-        logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
-        merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
-
-        logger.info(f"Got {len(merge_results)} merge results.")
-
-        merge_results_retrieved_at = time.time()
-        merge_end = time.monotonic()
-
-    total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
-    total_deleted_record_count = sum(
-        [ddr.deleted_record_count for ddr in merge_results]
-    )
-    logger.info(
-        f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
-    )
-
-    compaction_audit.set_input_records(total_input_records_count.item())
-
-    telemetry_time_merge = compaction_audit.save_step_stats(
-        CompactionSessionAuditInfo.MERGE_STEP_NAME,
+    build_uniform_deltas_result: tuple[
+        List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition
+    ] = _build_uniform_deltas(
+        params, compaction_audit, input_deltas, delta_discovery_start
+    )
+    (
+        uniform_deltas,
+        delete_strategy,
+        delete_file_envelopes,
+    ) = build_uniform_deltas_result
+
+    # run merge
+    _run_hash_and_merge_result: tuple[
+        Optional[List[MergeResult]],
+        np.float64,
+        np.float64,
+        Partition,
+    ] = _run_hash_and_merge(
+        params,
+        uniform_deltas,
+        round_completion_info,
+        delete_strategy,
+        delete_file_envelopes,
+        compaction_audit,
+        previous_compacted_delta_manifest,
+    )
+    (
         merge_results,
-        merge_results_retrieved_at,
-        merge_invoke_end - merge_start,
-        merge_end - merge_start,
-    )
-
-    compaction_audit.set_records_deduped(total_dd_record_count.item())
-    compaction_audit.set_records_deleted(total_deleted_record_count.item())
-    mat_results = []
-    for merge_result in merge_results:
-        mat_results.extend(merge_result.materialize_results)
-
-    mat_results: List[MaterializeResult] = sorted(
-        mat_results, key=lambda m: m.task_index
-    )
-
-    hb_id_to_entry_indices_range = {}
-    file_index = 0
-    previous_task_index = -1
-
-    for mat_result in mat_results:
-        assert (
-            mat_result.pyarrow_write_result.files >= 1
-        ), "Atleast one file must be materialized"
-        assert (
-            mat_result.task_index != previous_task_index
-        ), f"Multiple materialize results found for a hash bucket: {mat_result.task_index}"
-
-        hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
-            file_index,
-            file_index + mat_result.pyarrow_write_result.files,
-        )
-
-        file_index += mat_result.pyarrow_write_result.files
-        previous_task_index = mat_result.task_index
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    deltas = [m.delta for m in mat_results]
-
-    # Note: An appropriate last stream position must be set
-    # to avoid correctness issue.
-    merged_delta: Delta = Delta.merge_deltas(
-        deltas,
-        stream_position=params.last_stream_position_to_compact,
-    )
-
-    record_info_msg = (
-        f"Hash bucket records: {total_hb_record_count},"
-        f" Deduped records: {total_dd_record_count}, "
-        f" Deleted records: {total_deleted_record_count}, "
-        f" Materialized records: {merged_delta.meta.record_count}"
-    )
+        telemetry_time_hb,
+        telemetry_time_merge,
+        compacted_partition,
+    ) = _run_hash_and_merge_result
+    # process merge results
+    process_merge_results: tuple[
+        Delta, list[MaterializeResult], dict
+    ] = _process_merge_results(params, merge_results, compaction_audit)
+    merged_delta, mat_results, hb_id_to_entry_indices_range = process_merge_results
+    # Record information, logging, and return ExecutionCompactionResult
+    record_info_msg: str = f" Materialized records: {merged_delta.meta.record_count}"
     logger.info(record_info_msg)
-
     compacted_delta: Delta = params.deltacat_storage.commit_delta(
         merged_delta,
         properties=kwargs.get("properties", {}),
@@ -575,16 +181,15 @@ def _execute_compaction(
     )

     logger.info(f"Committed compacted delta: {compacted_delta}")
-
-    compaction_end = time.monotonic()
-    compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
-
-    new_compacted_delta_locator = DeltaLocator.of(
+    compaction_end_time: float = time.monotonic()
+    compaction_audit.set_compaction_time_in_seconds(
+        compaction_end_time - compaction_start_time
+    )
+    new_compacted_delta_locator: DeltaLocator = DeltaLocator.of(
         compacted_partition.locator,
         compacted_delta.stream_position,
     )
-
-    pyarrow_write_result = PyArrowWriteResult.union(
+    pyarrow_write_result: PyArrowWriteResult = PyArrowWriteResult.union(
         [m.pyarrow_write_result for m in mat_results]
     )

@@ -597,100 +202,19 @@ def _execute_compaction(
         mat_results, telemetry_time_hb + telemetry_time_merge
     )

-    input_inflation = None
-    input_average_record_size_bytes = None
-    # Note: we only consider inflation for incremental delta
-    if (
-        compaction_audit.input_size_bytes
-        and compaction_audit.hash_bucket_processed_size_bytes
-    ):
-        input_inflation = (
-            compaction_audit.hash_bucket_processed_size_bytes
-            / compaction_audit.input_size_bytes
-        )
-
-    if (
-        compaction_audit.hash_bucket_processed_size_bytes
-        and compaction_audit.input_records
-    ):
-        input_average_record_size_bytes = (
-            compaction_audit.hash_bucket_processed_size_bytes
-            / compaction_audit.input_records
-        )
-
-    logger.info(
-        f"The inflation of input deltas={input_inflation}"
-        f" and average record size={input_average_record_size_bytes}"
+    _upload_compaction_audit(
+        params,
+        compaction_audit,
+        round_completion_info,
     )
-
-    # After all incremental delta related calculations, we update
-    # the input sizes to accommodate the compacted table
-    if round_completion_info:
-        compaction_audit.set_input_file_count(
-            (compaction_audit.input_file_count or 0)
-            + round_completion_info.compacted_pyarrow_write_result.files
-        )
-        compaction_audit.set_input_size_bytes(
-            (compaction_audit.input_size_bytes or 0.0)
-            + round_completion_info.compacted_pyarrow_write_result.file_bytes
-        )
-        compaction_audit.set_input_records(
-            (compaction_audit.input_records or 0)
-            + round_completion_info.compacted_pyarrow_write_result.records
-        )
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    new_round_completion_info = RoundCompletionInfo.of(
-        high_watermark=params.last_stream_position_to_compact,
-        compacted_delta_locator=new_compacted_delta_locator,
-        compacted_pyarrow_write_result=pyarrow_write_result,
-        sort_keys_bit_width=params.bit_width_of_sort_keys,
-        manifest_entry_copied_by_reference_ratio=compaction_audit.untouched_file_ratio,
-        compaction_audit_url=audit_url,
-        hash_bucket_count=params.hash_bucket_count,
-        hb_index_to_entry_range=hb_id_to_entry_indices_range,
-        compactor_version=CompactorVersion.V2.value,
-        input_inflation=input_inflation,
-        input_average_record_size_bytes=input_average_record_size_bytes,
-    )
-
-    logger.info(
-        f"Partition-{params.source_partition_locator.partition_values},"
-        f"compacted at: {params.last_stream_position_to_compact},"
-    )
-    logger.info(
-        f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
-    )
-    is_inplace_compacted: bool = (
-        rcf_source_partition_locator.partition_values
-        == params.destination_partition_locator.partition_values
-        and rcf_source_partition_locator.stream_id
-        == params.destination_partition_locator.stream_id
-    )
-    if is_inplace_compacted:
-        logger.info(
-            "Overriding round completion file source partition locator as in-place compacted. "
-            + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
-            f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
-        )
-        rcf_source_partition_locator = compacted_partition.locator
-
-    round_completion_file_s3_url = rcf.write_round_completion_file(
-        params.compaction_artifact_s3_bucket,
-        rcf_source_partition_locator,
-        compacted_partition.locator,
-        new_round_completion_info,
-        **params.s3_client_kwargs,
-    )
-
-    return ExecutionCompactionResult(
+    compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
+        params,
+        compaction_audit,
         compacted_partition,
-        new_round_completion_info,
-        round_completion_file_s3_url,
-        is_inplace_compacted,
+        audit_url,
+        hb_id_to_entry_indices_range,
+        rcf_source_partition_locator,
+        new_compacted_delta_locator,
+        pyarrow_write_result,
     )
+    return compaction_result
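For context, a hypothetical call into the refactored entry point might look like the sketch below. None of it is taken from the package: the module path, the CompactPartitionParams.of(dict) construction, and the key names are assumptions inferred from the params attribute accesses visible in the diff above, and the locator/storage values are placeholders you would supply from your own catalog.

    # Hypothetical usage sketch -- not from the released package.
    # CompactPartitionParams.of() accepting a dict with these keys is an assumption
    # inferred from the params.<attr> accesses shown in the diff above.
    from deltacat.compute.compactor.model.compact_partition_params import (
        CompactPartitionParams,
    )
    from deltacat.compute.compactor_v2.compaction_session import (  # module path assumed
        compact_partition,
    )

    params = CompactPartitionParams.of(
        {
            "source_partition_locator": source_partition_locator,  # placeholder PartitionLocator
            "destination_partition_locator": destination_partition_locator,  # placeholder
            "last_stream_position_to_compact": last_stream_position,  # placeholder stream position
            "hash_bucket_count": 1,
            "compaction_artifact_s3_bucket": "my-compaction-artifacts",  # placeholder bucket
            "deltacat_storage": deltacat_storage_impl,  # placeholder storage implementation
        }
    )
    # compact_partition returns the S3 URL of the round completion file for this round.
    round_completion_file_s3_url = compact_partition(params)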