deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +184 -29
- deltacat/compute/compactor/model/compact_partition_params.py +153 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
- deltacat/compute/compactor/model/dedupe_result.py +3 -0
- deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
- deltacat/compute/compactor/model/delta_file_locator.py +11 -6
- deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
- deltacat/compute/compactor/model/materialize_result.py +27 -6
- deltacat/compute/compactor/model/round_completion_info.py +9 -0
- deltacat/compute/compactor/steps/dedupe.py +35 -19
- deltacat/compute/compactor/steps/hash_bucket.py +41 -16
- deltacat/compute/compactor/steps/materialize.py +73 -70
- deltacat/compute/compactor/utils/io.py +15 -0
- deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- deltacat/compute/compactor/utils/round_completion_file.py +13 -4
- deltacat/compute/compactor/utils/system_columns.py +32 -0
- deltacat/io/__init__.py +0 -7
- deltacat/io/file_object_store.py +48 -0
- deltacat/io/memcached_object_store.py +121 -0
- deltacat/io/object_store.py +51 -0
- deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat/io/redis_object_store.py +114 -0
- deltacat/io/s3_object_store.py +44 -0
- deltacat/storage/model/delta.py +2 -1
- deltacat/tests/compactor/test_compact_partition_params.py +237 -0
- deltacat/tests/compactor/utils/test_io.py +27 -5
- deltacat/tests/io/__init__.py +0 -0
- deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat/tests/io/test_s3_object_store.py +59 -0
- deltacat/tests/utils/test_record_batch_tables.py +1 -1
- deltacat/tests/utils/test_resources.py +9 -0
- deltacat/utils/ray_utils/concurrency.py +0 -2
- deltacat/utils/resources.py +30 -18
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
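The headline change in this release is a pluggable object-store layer under `deltacat/io` (a shared `object_store.py` interface plus file, memcached, Ray plasma, Redis, and S3 backends), which the compactor diff below threads through its hash-bucket, dedupe, and materialize steps as a new `object_store` argument defaulting to `RayPlasmaObjectStore()`. The interface module itself is not shown in this excerpt, so the following is only an illustrative sketch of the pattern; the class names and the `put_many`/`get_many` methods are assumptions, not deltacat's actual API.

```python
# Illustrative sketch only: the real interface lives in deltacat/io/object_store.py,
# which is not included in this excerpt. Method and class names here are assumptions.
from typing import Any, List

import ray


class ObjectStoreSketch:
    """Hypothetical minimal contract: put a batch of objects, get them back by reference."""

    def put_many(self, objects: List[object]) -> List[Any]:
        raise NotImplementedError

    def get_many(self, refs: List[Any]) -> List[object]:
        raise NotImplementedError


class RayPlasmaStoreSketch(ObjectStoreSketch):
    """Backs the sketch with Ray's shared-memory object store via ray.put / ray.get."""

    def put_many(self, objects: List[object]) -> List[Any]:
        return [ray.put(obj) for obj in objects]

    def get_many(self, refs: List[Any]) -> List[object]:
        return ray.get(list(refs))
```

The value of such an abstraction is visible in the compaction diff below: swapping the default Ray plasma store for the memcached, Redis, file, or S3 implementation changes where intermediate hash-bucket and dedupe outputs live without touching the step code.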
deltacat/compute/compactor/compaction_session.py
CHANGED
@@ -3,6 +3,10 @@ from contextlib import nullcontext
 import functools
 import logging
 import ray
+import time
+import json
+from deltacat.aws import s3u as s3_utils
+import deltacat
 from deltacat import logs
 import pyarrow as pa
 from deltacat.compute.compactor import (
@@ -12,6 +16,9 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.io.object_store import IObjectStore
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.storage import (
     Delta,
@@ -20,6 +27,9 @@ from deltacat.storage import (
     PartitionLocator,
     interface as unimplemented_deltacat_storage,
 )
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     round_robin_options_provider,
@@ -37,7 +47,11 @@ from deltacat.utils.placement import PlacementGroupConfig
 from typing import List, Set, Optional, Tuple, Dict, Any
 from collections import defaultdict
 from deltacat.utils.metrics import MetricsConfig
-from deltacat.
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
+

 if importlib.util.find_spec("memray"):
     import memray
@@ -100,6 +114,8 @@ def compact_partition(
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
+    s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
@@ -139,6 +155,8 @@ def compact_partition(
             list_deltas_kwargs,
             read_kwargs_provider,
             s3_table_writer_kwargs,
+            object_store,
+            s3_client_kwargs,
             deltacat_storage,
             **kwargs,
         )
@@ -184,10 +202,29 @@ def _execute_compaction_round(
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    object_store: Optional[IObjectStore],
+    s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

+    rcf_source_partition_locator = (
+        rebase_source_partition_locator
+        if rebase_source_partition_locator
+        else source_partition_locator
+    )
+
+    base_audit_url = rcf_source_partition_locator.path(
+        f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
+    )
+    audit_url = f"{base_audit_url}.json"
+
+    logger.info(f"Compaction audit will be written to {audit_url}")
+
+    compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)
+
+    compaction_start = time.monotonic()
+
     if not primary_keys:
         # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
         # with normalized manifest entry sizes
@@ -230,6 +267,7 @@ def _execute_compaction_round(
         f"{node_resource_keys}"
     )

+    compaction_audit.set_cluster_cpu_max(cluster_cpus)
     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
     round_robin_opt_provider = functools.partial(
@@ -257,6 +295,13 @@ def _execute_compaction_round(
     )
     logger.info(f"Round completion file: {round_completion_info}")

+    enable_manifest_entry_copy_by_reference = (
+        False if rebase_source_partition_locator else True
+    )
+    logger.info(
+        f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+    )
+
     # discover input delta files
     # For rebase:
     # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -268,6 +313,7 @@ def _execute_compaction_round(
         round_completion_info.high_watermark if round_completion_info else None
     )

+    delta_discovery_start = time.monotonic()
     (
         input_deltas,
         previous_last_stream_position_compacted_on_destination_table,
@@ -282,6 +328,17 @@ def _execute_compaction_round(
         **list_deltas_kwargs,
     )

+    delta_discovery_end = time.monotonic()
+    compaction_audit.set_delta_discovery_time_in_seconds(
+        delta_discovery_end - delta_discovery_start
+    )
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
+
     if not input_deltas:
         logger.info("No input deltas found to compact.")
         return None, None, None
@@ -298,6 +355,7 @@ def _execute_compaction_round(
         io.fit_input_deltas(
             input_deltas,
             cluster_resources,
+            compaction_audit,
             hash_bucket_count,
             deltacat_storage=deltacat_storage,
         )
@@ -307,11 +365,14 @@ def _execute_compaction_round(
             cluster_resources,
             hash_bucket_count,
             min_hash_bucket_chunk_size,
+            compaction_audit=compaction_audit,
             input_deltas_stats=input_deltas_stats,
             deltacat_storage=deltacat_storage,
         )
     )

+    compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
+
     assert hash_bucket_count is not None and hash_bucket_count > 0, (
         f"Expected hash bucket count to be a positive integer, but found "
         f"`{hash_bucket_count}`"
@@ -335,6 +396,8 @@ def _execute_compaction_round(
         "Multiple rounds are not supported. Please increase the cluster size and run again."
     )

+    hb_start = time.monotonic()
+
     hb_tasks_pending = invoke_parallel(
         items=uniform_deltas,
         ray_task=hb.hash_bucket,
@@ -348,11 +411,32 @@ def _execute_compaction_round(
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+
+    hb_invoke_end = time.monotonic()
+
     logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
     hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
     logger.info(f"Got {len(hb_results)} hash bucket results.")
+    hb_end = time.monotonic()
+    hb_results_retrieved_at = time.time()
+
+    telemetry_time_hb = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+        hb_results,
+        hb_results_retrieved_at,
+        hb_invoke_end - hb_start,
+        hb_end - hb_start,
+    )
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
+
     all_hash_group_idx_to_obj_id = defaultdict(list)
     for hb_result in hb_results:
         for hash_group_index, object_id in enumerate(
@@ -367,6 +451,8 @@ def _execute_compaction_round(
         f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
     )

+    compaction_audit.set_input_records(total_hb_record_count.item())
+
     # TODO (pdames): when resources are freed during the last round of hash
     # bucketing, start running dedupe tasks that read existing dedupe
     # output from S3 then wait for hash bucketing to finish before continuing
@@ -389,10 +475,18 @@ def _execute_compaction_round(
     # identify the index of records to keep or drop based on sort keys
     num_materialize_buckets = max_parallelism
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
+
+    dedupe_start = time.monotonic()
+    dd_max_parallelism = int(
+        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+    )
+    logger.info(
+        f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+    )
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=
+        max_parallelism=dd_max_parallelism,
         options_provider=round_robin_opt_provider,
         kwargs_provider=lambda index, item: {
             "dedupe_task_index": index,
@@ -402,12 +496,33 @@ def _execute_compaction_round(
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
+        object_store=object_store,
     )
+
+    dedupe_invoke_end = time.monotonic()
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
     dd_results: List[DedupeResult] = ray.get(dd_tasks_pending)
     logger.info(f"Got {len(dd_results)} dedupe results.")
+
+    # we use time.time() here because time.monotonic() has no reference point
+    # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+    # to compare time.time()s captured in different nodes.
+    dedupe_results_retrieved_at = time.time()
+    dedupe_end = time.monotonic()
+
     total_dd_record_count = sum([ddr.deduped_record_count for ddr in dd_results])
     logger.info(f"Deduped {total_dd_record_count} records...")
+
+    telemetry_time_dd = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.DEDUPE_STEP_NAME,
+        dd_results,
+        dedupe_results_retrieved_at,
+        dedupe_invoke_end - dedupe_start,
+        dedupe_end - dedupe_start,
+    )
+
+    compaction_audit.set_records_deduped(total_dd_record_count.item())
+
     all_mat_buckets_to_obj_id = defaultdict(list)
     for dd_result in dd_results:
         for (
@@ -420,6 +535,8 @@ def _execute_compaction_round(
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

+    compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
+
     # TODO(pdames): when resources are freed during the last round of deduping
     # start running materialize tasks that read materialization source file
     # tables from S3 then wait for deduping to finish before continuing
@@ -432,6 +549,15 @@ def _execute_compaction_round(

     # parallel step 3:
     # materialize records to keep by index
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
+
+    materialize_start = time.monotonic()
+
     mat_tasks_pending = invoke_parallel(
         items=all_mat_buckets_to_obj_id.items(),
         ray_task=mat.materialize,
@@ -445,38 +571,34 @@ def _execute_compaction_round(
         round_completion_info=round_completion_info,
         source_partition_locator=source_partition_locator,
         partition=partition,
+        enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+
+    materialize_invoke_end = time.monotonic()
+
     logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
-    mat_results = ray.get(mat_tasks_pending)
-    total_count_of_src_dfl_not_touched = sum(
-        m.count_of_src_dfl_not_touched for m in mat_results
-    )
-    total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
-    logger.info(
-        f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
-    )
-    logger.info(
-        f"Got total of {total_length_src_dfl} manifest files during compaction."
-    )
-    manifest_entry_copied_by_reference_ratio = (
-        (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
-        if total_length_src_dfl != 0
-        else None
-    )
-    logger.info(
-        f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
-    )
+    mat_results: List[MaterializeResult] = ray.get(mat_tasks_pending)

     logger.info(f"Got {len(mat_results)} materialize result(s).")

-
+    materialize_end = time.monotonic()
+    materialize_results_retrieved_at = time.time()
+
+    telemetry_time_materialize = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.MATERIALIZE_STEP_NAME,
+        mat_results,
+        materialize_results_retrieved_at,
+        materialize_invoke_end - materialize_start,
+        materialize_end - materialize_start,
+    )

     mat_results = sorted(mat_results, key=lambda m: m.task_index)
     deltas = [m.delta for m in mat_results]
@@ -494,6 +616,7 @@ def _execute_compaction_round(
         f" Materialized records: {merged_delta.meta.record_count}"
     )
     logger.info(record_info_msg)
+
     assert (
         total_hb_record_count - total_dd_record_count == merged_delta.meta.record_count
     ), (
@@ -506,6 +629,9 @@ def _execute_compaction_round(
     )
     logger.info(f"Committed compacted delta: {compacted_delta}")

+    compaction_end = time.monotonic()
+    compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
+
     new_compacted_delta_locator = DeltaLocator.of(
         new_compacted_partition_locator,
         compacted_delta.stream_position,
@@ -516,26 +642,55 @@ def _execute_compaction_round(
         if round_completion_info
         else None
    )
+
+    pyarrow_write_result = PyArrowWriteResult.union(
+        [m.pyarrow_write_result for m in mat_results]
+    )
+
+    session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
+    compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
+        session_peak_memory
+    )
+
+    compaction_audit.save_round_completion_stats(
+        mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
+    )
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
+
     new_round_completion_info = RoundCompletionInfo.of(
         last_stream_position_compacted,
         new_compacted_delta_locator,
-
+        pyarrow_write_result,
         bit_width_of_sort_keys,
         last_rebase_source_partition_locator,
-
-
-    rcf_source_partition_locator = (
-        rebase_source_partition_locator
-        if rebase_source_partition_locator
-        else source_partition_locator
+        compaction_audit.untouched_file_ratio,
+        audit_url,
     )
+
     logger.info(
         f"partition-{source_partition_locator.partition_values},"
         f"compacted at: {last_stream_position_compacted},"
         f"last position: {last_stream_position_to_compact}"
     )
+
     return (
         partition,
         new_round_completion_info,
         rcf_source_partition_locator,
     )
+
+
+def compact_partition_from_request(
+    compact_partition_params: CompactPartitionParams,
+) -> Optional[str]:
+    """
+    Wrapper for compact_partition that allows for the compact_partition parameters to be
+    passed in as a custom dictionary-like CompactPartitionParams object.
+    :param compact_partition_params:
+    """
+    return compact_partition(**compact_partition_params)
deltacat/compute/compactor/model/compact_partition_params.py
ADDED
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+import copy
+import json
+from typing import Any, Dict, List, Optional
+
+from deltacat.types.media import ContentType
+
+
+class CompactPartitionParams(dict):
+    """
+    This class represents the parameters passed to compact_partition (deltacat/compute/compactor/compaction_session.py)
+    """
+
+    @staticmethod
+    def of(params: Optional[Dict]) -> CompactPartitionParams:
+        if params is None:
+            params = {}
+        compact_partition_params = CompactPartitionParams()
+        compact_partition_params["destination_partition_locator"] = params.get(
+            "destination_partition_locator"
+        )
+        compact_partition_params["last_stream_position_to_compact"] = params.get(
+            "last_stream_position_to_compact"
+        )
+        compact_partition_params["source_partition_locator"] = params.get(
+            "source_partition_locator"
+        )
+        compact_partition_params["primary_keys"] = params.get("primary_keys")
+        compact_partition_params["rebase_source_partition_locator"] = params.get(
+            "rebase_source_partition_locator"
+        )
+        compact_partition_params["rebase_source_partition_high_watermark"] = params.get(
+            "rebase_source_partition_high_watermark"
+        )
+        compact_partition_params["hash_bucket_count"] = params.get("hash_bucket_count")
+        compact_partition_params["deltacat_storage"] = params.get("deltacat_storage")
+        compact_partition_params["compaction_artifact_s3_bucket"] = params.get(
+            "compaction_artifact_s3_bucket"
+        )
+        compact_partition_params["properties"] = params.get("properties")
+        compact_partition_params["compacted_file_content_type"] = params.get(
+            "compacted_file_content_type"
+        )
+        compact_partition_params["list_deltas_kwargs"] = params.get(
+            "list_deltas_kwargs"
+        )
+        compact_partition_params["pg_config"] = params.get("pg_config")
+        compact_partition_params["read_kwargs_provider"] = params.get(
+            "read_kwargs_provider"
+        )
+        compact_partition_params["s3_table_writer_kwargs"] = params.get(
+            "s3_table_writer_kwargs"
+        )
+        return compact_partition_params
+
+    @property
+    def destination_partition_locator(self) -> Optional[dict]:
+        return self["destination_partition_locator"]
+
+    @property
+    def last_stream_position_to_compact(self) -> Optional[int]:
+        return self["last_stream_position_to_compact"]
+
+    @property
+    def source_partition_locator(self) -> Optional[dict]:
+        return self["source_partition_locator"]
+
+    @property
+    def primary_keys(self) -> Optional[List[str]]:
+        return list(self["primary_keys"])
+
+    @property
+    def rebase_source_partition_locator(self) -> Optional[dict]:
+        return self["rebase_source_partition_locator"]
+
+    @property
+    def rebase_source_partition_high_watermark(self) -> Optional[int]:
+        return self["rebase_source_partition_high_watermark"]
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hash_bucket_count"]
+
+    @property
+    def deltacat_storage(self) -> Optional[str]:
+        return self["deltacat_storage"]
+
+    @property
+    def compaction_artifact_s3_bucket(self) -> Optional[str]:
+        return self["compaction_artifact_s3_bucket"]
+
+    @property
+    def properties(self) -> Optional[Dict[str, str]]:
+        return self["properties"]
+
+    @property
+    def compacted_file_content_type(self) -> Optional[ContentType]:
+        return self["compacted_file_content_type"]
+
+    @property
+    def list_deltas_kwargs(self) -> Optional[dict]:
+        return self["list_deltas_kwargs"]
+
+    @property
+    def pg_config(self) -> Optional[Any]:
+        return self["pg_config"]
+
+    @property
+    def read_kwargs_provider(self) -> Optional[Any]:
+        return self["read_kwargs_provider"]
+
+    @property
+    def s3_table_writer_kwargs(self) -> Optional[Any]:
+        return self["s3_table_writer_kwargs"]
+
+    @staticmethod
+    def json_handler_for_compact_partition_params(obj):
+        """
+        A handler for the `json.dumps()` function that can be used to serialize sets to JSON.
+        If the `set_default()` handler is passed as the `default` argument to the `json.dumps()` function, it will be called whenever a set object is encountered.
+        The `set_default()` handler will then serialize the set as a list.
+        """
+        try:
+            if isinstance(obj, set):
+                return list(obj)
+            elif hasattr(obj, "toJSON"):
+                return obj.toJSON()
+            else:
+                return obj.__dict__
+        except Exception:
+            return obj.__class__.__name__
+
+    def serialize(self) -> str:
+        """
+        Serializes itself to a json-formatted string
+
+        Returns:
+            The serialized object.
+
+        """
+        to_serialize: Dict[str, Any] = {}
+        # individually try deepcopy the values from the self dictionary and just use the class name for the value when it is not possible to deepcopy
+        for attr, value in self.items():
+            try:
+                to_serialize[attr] = copy.deepcopy(value)
+            except Exception:  # if unable to deep copy the objects like module objects for example then just provide the class name at minimum
+                to_serialize[attr] = value.__class__.__name__
+        serialized_arguments_compact_partition_args: str = json.dumps(
+            to_serialize,
+            default=CompactPartitionParams.json_handler_for_compact_partition_params,
+        )
+        return serialized_arguments_compact_partition_args