deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +184 -29
  3. deltacat/compute/compactor/model/compact_partition_params.py +153 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
  5. deltacat/compute/compactor/model/dedupe_result.py +3 -0
  6. deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
  7. deltacat/compute/compactor/model/delta_file_locator.py +11 -6
  8. deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
  9. deltacat/compute/compactor/model/materialize_result.py +27 -6
  10. deltacat/compute/compactor/model/round_completion_info.py +9 -0
  11. deltacat/compute/compactor/steps/dedupe.py +35 -19
  12. deltacat/compute/compactor/steps/hash_bucket.py +41 -16
  13. deltacat/compute/compactor/steps/materialize.py +73 -70
  14. deltacat/compute/compactor/utils/io.py +15 -0
  15. deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  16. deltacat/compute/compactor/utils/round_completion_file.py +13 -4
  17. deltacat/compute/compactor/utils/system_columns.py +32 -0
  18. deltacat/io/__init__.py +0 -7
  19. deltacat/io/file_object_store.py +48 -0
  20. deltacat/io/memcached_object_store.py +121 -0
  21. deltacat/io/object_store.py +51 -0
  22. deltacat/io/ray_plasma_object_store.py +23 -0
  23. deltacat/io/redis_object_store.py +114 -0
  24. deltacat/io/s3_object_store.py +44 -0
  25. deltacat/storage/model/delta.py +2 -1
  26. deltacat/tests/compactor/test_compact_partition_params.py +237 -0
  27. deltacat/tests/compactor/utils/test_io.py +27 -5
  28. deltacat/tests/io/__init__.py +0 -0
  29. deltacat/tests/io/test_file_object_store.py +86 -0
  30. deltacat/tests/io/test_memcached_object_store.py +158 -0
  31. deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  32. deltacat/tests/io/test_redis_object_store.py +103 -0
  33. deltacat/tests/io/test_s3_object_store.py +59 -0
  34. deltacat/tests/utils/test_record_batch_tables.py +1 -1
  35. deltacat/tests/utils/test_resources.py +9 -0
  36. deltacat/utils/ray_utils/concurrency.py +0 -2
  37. deltacat/utils/resources.py +30 -18
  38. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
  39. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
  40. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
  41. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
  42. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode

  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "0.1.18b3"
+ __version__ = "0.1.18b7"


  __all__ = [
deltacat/compute/compactor/compaction_session.py CHANGED
@@ -3,6 +3,10 @@ from contextlib import nullcontext
  import functools
  import logging
  import ray
+ import time
+ import json
+ from deltacat.aws import s3u as s3_utils
+ import deltacat
  from deltacat import logs
  import pyarrow as pa
  from deltacat.compute.compactor import (
@@ -12,6 +16,9 @@ from deltacat.compute.compactor import (
  )
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
  from deltacat.compute.stats.models.delta_stats import DeltaStats
  from deltacat.storage import (
  Delta,
@@ -20,6 +27,9 @@ from deltacat.storage import (
  PartitionLocator,
  interface as unimplemented_deltacat_storage,
  )
+ from deltacat.compute.compactor.model.compact_partition_params import (
+ CompactPartitionParams,
+ )
  from deltacat.utils.ray_utils.concurrency import (
  invoke_parallel,
  round_robin_options_provider,
@@ -37,7 +47,11 @@ from deltacat.utils.placement import PlacementGroupConfig
  from typing import List, Set, Optional, Tuple, Dict, Any
  from collections import defaultdict
  from deltacat.utils.metrics import MetricsConfig
- from deltacat.utils.resources import log_current_cluster_utilization
+ from deltacat.compute.compactor.model.compaction_session_audit_info import (
+ CompactionSessionAuditInfo,
+ )
+ from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
+

  if importlib.util.find_spec("memray"):
  import memray
@@ -100,6 +114,8 @@ def compact_partition(
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
  s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+ object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
+ s3_client_kwargs: Optional[Dict[str, Any]] = None,
  deltacat_storage=unimplemented_deltacat_storage,
  **kwargs,
  ) -> Optional[str]:
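
The new object_store argument shown above defaults to a Ray Plasma store; this release also adds file, memcached, Redis, and S3 implementations of the same IObjectStore interface under deltacat/io (see the file list above). A minimal sketch of passing the store explicitly, with every other compact_partition argument elided:

    # Illustrative sketch only: RayPlasmaObjectStore() mirrors the parameter default above;
    # any other IObjectStore implementation from deltacat/io could be passed in its place.
    from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

    object_store = RayPlasmaObjectStore()  # same behavior as omitting the argument
    # compact_partition(..., object_store=object_store)  # remaining arguments elided
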
@@ -139,6 +155,8 @@ def compact_partition(
  list_deltas_kwargs,
  read_kwargs_provider,
  s3_table_writer_kwargs,
+ object_store,
+ s3_client_kwargs,
  deltacat_storage,
  **kwargs,
  )
@@ -184,10 +202,29 @@ def _execute_compaction_round(
  list_deltas_kwargs: Optional[Dict[str, Any]],
  read_kwargs_provider: Optional[ReadKwargsProvider],
  s3_table_writer_kwargs: Optional[Dict[str, Any]],
+ object_store: Optional[IObjectStore],
+ s3_client_kwargs: Optional[Dict[str, Any]],
  deltacat_storage=unimplemented_deltacat_storage,
  **kwargs,
  ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

+ rcf_source_partition_locator = (
+ rebase_source_partition_locator
+ if rebase_source_partition_locator
+ else source_partition_locator
+ )
+
+ base_audit_url = rcf_source_partition_locator.path(
+ f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
+ )
+ audit_url = f"{base_audit_url}.json"
+
+ logger.info(f"Compaction audit will be written to {audit_url}")
+
+ compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)
+
+ compaction_start = time.monotonic()
+
  if not primary_keys:
  # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
  # with normalized manifest entry sizes
@@ -230,6 +267,7 @@ def _execute_compaction_round(
  f"{node_resource_keys}"
  )

+ compaction_audit.set_cluster_cpu_max(cluster_cpus)
  # create a remote options provider to round-robin tasks across all nodes or allocated bundles
  logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
  round_robin_opt_provider = functools.partial(
@@ -257,6 +295,13 @@ def _execute_compaction_round(
  )
  logger.info(f"Round completion file: {round_completion_info}")

+ enable_manifest_entry_copy_by_reference = (
+ False if rebase_source_partition_locator else True
+ )
+ logger.info(
+ f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+ )
+
  # discover input delta files
  # For rebase:
  # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -268,6 +313,7 @@ def _execute_compaction_round(
  round_completion_info.high_watermark if round_completion_info else None
  )

+ delta_discovery_start = time.monotonic()
  (
  input_deltas,
  previous_last_stream_position_compacted_on_destination_table,
@@ -282,6 +328,17 @@ def _execute_compaction_round(
  **list_deltas_kwargs,
  )

+ delta_discovery_end = time.monotonic()
+ compaction_audit.set_delta_discovery_time_in_seconds(
+ delta_discovery_end - delta_discovery_start
+ )
+
+ s3_utils.upload(
+ compaction_audit.audit_url,
+ str(json.dumps(compaction_audit)),
+ **s3_client_kwargs,
+ )
+
  if not input_deltas:
  logger.info("No input deltas found to compact.")
  return None, None, None
@@ -298,6 +355,7 @@ def _execute_compaction_round(
  io.fit_input_deltas(
  input_deltas,
  cluster_resources,
+ compaction_audit,
  hash_bucket_count,
  deltacat_storage=deltacat_storage,
  )
@@ -307,11 +365,14 @@ def _execute_compaction_round(
  cluster_resources,
  hash_bucket_count,
  min_hash_bucket_chunk_size,
+ compaction_audit=compaction_audit,
  input_deltas_stats=input_deltas_stats,
  deltacat_storage=deltacat_storage,
  )
  )

+ compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
+
  assert hash_bucket_count is not None and hash_bucket_count > 0, (
  f"Expected hash bucket count to be a positive integer, but found "
  f"`{hash_bucket_count}`"
@@ -335,6 +396,8 @@ def _execute_compaction_round(
  "Multiple rounds are not supported. Please increase the cluster size and run again."
  )

+ hb_start = time.monotonic()
+
  hb_tasks_pending = invoke_parallel(
  items=uniform_deltas,
  ray_task=hb.hash_bucket,
@@ -348,11 +411,32 @@ def _execute_compaction_round(
  enable_profiler=enable_profiler,
  metrics_config=metrics_config,
  read_kwargs_provider=read_kwargs_provider,
+ object_store=object_store,
  deltacat_storage=deltacat_storage,
  )
+
+ hb_invoke_end = time.monotonic()
+
  logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
  hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
  logger.info(f"Got {len(hb_results)} hash bucket results.")
+ hb_end = time.monotonic()
+ hb_results_retrieved_at = time.time()
+
+ telemetry_time_hb = compaction_audit.save_step_stats(
+ CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+ hb_results,
+ hb_results_retrieved_at,
+ hb_invoke_end - hb_start,
+ hb_end - hb_start,
+ )
+
+ s3_utils.upload(
+ compaction_audit.audit_url,
+ str(json.dumps(compaction_audit)),
+ **s3_client_kwargs,
+ )
+
  all_hash_group_idx_to_obj_id = defaultdict(list)
  for hb_result in hb_results:
  for hash_group_index, object_id in enumerate(
@@ -367,6 +451,8 @@ def _execute_compaction_round(
  f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
  )

+ compaction_audit.set_input_records(total_hb_record_count.item())
+
  # TODO (pdames): when resources are freed during the last round of hash
  # bucketing, start running dedupe tasks that read existing dedupe
  # output from S3 then wait for hash bucketing to finish before continuing
@@ -389,10 +475,18 @@ def _execute_compaction_round(
  # identify the index of records to keep or drop based on sort keys
  num_materialize_buckets = max_parallelism
  logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
+
+ dedupe_start = time.monotonic()
+ dd_max_parallelism = int(
+ max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+ )
+ logger.info(
+ f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+ )
  dd_tasks_pending = invoke_parallel(
  items=all_hash_group_idx_to_obj_id.values(),
  ray_task=dd.dedupe,
- max_parallelism=max_parallelism,
+ max_parallelism=dd_max_parallelism,
  options_provider=round_robin_opt_provider,
  kwargs_provider=lambda index, item: {
  "dedupe_task_index": index,
@@ -402,12 +496,33 @@ def _execute_compaction_round(
  num_materialize_buckets=num_materialize_buckets,
  enable_profiler=enable_profiler,
  metrics_config=metrics_config,
+ object_store=object_store,
  )
+
+ dedupe_invoke_end = time.monotonic()
  logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
  dd_results: List[DedupeResult] = ray.get(dd_tasks_pending)
  logger.info(f"Got {len(dd_results)} dedupe results.")
+
+ # we use time.time() here because time.monotonic() has no reference point
+ # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+ # to compare time.time()s captured in different nodes.
+ dedupe_results_retrieved_at = time.time()
+ dedupe_end = time.monotonic()
+
  total_dd_record_count = sum([ddr.deduped_record_count for ddr in dd_results])
  logger.info(f"Deduped {total_dd_record_count} records...")
+
+ telemetry_time_dd = compaction_audit.save_step_stats(
+ CompactionSessionAuditInfo.DEDUPE_STEP_NAME,
+ dd_results,
+ dedupe_results_retrieved_at,
+ dedupe_invoke_end - dedupe_start,
+ dedupe_end - dedupe_start,
+ )
+
+ compaction_audit.set_records_deduped(total_dd_record_count.item())
+
  all_mat_buckets_to_obj_id = defaultdict(list)
  for dd_result in dd_results:
  for (
@@ -420,6 +535,8 @@ def _execute_compaction_round(
  logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
  logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

+ compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
+
  # TODO(pdames): when resources are freed during the last round of deduping
  # start running materialize tasks that read materialization source file
  # tables from S3 then wait for deduping to finish before continuing
@@ -432,6 +549,15 @@ def _execute_compaction_round(

  # parallel step 3:
  # materialize records to keep by index
+
+ s3_utils.upload(
+ compaction_audit.audit_url,
+ str(json.dumps(compaction_audit)),
+ **s3_client_kwargs,
+ )
+
+ materialize_start = time.monotonic()
+
  mat_tasks_pending = invoke_parallel(
  items=all_mat_buckets_to_obj_id.items(),
  ray_task=mat.materialize,
@@ -445,38 +571,34 @@ def _execute_compaction_round(
  round_completion_info=round_completion_info,
  source_partition_locator=source_partition_locator,
  partition=partition,
+ enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
  max_records_per_output_file=records_per_compacted_file,
  compacted_file_content_type=compacted_file_content_type,
  enable_profiler=enable_profiler,
  metrics_config=metrics_config,
  read_kwargs_provider=read_kwargs_provider,
  s3_table_writer_kwargs=s3_table_writer_kwargs,
+ object_store=object_store,
  deltacat_storage=deltacat_storage,
  )
+
+ materialize_invoke_end = time.monotonic()
+
  logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
- mat_results = ray.get(mat_tasks_pending)
- total_count_of_src_dfl_not_touched = sum(
- m.count_of_src_dfl_not_touched for m in mat_results
- )
- total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
- logger.info(
- f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
- )
- logger.info(
- f"Got total of {total_length_src_dfl} manifest files during compaction."
- )
- manifest_entry_copied_by_reference_ratio = (
- (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
- if total_length_src_dfl != 0
- else None
- )
- logger.info(
- f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
- )
+ mat_results: List[MaterializeResult] = ray.get(mat_tasks_pending)

  logger.info(f"Got {len(mat_results)} materialize result(s).")

- log_current_cluster_utilization(log_identifier="post_materialize")
+ materialize_end = time.monotonic()
+ materialize_results_retrieved_at = time.time()
+
+ telemetry_time_materialize = compaction_audit.save_step_stats(
+ CompactionSessionAuditInfo.MATERIALIZE_STEP_NAME,
+ mat_results,
+ materialize_results_retrieved_at,
+ materialize_invoke_end - materialize_start,
+ materialize_end - materialize_start,
+ )

  mat_results = sorted(mat_results, key=lambda m: m.task_index)
  deltas = [m.delta for m in mat_results]
@@ -494,6 +616,7 @@ def _execute_compaction_round(
  f" Materialized records: {merged_delta.meta.record_count}"
  )
  logger.info(record_info_msg)
+
  assert (
  total_hb_record_count - total_dd_record_count == merged_delta.meta.record_count
  ), (
@@ -506,6 +629,9 @@ def _execute_compaction_round(
  )
  logger.info(f"Committed compacted delta: {compacted_delta}")

+ compaction_end = time.monotonic()
+ compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
+
  new_compacted_delta_locator = DeltaLocator.of(
  new_compacted_partition_locator,
  compacted_delta.stream_position,
@@ -516,26 +642,55 @@ def _execute_compaction_round(
  if round_completion_info
  else None
  )
+
+ pyarrow_write_result = PyArrowWriteResult.union(
+ [m.pyarrow_write_result for m in mat_results]
+ )
+
+ session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
+ compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
+ session_peak_memory
+ )
+
+ compaction_audit.save_round_completion_stats(
+ mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
+ )
+
+ s3_utils.upload(
+ compaction_audit.audit_url,
+ str(json.dumps(compaction_audit)),
+ **s3_client_kwargs,
+ )
+
  new_round_completion_info = RoundCompletionInfo.of(
  last_stream_position_compacted,
  new_compacted_delta_locator,
- PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
+ pyarrow_write_result,
  bit_width_of_sort_keys,
  last_rebase_source_partition_locator,
- manifest_entry_copied_by_reference_ratio,
- )
- rcf_source_partition_locator = (
- rebase_source_partition_locator
- if rebase_source_partition_locator
- else source_partition_locator
+ compaction_audit.untouched_file_ratio,
+ audit_url,
  )
+
  logger.info(
  f"partition-{source_partition_locator.partition_values},"
  f"compacted at: {last_stream_position_compacted},"
  f"last position: {last_stream_position_to_compact}"
  )
+
  return (
  partition,
  new_round_completion_info,
  rcf_source_partition_locator,
  )
+
+
+ def compact_partition_from_request(
+ compact_partition_params: CompactPartitionParams,
+ ) -> Optional[str]:
+ """
+ Wrapper for compact_partition that allows for the compact_partition parameters to be
+ passed in as a custom dictionary-like CompactPartitionParams object.
+ :param compact_partition_params:
+ """
+ return compact_partition(**compact_partition_params)
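
The new compact_partition_from_request wrapper simply unpacks the dictionary-like CompactPartitionParams object (added in the file below) into compact_partition(**params). A sketch of the call pattern with placeholder values, not a tested invocation:

    # Placeholder values only: supply real PartitionLocator objects, a storage
    # implementation, and an S3 bucket you own before making the call.
    from deltacat.compute.compactor.compaction_session import compact_partition_from_request
    from deltacat.compute.compactor.model.compact_partition_params import (
        CompactPartitionParams,
    )

    params = CompactPartitionParams.of(
        {
            "source_partition_locator": None,       # placeholder: a PartitionLocator
            "destination_partition_locator": None,  # placeholder: a PartitionLocator
            "last_stream_position_to_compact": None,
            "primary_keys": {"id"},                 # hypothetical primary key set
            "hash_bucket_count": 8,
            "compaction_artifact_s3_bucket": "my-compaction-bucket",  # hypothetical bucket
        }
    )
    # Equivalent to compact_partition(**params):
    # compact_partition_from_request(params)
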
deltacat/compute/compactor/model/compact_partition_params.py ADDED
@@ -0,0 +1,153 @@
+ from __future__ import annotations
+
+ import copy
+ import json
+ from typing import Any, Dict, List, Optional
+
+ from deltacat.types.media import ContentType
+
+
+ class CompactPartitionParams(dict):
+ """
+ This class represents the parameters passed to compact_partition (deltacat/compute/compactor/compaction_session.py)
+ """
+
+ @staticmethod
+ def of(params: Optional[Dict]) -> CompactPartitionParams:
+ if params is None:
+ params = {}
+ compact_partition_params = CompactPartitionParams()
+ compact_partition_params["destination_partition_locator"] = params.get(
+ "destination_partition_locator"
+ )
+ compact_partition_params["last_stream_position_to_compact"] = params.get(
+ "last_stream_position_to_compact"
+ )
+ compact_partition_params["source_partition_locator"] = params.get(
+ "source_partition_locator"
+ )
+ compact_partition_params["primary_keys"] = params.get("primary_keys")
+ compact_partition_params["rebase_source_partition_locator"] = params.get(
+ "rebase_source_partition_locator"
+ )
+ compact_partition_params["rebase_source_partition_high_watermark"] = params.get(
+ "rebase_source_partition_high_watermark"
+ )
+ compact_partition_params["hash_bucket_count"] = params.get("hash_bucket_count")
+ compact_partition_params["deltacat_storage"] = params.get("deltacat_storage")
+ compact_partition_params["compaction_artifact_s3_bucket"] = params.get(
+ "compaction_artifact_s3_bucket"
+ )
+ compact_partition_params["properties"] = params.get("properties")
+ compact_partition_params["compacted_file_content_type"] = params.get(
+ "compacted_file_content_type"
+ )
+ compact_partition_params["list_deltas_kwargs"] = params.get(
+ "list_deltas_kwargs"
+ )
+ compact_partition_params["pg_config"] = params.get("pg_config")
+ compact_partition_params["read_kwargs_provider"] = params.get(
+ "read_kwargs_provider"
+ )
+ compact_partition_params["s3_table_writer_kwargs"] = params.get(
+ "s3_table_writer_kwargs"
+ )
+ return compact_partition_params
+
+ @property
+ def destination_partition_locator(self) -> Optional[dict]:
+ return self["destination_partition_locator"]
+
+ @property
+ def last_stream_position_to_compact(self) -> Optional[int]:
+ return self["last_stream_position_to_compact"]
+
+ @property
+ def source_partition_locator(self) -> Optional[dict]:
+ return self["source_partition_locator"]
+
+ @property
+ def primary_keys(self) -> Optional[List[str]]:
+ return list(self["primary_keys"])
+
+ @property
+ def rebase_source_partition_locator(self) -> Optional[dict]:
+ return self["rebase_source_partition_locator"]
+
+ @property
+ def rebase_source_partition_high_watermark(self) -> Optional[int]:
+ return self["rebase_source_partition_high_watermark"]
+
+ @property
+ def hash_bucket_count(self) -> Optional[int]:
+ return self["hash_bucket_count"]
+
+ @property
+ def deltacat_storage(self) -> Optional[str]:
+ return self["deltacat_storage"]
+
+ @property
+ def compaction_artifact_s3_bucket(self) -> Optional[str]:
+ return self["compaction_artifact_s3_bucket"]
+
+ @property
+ def properties(self) -> Optional[Dict[str, str]]:
+ return self["properties"]
+
+ @property
+ def compacted_file_content_type(self) -> Optional[ContentType]:
+ return self["compacted_file_content_type"]
+
+ @property
+ def list_deltas_kwargs(self) -> Optional[dict]:
+ return self["list_deltas_kwargs"]
+
+ @property
+ def pg_config(self) -> Optional[Any]:
+ return self["pg_config"]
+
+ @property
+ def read_kwargs_provider(self) -> Optional[Any]:
+ return self["read_kwargs_provider"]
+
+ @property
+ def s3_table_writer_kwargs(self) -> Optional[Any]:
+ return self["s3_table_writer_kwargs"]
+
+ @staticmethod
+ def json_handler_for_compact_partition_params(obj):
+ """
+ A handler for the `json.dumps()` function that can be used to serialize sets to JSON.
+ If the `set_default()` handler is passed as the `default` argument to the `json.dumps()` function, it will be called whenever a set object is encountered.
+ The `set_default()` handler will then serialize the set as a list.
+ """
+ try:
+ if isinstance(obj, set):
+ return list(obj)
+ elif hasattr(obj, "toJSON"):
+ return obj.toJSON()
+ else:
+ return obj.__dict__
+ except Exception:
+ return obj.__class__.__name__
+
+ def serialize(self) -> str:
+ """
+ Serializes itself to a json-formatted string
+
+ Returns:
+ The serialized object.
+
+ """
+ to_serialize: Dict[str, Any] = {}
+ # individually try deepcopy the values from the self dictionary and just use the class name for the value when it is not possible to deepcopy
+ for attr, value in self.items():
+ try:
+ to_serialize[attr] = copy.deepcopy(value)
+ except Exception:  # if unable to deep copy the objects like module objects for example then just provide the class name at minimum
+ to_serialize[attr] = value.__class__.__name__
+ serialized_arguments_compact_partition_args: str = json.dumps(
+ to_serialize,
+ default=CompactPartitionParams.json_handler_for_compact_partition_params,
+ )
+ return serialized_arguments_compact_partition_args
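
Because CompactPartitionParams is a plain dict subclass, it can be built from any subset of the keys read by of(); unspecified keys default to None, and serialize() emits JSON with sets converted to lists via the handler above. A small usage sketch with placeholder values:

    from deltacat.compute.compactor.model.compact_partition_params import (
        CompactPartitionParams,
    )

    params = CompactPartitionParams.of(
        {
            "primary_keys": {"id"},          # stored as a set, read back as a list
            "hash_bucket_count": 8,
            "compaction_artifact_s3_bucket": "my-compaction-bucket",  # hypothetical bucket
        }
    )
    print(params.primary_keys)       # ["id"]
    print(params.hash_bucket_count)  # 8
    print(params.serialize())        # JSON string; the primary key set is emitted as a list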