deltacat-1.0.2-py3-none-any.whl → deltacat-1.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47):
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +25 -0
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
  4. deltacat/compute/compactor/model/delta_file_envelope.py +21 -3
  5. deltacat/compute/compactor/model/table_object_store.py +51 -0
  6. deltacat/compute/compactor/utils/io.py +1 -1
  7. deltacat/compute/compactor_v2/compaction_session.py +80 -14
  8. deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  9. deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +83 -0
  10. deltacat/compute/compactor_v2/deletes/delete_strategy.py +82 -0
  11. deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +161 -0
  12. deltacat/compute/compactor_v2/deletes/model.py +23 -0
  13. deltacat/compute/compactor_v2/deletes/utils.py +164 -0
  14. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  15. deltacat/compute/compactor_v2/model/merge_input.py +24 -1
  16. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  17. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -6
  18. deltacat/compute/compactor_v2/steps/merge.py +221 -50
  19. deltacat/compute/compactor_v2/utils/delta.py +11 -1
  20. deltacat/compute/compactor_v2/utils/merge.py +10 -0
  21. deltacat/compute/compactor_v2/utils/task_options.py +94 -8
  22. deltacat/io/memcached_object_store.py +20 -0
  23. deltacat/io/ray_plasma_object_store.py +6 -0
  24. deltacat/logs.py +29 -2
  25. deltacat/storage/__init__.py +3 -0
  26. deltacat/storage/interface.py +2 -0
  27. deltacat/storage/model/delete_parameters.py +40 -0
  28. deltacat/storage/model/delta.py +25 -1
  29. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +1930 -0
  30. deltacat/tests/compute/compact_partition_test_cases.py +16 -822
  31. deltacat/tests/compute/compactor/utils/test_io.py +4 -4
  32. deltacat/tests/compute/test_compact_partition_incremental.py +4 -0
  33. deltacat/tests/compute/test_compact_partition_params.py +5 -0
  34. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +32 -20
  35. deltacat/tests/compute/test_util_create_table_deltas_repo.py +28 -10
  36. deltacat/tests/io/test_memcached_object_store.py +19 -0
  37. deltacat/tests/local_deltacat_storage/__init__.py +3 -0
  38. deltacat/tests/test_utils/constants.py +1 -2
  39. deltacat/tests/test_utils/pyarrow.py +27 -10
  40. deltacat/utils/pandas.py +1 -1
  41. deltacat/utils/ray_utils/runtime.py +3 -3
  42. deltacat/utils/resources.py +7 -5
  43. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/METADATA +1 -1
  44. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/RECORD +47 -38
  45. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/LICENSE +0 -0
  46. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/WHEEL +0 -0
  47. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,4 @@
1
1
  import pyarrow as pa
2
- import string
3
2
  from typing import Callable, Dict, List, Optional, Set, Tuple, Union
4
3
  from deltacat.tests.compute.test_util_common import (
5
4
  offer_iso8601_timestamp_list,
@@ -23,21 +22,15 @@ from deltacat.storage import (
23
22
  from deltacat.compute.compactor_v2.compaction_session import (
24
23
  compact_partition as compact_partition_v2,
25
24
  )
26
- from deltacat.types.media import ContentType
27
25
 
28
26
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
29
27
 
30
28
  from deltacat.storage.model.sort_key import SortKey
31
29
 
32
- from deltacat.utils.pyarrow import (
33
- ReadKwargsProviderPyArrowSchemaOverride,
34
- content_type_to_reader_kwargs,
35
- pyarrow_read_csv,
36
- )
37
-
38
30
  ZERO_VALUED_SORT_KEY, ZERO_VALUED_PARTITION_VALUES_PARAM = [], []
39
31
  ZERO_VALUED_PARTITION_KEYS_PARAM = None
40
32
  ZERO_VALUED_PRIMARY_KEY = {}
33
+ ZERO_VALUED_PROPERTIES = {}
41
34
 
42
35
  EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
43
36
 
@@ -60,9 +53,10 @@ class BaseCompactorTestCase:
60
53
  input_deltas: List[pa.Array] - argument required for delta creation during compact_partition test setup. Actual incoming deltas expressed as a PyArrow array (https://arrow.apache.org/docs/python/generated/pyarrow.array.html)
61
54
  input_deltas_delta_type: DeltaType - enumerated argument required for delta creation during compact_partition test setup. Available values are (DeltaType.APPEND, DeltaType.UPSERT, DeltaType.DELETE). DeltaType.APPEND is not supported by compactor v1 or v2
62
55
  expected_terminal_compact_partition_result: pa.Table - expected PyArrow table after compaction (i.e,. the state of the table after applying all row UPDATES/DELETES/INSERTS)
56
+ expected_terminal_exception: BaseException - expected exception during compaction
63
57
  do_create_placement_group: bool - toggles whether to create a placement group (https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html) or not
64
58
  records_per_compacted_file: int - argument for the records_per_compacted_file parameter in compact_partition
65
- hash_bucket_count_param: int - argument for the hash_bucket_count parameter in compact_partition. Needs to be > 1
59
+ hash_bucket_count_param: int - argument for the hash_bucket_count parameter in compact_partition
66
60
  read_kwargs_provider: Optional[ReadKwargsProvider] - argument for read_kwargs_provider parameter in compact_partition. If None then no ReadKwargsProvider is provided to compact_partition_params
67
61
  drop_duplicates: bool - argument for drop_duplicates parameter in compact_partition. Only recognized by compactor v2.
68
62
  skip_enabled_compact_partition_drivers: List[CompactorVersion] - skip whatever enabled_compact_partition_drivers are included in this list
@@ -75,6 +69,7 @@ class BaseCompactorTestCase:
75
69
  input_deltas: Union[List[pa.Array], pa.Table]
76
70
  input_deltas_delta_type: DeltaType
77
71
  expected_terminal_compact_partition_result: pa.Table
72
+ expected_terminal_exception: BaseException
78
73
  do_create_placement_group: bool
79
74
  records_per_compacted_file: int
80
75
  hash_bucket_count: int
@@ -92,23 +87,6 @@ class IncrementalCompactionTestCaseParams(BaseCompactorTestCase):
92
87
  pass
93
88
 
94
89
 
95
- @dataclass(frozen=True)
96
- class RebaseThenIncrementalCompactionTestCaseParams(BaseCompactorTestCase):
97
- """
98
- A pytest parameterized test case for the `compact_partition` function with rebase and incremental compaction.
99
-
100
- Args:
101
- * (inherited from CompactorTestCase): see CompactorTestCase docstring for details
102
- incremental_deltas: pa.Table - argument required for delta creation during the incremental phase of compact_partition test setup. Incoming deltas during incremental expressed as a pyarrow array
103
- incremental_deltas_delta_type: DeltaType - argument required for delta creation during the incremental phase of compact_partition test setup. Available values are (DeltaType.APPEND, DeltaType.UPSERT, DeltaType.DELETE). DeltaType.APPEND is not supported by compactor v1 or v2
104
- rebase_expected_compact_partition_result: pa.Table - expected table after rebase compaction runs. An output that is asserted on in Rebase then Incremental unit tests
105
- """
106
-
107
- incremental_deltas: Optional[pa.Table]
108
- incremental_deltas_delta_type: DeltaType
109
- rebase_expected_compact_partition_result: pa.Table
110
-
111
-
112
90
  @dataclass(frozen=True)
113
91
  class NoRCFOutputCompactionTestCaseParams(BaseCompactorTestCase):
114
92
  pass
@@ -155,6 +133,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
155
133
  [pa.array([str(i) for i in range(10)])],
156
134
  names=["pk_col_1"],
157
135
  ),
136
+ expected_terminal_exception=None,
158
137
  do_create_placement_group=False,
159
138
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
160
139
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -179,6 +158,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
179
158
  [pa.array([str(i) for i in range(10)]), pa.array(["test"] * 10)],
180
159
  names=["pk_col_1", "sk_col_1"],
181
160
  ),
161
+ expected_terminal_exception=None,
182
162
  do_create_placement_group=False,
183
163
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
184
164
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -212,6 +192,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
212
192
  ],
213
193
  names=["pk_col_1", "sk_col_1", "sk_col_2"],
214
194
  ),
195
+ expected_terminal_exception=None,
215
196
  do_create_placement_group=False,
216
197
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
217
198
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -244,6 +225,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
244
225
  ],
245
226
  names=["pk_col_1", "sk_col_1", "sk_col_2"],
246
227
  ),
228
+ expected_terminal_exception=None,
247
229
  do_create_placement_group=False,
248
230
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
249
231
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -271,6 +253,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
271
253
  ],
272
254
  names=["pk_col_1", "sk_col_1"],
273
255
  ),
256
+ expected_terminal_exception=None,
274
257
  do_create_placement_group=False,
275
258
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
276
259
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -298,6 +281,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
298
281
  ],
299
282
  names=["pk_col_1", "sk_col_1"],
300
283
  ),
284
+ expected_terminal_exception=None,
301
285
  do_create_placement_group=False,
302
286
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
303
287
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -325,6 +309,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
325
309
  ],
326
310
  names=["pk_col_1", "sk_col_1"],
327
311
  ),
312
+ expected_terminal_exception=None,
328
313
  do_create_placement_group=False,
329
314
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
330
315
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -354,6 +339,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
354
339
  ],
355
340
  names=["pk_col_1", "pk_col_2", "sk_col_1"],
356
341
  ),
342
+ expected_terminal_exception=None,
357
343
  do_create_placement_group=False,
358
344
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
359
345
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -381,6 +367,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
381
367
  ],
382
368
  names=["pk_col_1", "sk_col_1"],
383
369
  ),
370
+ expected_terminal_exception=None,
384
371
  do_create_placement_group=False,
385
372
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
386
373
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -408,6 +395,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
408
395
  ],
409
396
  names=["pk_col_1", "sk_col_1"],
410
397
  ),
398
+ expected_terminal_exception=None,
411
399
  do_create_placement_group=False,
412
400
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
413
401
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -435,6 +423,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
435
423
  ],
436
424
  names=["pk_col_1", "sk_col_1"],
437
425
  ),
426
+ expected_terminal_exception=None,
438
427
  do_create_placement_group=False,
439
428
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
440
429
  hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
@@ -462,797 +451,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
462
451
  ],
463
452
  names=["pk_col_1", "sk_col_1"],
464
453
  ),
465
- do_create_placement_group=False,
466
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
467
- hash_bucket_count=1,
468
- read_kwargs_provider=None,
469
- drop_duplicates=True,
470
- skip_enabled_compact_partition_drivers=None,
471
- ),
472
- }
473
-
474
- REBASE_THEN_INCREMENTAL_TEST_CASES = {
475
- "1-rebase-then-incremental-sanity": RebaseThenIncrementalCompactionTestCaseParams(
476
- primary_keys={"pk_col_1"},
477
- sort_keys=[
478
- SortKey.of(key_name="sk_col_1"),
479
- SortKey.of(key_name="sk_col_2"),
480
- ],
481
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
482
- partition_values=["1"],
483
- input_deltas=pa.Table.from_arrays(
484
- [
485
- pa.array([str(i) for i in range(10)]),
486
- pa.array([i for i in range(0, 10)]),
487
- pa.array(["foo"] * 10),
488
- pa.array([i / 10 for i in range(10, 20)]),
489
- ],
490
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
491
- ),
492
- input_deltas_delta_type=DeltaType.UPSERT,
493
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
494
- [
495
- pa.array([str(i) for i in range(10)]),
496
- pa.array([i for i in range(0, 10)]),
497
- pa.array(["foo"] * 10),
498
- pa.array([i / 10 for i in range(10, 20)]),
499
- ],
500
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
501
- ),
502
- incremental_deltas=pa.Table.from_arrays(
503
- [
504
- pa.array([str(i) for i in range(10)]),
505
- pa.array([i for i in range(20, 30)]),
506
- pa.array(["foo"] * 10),
507
- pa.array([i / 10 for i in range(40, 50)]),
508
- ],
509
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
510
- ),
511
- incremental_deltas_delta_type=DeltaType.UPSERT,
512
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
513
- [
514
- pa.array([str(i) for i in range(10)]),
515
- pa.array([i for i in range(20, 30)]),
516
- pa.array(["foo"] * 10),
517
- pa.array([i / 10 for i in range(40, 50)]),
518
- ],
519
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
520
- ),
521
- do_create_placement_group=False,
522
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
523
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
524
- read_kwargs_provider=None,
525
- drop_duplicates=True,
526
- skip_enabled_compact_partition_drivers=None,
527
- ),
528
- "2-rebase-then-incremental-pk-multi": RebaseThenIncrementalCompactionTestCaseParams(
529
- primary_keys={"pk_col_1", "pk_col_2"},
530
- sort_keys=[
531
- SortKey.of(key_name="sk_col_1"),
532
- ],
533
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
534
- partition_values=["1"],
535
- input_deltas=pa.Table.from_arrays(
536
- [
537
- pa.array([str(i % 4) for i in range(10)]),
538
- pa.array([(i % 4) / 10 for i in range(9, -1, -1)]),
539
- pa.array(offer_iso8601_timestamp_list(10, "minutes")),
540
- pa.array([i / 10 for i in range(10, 20)]),
541
- ],
542
- names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
543
- ),
544
- input_deltas_delta_type=DeltaType.UPSERT,
545
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
546
- [
547
- pa.array(["0", "1", "2", "3"]),
548
- pa.array([0.1, 0, 0.3, 0.2]),
549
- pa.array(
550
- [
551
- "2023-05-03T10:00:00Z",
552
- "2023-05-03T09:59:00Z",
553
- "2023-05-03T09:58:00Z",
554
- "2023-05-03T09:57:00Z",
555
- ]
556
- ),
557
- pa.array([1, 1.1, 1.2, 1.3]),
558
- ],
559
- names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
560
- ),
561
- incremental_deltas=pa.Table.from_arrays(
562
- [
563
- pa.array(["0", "1", "2", "3"]),
564
- pa.array([0.1, 0, 0.3, 0.2]),
565
- pa.array(
566
- [
567
- "2023-05-03T10:00:00Z",
568
- "2023-05-03T09:59:00Z",
569
- "2023-05-03T09:58:00Z",
570
- "2023-05-03T09:57:00Z",
571
- ]
572
- ),
573
- pa.array([1, 1.1, 1.2, 1.3]),
574
- ],
575
- names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
576
- ),
577
- incremental_deltas_delta_type=DeltaType.UPSERT,
578
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
579
- [
580
- pa.array(["0", "1", "2", "3"]),
581
- pa.array([0.1, 0, 0.3, 0.2]),
582
- pa.array(
583
- [
584
- "2023-05-03T10:00:00Z",
585
- "2023-05-03T09:59:00Z",
586
- "2023-05-03T09:58:00Z",
587
- "2023-05-03T09:57:00Z",
588
- ]
589
- ),
590
- pa.array([1, 1.1, 1.2, 1.3]),
591
- ],
592
- names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
593
- ),
594
- do_create_placement_group=False,
595
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
596
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
597
- read_kwargs_provider=None,
598
- drop_duplicates=True,
599
- skip_enabled_compact_partition_drivers=None,
600
- ),
601
- "3-rebase-then-incremental-no-sk-no-partition-key": RebaseThenIncrementalCompactionTestCaseParams(
602
- primary_keys={"pk_col_1"},
603
- sort_keys=ZERO_VALUED_SORT_KEY,
604
- partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
605
- partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
606
- input_deltas=pa.Table.from_arrays(
607
- [
608
- pa.array([str(i % 4) for i in range(12)]),
609
- pa.array([i / 10 for i in range(10, 22)]),
610
- ],
611
- names=["pk_col_1", "col_1"],
612
- ),
613
- input_deltas_delta_type=DeltaType.UPSERT,
614
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
615
- [
616
- pa.array(["0", "1", "2", "3"]),
617
- pa.array([1.8, 1.9, 2.0, 2.1]),
618
- ],
619
- names=["pk_col_1", "col_1"],
620
- ),
621
- incremental_deltas=pa.Table.from_arrays(
622
- [
623
- pa.array(["0", "1", "2", "3"]),
624
- pa.array([18.0, 19.0, 20.0, 21.0]),
625
- ],
626
- names=["pk_col_1", "col_1"],
627
- ),
628
- incremental_deltas_delta_type=DeltaType.UPSERT,
629
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
630
- [
631
- pa.array(["0", "1", "2", "3"]),
632
- pa.array([18.0, 19.0, 20.0, 21.0]),
633
- ],
634
- names=["pk_col_1", "col_1"],
635
- ),
636
- do_create_placement_group=False,
637
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
638
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
639
- read_kwargs_provider=None,
640
- drop_duplicates=True,
641
- skip_enabled_compact_partition_drivers=None,
642
- ),
643
- "4-rebase-then-incremental-partial-deltas-on-incremental-deltas": RebaseThenIncrementalCompactionTestCaseParams(
644
- primary_keys={"pk_col_1"},
645
- sort_keys=ZERO_VALUED_SORT_KEY,
646
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
647
- partition_values=["1"],
648
- input_deltas=pa.Table.from_arrays(
649
- [
650
- pa.array([str(i) for i in range(10)]),
651
- pa.array([i / 10 for i in range(10)]),
652
- ],
653
- names=["pk_col_1", "col_1"],
654
- ),
655
- input_deltas_delta_type=DeltaType.UPSERT,
656
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
657
- [
658
- pa.array([str(i) for i in range(10)]),
659
- pa.array([i / 10 for i in range(10)]),
660
- ],
661
- names=["pk_col_1", "col_1"],
662
- ),
663
- incremental_deltas=pa.Table.from_arrays(
664
- [
665
- pa.array(["8", "9"]),
666
- pa.array([200.0, 100.0]),
667
- ],
668
- names=["pk_col_1", "col_1"],
669
- ),
670
- incremental_deltas_delta_type=DeltaType.UPSERT,
671
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
672
- [
673
- pa.array([str(i) for i in range(10)]),
674
- pa.array([i / 10 for i in range(8)] + [200.0] + [100.0]),
675
- ],
676
- names=["pk_col_1", "col_1"],
677
- ),
678
- do_create_placement_group=False,
679
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
680
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
681
- read_kwargs_provider=None,
682
- drop_duplicates=True,
683
- skip_enabled_compact_partition_drivers=None,
684
- ),
685
- "5-rebase-then-incremental-partial-deltas-on-incremental-deltas-2": RebaseThenIncrementalCompactionTestCaseParams(
686
- primary_keys={"pk_col_1"},
687
- sort_keys=[
688
- SortKey.of(key_name="sk_col_1"),
689
- ],
690
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
691
- partition_values=["1"],
692
- input_deltas=pa.Table.from_arrays(
693
- [
694
- pa.array([i % 4 for i in range(12)]),
695
- pa.array([(i / 10 * 10) % 4 for i in range(12)][::-1]),
696
- pa.array(list(string.ascii_lowercase)[:12]),
697
- ],
698
- names=["pk_col_1", "sk_col_1", "col_1"],
699
- ),
700
- input_deltas_delta_type=DeltaType.UPSERT,
701
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
702
- [
703
- pa.array([0, 1, 2, 3]),
704
- pa.array([3.0, 2.0, 1.0, 0.0]),
705
- pa.array(["i", "j", "k", "l"]),
706
- ],
707
- names=["pk_col_1", "sk_col_1", "col_1"],
708
- ),
709
- incremental_deltas=pa.Table.from_arrays(
710
- [
711
- pa.array([1, 4]),
712
- pa.array([4.0, 2.0]),
713
- pa.array(["a", "b"]),
714
- ],
715
- names=["pk_col_1", "sk_col_1", "col_1"],
716
- ),
717
- incremental_deltas_delta_type=DeltaType.UPSERT,
718
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
719
- [
720
- pa.array([0, 1, 2, 3, 4]),
721
- pa.array([3.0, 4.0, 1.0, 0.0, 2.0]),
722
- pa.array(["i", "a", "k", "l", "b"]),
723
- ],
724
- names=["pk_col_1", "sk_col_1", "col_1"],
725
- ),
726
- do_create_placement_group=False,
727
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
728
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
729
- read_kwargs_provider=None,
730
- drop_duplicates=True,
731
- skip_enabled_compact_partition_drivers=None,
732
- ),
733
- "6-rebase-then-incremental-hash-bucket-GT-records-per-compacted-file-v2-only": RebaseThenIncrementalCompactionTestCaseParams(
734
- primary_keys={"pk_col_1"},
735
- sort_keys=[
736
- SortKey.of(key_name="sk_col_1"),
737
- SortKey.of(key_name="sk_col_2"),
738
- ],
739
- partition_keys=[PartitionKey.of("day", PartitionKeyType.TIMESTAMP)],
740
- partition_values=["2022-01-01T00:00:00.000Z"],
741
- input_deltas=pa.Table.from_arrays(
742
- [
743
- pa.array([str(i) for i in range(12)]),
744
- pa.array([i for i in range(0, 12)]),
745
- pa.array(["foo"] * 12),
746
- pa.array([i / 10 for i in range(10, 22)]),
747
- ],
748
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
749
- ),
750
- input_deltas_delta_type=DeltaType.UPSERT,
751
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
752
- [
753
- pa.array([str(i) for i in range(12)]),
754
- pa.array([i for i in range(0, 12)]),
755
- pa.array(["foo"] * 12),
756
- pa.array([i / 10 for i in range(10, 22)]),
757
- ],
758
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
759
- ),
760
- incremental_deltas=pa.Table.from_arrays(
761
- [
762
- pa.array([str(i) for i in range(12)]),
763
- pa.array([i for i in range(20, 32)]),
764
- pa.array(["foo"] * 12),
765
- pa.array([i / 10 for i in range(40, 52)]),
766
- ],
767
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
768
- ),
769
- incremental_deltas_delta_type=DeltaType.UPSERT,
770
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
771
- [
772
- pa.array([str(i) for i in range(12)]),
773
- pa.array([i for i in range(20, 32)]),
774
- pa.array(["foo"] * 12),
775
- pa.array([i / 10 for i in range(40, 52)]),
776
- ],
777
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
778
- ),
779
- do_create_placement_group=False,
780
- records_per_compacted_file=10,
781
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT + 10,
782
- read_kwargs_provider=None,
783
- drop_duplicates=True,
784
- skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
785
- ),
786
- "7-rebase-then-incremental-no-pk-compactor-v2-only": RebaseThenIncrementalCompactionTestCaseParams(
787
- primary_keys=ZERO_VALUED_PRIMARY_KEY,
788
- sort_keys=[
789
- SortKey.of(key_name="sk_col_1"),
790
- ],
791
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
792
- partition_values=["1"],
793
- input_deltas=pa.Table.from_arrays(
794
- [
795
- pa.array([1, 2, 3]),
796
- pa.array([1.0, 2.0, 3.0]),
797
- ],
798
- names=["sk_col_1", "col_1"],
799
- ),
800
- input_deltas_delta_type=DeltaType.UPSERT,
801
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
802
- [
803
- pa.array([1, 1, 2, 2, 3, 3]),
804
- pa.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0]),
805
- ],
806
- names=["sk_col_1", "col_1"],
807
- ),
808
- incremental_deltas=pa.Table.from_arrays(
809
- [
810
- pa.array([4, 5, 6]),
811
- pa.array([10.0, 11.0, 12.0]),
812
- ],
813
- names=["sk_col_1", "col_1"],
814
- ),
815
- incremental_deltas_delta_type=DeltaType.UPSERT,
816
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
817
- [
818
- pa.array([1, 1, 2, 2, 3, 3, 4, 5, 6]),
819
- pa.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 10.0, 11.0, 12.0]),
820
- ],
821
- names=["sk_col_1", "col_1"],
822
- ),
823
- do_create_placement_group=False,
824
- records_per_compacted_file=10,
825
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
826
- read_kwargs_provider=None,
827
- drop_duplicates=True,
828
- skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
829
- ),
830
- "8-rebase-then-incremental-delete-type-delta-on-incremental": RebaseThenIncrementalCompactionTestCaseParams(
831
- primary_keys={"pk_col_1"},
832
- sort_keys=ZERO_VALUED_SORT_KEY,
833
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
834
- partition_values=["1"],
835
- input_deltas=pa.Table.from_arrays(
836
- [
837
- pa.array([i for i in range(12)]),
838
- pa.array([str(i) for i in range(0, 12)]),
839
- ],
840
- names=["pk_col_1", "col_1"],
841
- ),
842
- input_deltas_delta_type=DeltaType.UPSERT,
843
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
844
- [
845
- pa.array([i for i in range(12)]),
846
- pa.array([str(i) for i in range(0, 12)]),
847
- ],
848
- names=["pk_col_1", "col_1"],
849
- ),
850
- incremental_deltas=pa.Table.from_arrays(
851
- [ # delete last two primary keys
852
- pa.array([10, 11]),
853
- pa.array(["", ""]),
854
- ],
855
- names=["pk_col_1", "col_1"],
856
- ),
857
- incremental_deltas_delta_type=DeltaType.DELETE,
858
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
859
- [
860
- pa.array([i for i in range(10)]),
861
- pa.array([str(i) for i in range(0, 10)]),
862
- ],
863
- names=["pk_col_1", "col_1"],
864
- ),
865
- do_create_placement_group=False,
866
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
867
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
868
- read_kwargs_provider=None,
869
- drop_duplicates=True,
870
- skip_enabled_compact_partition_drivers=None,
871
- ),
872
- "9-rebase-then-incremental-delete-type-delta-on-incremental-multi-pk": RebaseThenIncrementalCompactionTestCaseParams(
873
- primary_keys={"pk_col_1", "pk_col_2"},
874
- sort_keys=ZERO_VALUED_SORT_KEY,
875
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
876
- partition_values=["2022-01-01T00:00:00.000Z"],
877
- input_deltas=pa.Table.from_arrays(
878
- [
879
- pa.array([(i % 4) for i in range(12)]),
880
- pa.array([float(i % 4) for i in range(12, 0, -1)]),
881
- pa.array([str(i) for i in range(0, 12)]),
882
- ],
883
- names=["pk_col_1", "pk_col_2", "col_1"],
884
- ),
885
- input_deltas_delta_type=DeltaType.UPSERT,
886
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
887
- [
888
- pa.array([0, 1, 2, 3]),
889
- pa.array([0.0, 3.0, 2.0, 1.0]),
890
- pa.array(["8", "9", "10", "11"]),
891
- ],
892
- names=["pk_col_1", "pk_col_2", "col_1"],
893
- ),
894
- incremental_deltas=pa.Table.from_arrays(
895
- [ # delete last two primary keys
896
- pa.array([2, 3]),
897
- pa.array([2.0, 1.0]),
898
- pa.array(["", ""]),
899
- ],
900
- names=["pk_col_1", "pk_col_2", "col_1"],
901
- ),
902
- incremental_deltas_delta_type=DeltaType.DELETE,
903
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
904
- [
905
- pa.array([0, 1]),
906
- pa.array([0.0, 3.0]),
907
- pa.array(["8", "9"]),
908
- ],
909
- names=["pk_col_1", "pk_col_2", "col_1"],
910
- ),
911
- do_create_placement_group=False,
912
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
913
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
914
- read_kwargs_provider=None,
915
- drop_duplicates=True,
916
- skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
917
- ),
918
- "10-rebase-then-incremental-delete-type-delta-on-incremental-multi-pk-delete-all": RebaseThenIncrementalCompactionTestCaseParams(
919
- primary_keys={"pk_col_1", "pk_col_2"},
920
- sort_keys=ZERO_VALUED_SORT_KEY,
921
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
922
- partition_values=["2022-01-01T00:00:00.000Z"],
923
- input_deltas=pa.Table.from_arrays(
924
- [
925
- pa.array([(i % 4) for i in range(12)]),
926
- pa.array([float(i % 4) for i in range(12, 0, -1)]),
927
- pa.array([str(i) for i in range(0, 12)]),
928
- ],
929
- names=["pk_col_1", "pk_col_2", "col_1"],
930
- ),
931
- input_deltas_delta_type=DeltaType.UPSERT,
932
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
933
- [
934
- pa.array([0, 1, 2, 3]),
935
- pa.array([0.0, 3.0, 2.0, 1.0]),
936
- pa.array(["8", "9", "10", "11"]),
937
- ],
938
- names=["pk_col_1", "pk_col_2", "col_1"],
939
- ),
940
- incremental_deltas=pa.Table.from_arrays(
941
- [ # delete last two primary keys
942
- pa.array([0, 1, 2, 3]),
943
- pa.array([0.0, 3.0, 2.0, 1.0]),
944
- pa.array(["8", "9", "10", "11"]),
945
- ],
946
- names=["pk_col_1", "pk_col_2", "col_1"],
947
- ),
948
- incremental_deltas_delta_type=DeltaType.DELETE,
949
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
950
- [
951
- pa.array([]),
952
- pa.array([]),
953
- pa.array([]),
954
- ],
955
- schema=pa.schema(
956
- [
957
- ("pk_col_1", pa.int64()),
958
- ("pk_col_2", pa.float64()),
959
- ("col_1", pa.string()),
960
- ]
961
- ),
962
- ),
963
- do_create_placement_group=False,
964
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
965
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
966
- read_kwargs_provider=None,
967
- drop_duplicates=True,
968
- skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
969
- ),
970
- "11-rebase-then-incremental-empty-csv-delta-case": RebaseThenIncrementalCompactionTestCaseParams(
971
- primary_keys={"pk_col_1"},
972
- sort_keys=ZERO_VALUED_SORT_KEY,
973
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
974
- partition_values=["1"],
975
- input_deltas=pa.Table.from_arrays(
976
- [
977
- pa.array([str(i) for i in range(10)]),
978
- pa.array([i / 10 for i in range(10, 20)]),
979
- ],
980
- names=["pk_col_1", "col_1"],
981
- ),
982
- input_deltas_delta_type=DeltaType.UPSERT,
983
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
984
- [
985
- pa.array([str(i) for i in range(10)]),
986
- pa.array([i / 10 for i in range(10, 20)]),
987
- ],
988
- names=["pk_col_1", "col_1"],
989
- ),
990
- incremental_deltas=pyarrow_read_csv(
991
- EMPTY_UTSV_PATH,
992
- **ReadKwargsProviderPyArrowSchemaOverride(
993
- schema=pa.schema(
994
- [
995
- ("pk_col_1", pa.string()),
996
- ("col_1", pa.float64()),
997
- ]
998
- )
999
- )(
1000
- ContentType.UNESCAPED_TSV.value,
1001
- content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value),
1002
- ),
1003
- ),
1004
- incremental_deltas_delta_type=DeltaType.UPSERT,
1005
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
1006
- [
1007
- pa.array(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]),
1008
- pa.array([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
1009
- ],
1010
- names=["pk_col_1", "col_1"],
1011
- ),
1012
- do_create_placement_group=False,
1013
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1014
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1015
- read_kwargs_provider=None,
1016
- drop_duplicates=True,
1017
- skip_enabled_compact_partition_drivers=None,
1018
- ),
1019
- "12-rebase-then-incremental-single-hash-bucket": RebaseThenIncrementalCompactionTestCaseParams(
1020
- primary_keys={"pk_col_1"},
1021
- sort_keys=[
1022
- SortKey.of(key_name="sk_col_1"),
1023
- SortKey.of(key_name="sk_col_2"),
1024
- ],
1025
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1026
- partition_values=["1"],
1027
- input_deltas=pa.Table.from_arrays(
1028
- [
1029
- pa.array([str(i) for i in range(10)]),
1030
- pa.array([i for i in range(0, 10)]),
1031
- pa.array(["foo"] * 10),
1032
- pa.array([i / 10 for i in range(10, 20)]),
1033
- ],
1034
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1035
- ),
1036
- input_deltas_delta_type=DeltaType.UPSERT,
1037
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
1038
- [
1039
- pa.array([str(i) for i in range(10)]),
1040
- pa.array([i for i in range(0, 10)]),
1041
- pa.array(["foo"] * 10),
1042
- pa.array([i / 10 for i in range(10, 20)]),
1043
- ],
1044
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1045
- ),
1046
- incremental_deltas=pa.Table.from_arrays(
1047
- [
1048
- pa.array([str(i) for i in range(10)]),
1049
- pa.array([i for i in range(20, 30)]),
1050
- pa.array(["foo"] * 10),
1051
- pa.array([i / 10 for i in range(40, 50)]),
1052
- ],
1053
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1054
- ),
1055
- incremental_deltas_delta_type=DeltaType.UPSERT,
1056
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
1057
- [
1058
- pa.array([str(i) for i in range(10)]),
1059
- pa.array([i for i in range(20, 30)]),
1060
- pa.array(["foo"] * 10),
1061
- pa.array([i / 10 for i in range(40, 50)]),
1062
- ],
1063
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1064
- ),
1065
- do_create_placement_group=False,
1066
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1067
- hash_bucket_count=1,
1068
- read_kwargs_provider=None,
1069
- drop_duplicates=True,
1070
- skip_enabled_compact_partition_drivers=None,
1071
- ),
1072
- "13-rebase-then-incremental-drop-duplicates-false-on-incremental-v2-only": RebaseThenIncrementalCompactionTestCaseParams(
1073
- primary_keys={"pk_col_1"},
1074
- sort_keys=[
1075
- SortKey.of(key_name="sk_col_1"),
1076
- ],
1077
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1078
- partition_values=["1"],
1079
- input_deltas=pa.Table.from_arrays(
1080
- [
1081
- pa.array([(i % 4) for i in range(8)]),
1082
- pa.array([(i % 2) for i in range(8)]),
1083
- pa.array([i / 10 for i in range(10, 18)]),
1084
- ],
1085
- names=["pk_col_1", "sk_col_1", "col_1"],
1086
- ),
1087
- input_deltas_delta_type=DeltaType.UPSERT,
1088
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
1089
- [
1090
- pa.array([0, 1, 2, 3]),
1091
- pa.array([0, 1, 0, 1]),
1092
- pa.array([1.4, 1.5, 1.6, 1.7]),
1093
- ],
1094
- names=["pk_col_1", "sk_col_1", "col_1"],
1095
- ),
1096
- incremental_deltas=pa.Table.from_arrays(
1097
- [
1098
- pa.array([0, 1, 2, 3, 1]),
1099
- pa.array([0, 1, 0, 1, 0]),
1100
- pa.array([i / 10 for i in range(20, 25)]),
1101
- ],
1102
- names=["pk_col_1", "sk_col_1", "col_1"],
1103
- ),
1104
- incremental_deltas_delta_type=DeltaType.UPSERT,
1105
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
1106
- [
1107
- pa.array([0, 0, 1, 1, 1, 2, 2, 3, 3]),
1108
- pa.array([0, 0, 1, 0, 1, 0, 0, 1, 1]),
1109
- pa.array([1.4, 2, 1.5, 2.4, 2.1, 1.6, 2.2, 1.7, 2.3]),
1110
- ],
1111
- names=["pk_col_1", "sk_col_1", "col_1"],
1112
- ),
1113
- do_create_placement_group=False,
1114
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1115
- hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1116
- read_kwargs_provider=None,
1117
- drop_duplicates=False,
1118
- skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1119
- ),
1120
- "14-rebase-then-empty-incremental-delta": RebaseThenIncrementalCompactionTestCaseParams(
1121
- primary_keys={"pk_col_1"},
1122
- sort_keys=[
1123
- SortKey.of(key_name="sk_col_1"),
1124
- SortKey.of(key_name="sk_col_2"),
1125
- ],
1126
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1127
- partition_values=["1"],
1128
- input_deltas=pa.Table.from_arrays(
1129
- [
1130
- pa.array([str(i) for i in range(10)]),
1131
- pa.array([i for i in range(0, 10)]),
1132
- pa.array(["foo"] * 10),
1133
- pa.array([i / 10 for i in range(10, 20)]),
1134
- ],
1135
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1136
- ),
1137
- input_deltas_delta_type=DeltaType.UPSERT,
1138
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
1139
- [
1140
- pa.array([str(i) for i in range(10)]),
1141
- pa.array([i for i in range(0, 10)]),
1142
- pa.array(["foo"] * 10),
1143
- pa.array([i / 10 for i in range(10, 20)]),
1144
- ],
1145
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1146
- ),
1147
- incremental_deltas=None,
1148
- incremental_deltas_delta_type=DeltaType.UPSERT,
1149
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
1150
- [
1151
- pa.array([str(i) for i in range(10)]),
1152
- pa.array([i for i in range(0, 10)]),
1153
- pa.array(["foo"] * 10),
1154
- pa.array([i / 10 for i in range(10, 20)]),
1155
- ],
1156
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1157
- ),
1158
- do_create_placement_group=False,
1159
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1160
- hash_bucket_count=3,
1161
- read_kwargs_provider=None,
1162
- drop_duplicates=True,
1163
- skip_enabled_compact_partition_drivers=None,
1164
- ),
1165
- "15-rebase-then-incremental-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
1166
- primary_keys={"pk_col_1"},
1167
- sort_keys=[
1168
- SortKey.of(key_name="sk_col_1"),
1169
- SortKey.of(key_name="sk_col_2"),
1170
- ],
1171
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1172
- partition_values=["1"],
1173
- input_deltas=pa.Table.from_arrays(
1174
- [
1175
- pa.array([str(i) for i in range(10)]),
1176
- pa.array([i for i in range(0, 10)]),
1177
- pa.array(["foo"] * 10),
1178
- pa.array([i / 10 for i in range(10, 20)]),
1179
- ],
1180
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1181
- ),
1182
- input_deltas_delta_type=DeltaType.UPSERT,
1183
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
1184
- [
1185
- pa.array([str(i) for i in range(10)]),
1186
- pa.array([i for i in range(0, 10)]),
1187
- pa.array(["foo"] * 10),
1188
- pa.array([i / 10 for i in range(10, 20)]),
1189
- ],
1190
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1191
- ),
1192
- incremental_deltas=pa.Table.from_arrays(
1193
- [
1194
- pa.array([str(i) for i in range(10)]),
1195
- pa.array([i for i in range(20, 30)]),
1196
- pa.array(["foo"] * 10),
1197
- pa.array([i / 10 for i in range(40, 50)]),
1198
- ],
1199
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1200
- ),
1201
- incremental_deltas_delta_type=DeltaType.UPSERT,
1202
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
1203
- [
1204
- pa.array([str(i) for i in range(10)]),
1205
- pa.array([i for i in range(20, 30)]),
1206
- pa.array(["foo"] * 10),
1207
- pa.array([i / 10 for i in range(40, 50)]),
1208
- ],
1209
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1210
- ),
1211
- do_create_placement_group=False,
1212
- records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1213
- hash_bucket_count=1,
1214
- read_kwargs_provider=None,
1215
- drop_duplicates=True,
1216
- skip_enabled_compact_partition_drivers=None,
1217
- ),
1218
- "16-rebase-then-empty-incremental-delta-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
1219
- primary_keys={"pk_col_1"},
1220
- sort_keys=[
1221
- SortKey.of(key_name="sk_col_1"),
1222
- SortKey.of(key_name="sk_col_2"),
1223
- ],
1224
- partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1225
- partition_values=["1"],
1226
- input_deltas=pa.Table.from_arrays(
1227
- [
1228
- pa.array([str(i) for i in range(10)]),
1229
- pa.array([i for i in range(0, 10)]),
1230
- pa.array(["foo"] * 10),
1231
- pa.array([i / 10 for i in range(10, 20)]),
1232
- ],
1233
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1234
- ),
1235
- input_deltas_delta_type=DeltaType.UPSERT,
1236
- rebase_expected_compact_partition_result=pa.Table.from_arrays(
1237
- [
1238
- pa.array([str(i) for i in range(10)]),
1239
- pa.array([i for i in range(0, 10)]),
1240
- pa.array(["foo"] * 10),
1241
- pa.array([i / 10 for i in range(10, 20)]),
1242
- ],
1243
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1244
- ),
1245
- incremental_deltas=None,
1246
- incremental_deltas_delta_type=DeltaType.UPSERT,
1247
- expected_terminal_compact_partition_result=pa.Table.from_arrays(
1248
- [
1249
- pa.array([str(i) for i in range(10)]),
1250
- pa.array([i for i in range(0, 10)]),
1251
- pa.array(["foo"] * 10),
1252
- pa.array([i / 10 for i in range(10, 20)]),
1253
- ],
1254
- names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1255
- ),
454
+ expected_terminal_exception=None,
1256
455
  do_create_placement_group=False,
1257
456
  records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1258
457
  hash_bucket_count=1,
@@ -1263,8 +462,3 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
1263
462
  }
1264
463
 
1265
464
  INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
1266
-
1267
-
1268
- REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
1269
- REBASE_THEN_INCREMENTAL_TEST_CASES
1270
- )