deltacat 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +25 -0
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
  4. deltacat/compute/compactor/model/delta_file_envelope.py +21 -3
  5. deltacat/compute/compactor/model/table_object_store.py +51 -0
  6. deltacat/compute/compactor/utils/io.py +1 -1
  7. deltacat/compute/compactor_v2/compaction_session.py +80 -14
  8. deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  9. deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +83 -0
  10. deltacat/compute/compactor_v2/deletes/delete_strategy.py +82 -0
  11. deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +161 -0
  12. deltacat/compute/compactor_v2/deletes/model.py +23 -0
  13. deltacat/compute/compactor_v2/deletes/utils.py +164 -0
  14. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  15. deltacat/compute/compactor_v2/model/merge_input.py +24 -1
  16. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  17. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -6
  18. deltacat/compute/compactor_v2/steps/merge.py +221 -50
  19. deltacat/compute/compactor_v2/utils/delta.py +11 -1
  20. deltacat/compute/compactor_v2/utils/merge.py +10 -0
  21. deltacat/compute/compactor_v2/utils/task_options.py +94 -8
  22. deltacat/io/memcached_object_store.py +20 -0
  23. deltacat/io/ray_plasma_object_store.py +6 -0
  24. deltacat/logs.py +29 -2
  25. deltacat/storage/__init__.py +3 -0
  26. deltacat/storage/interface.py +2 -0
  27. deltacat/storage/model/delete_parameters.py +40 -0
  28. deltacat/storage/model/delta.py +25 -1
  29. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +1930 -0
  30. deltacat/tests/compute/compact_partition_test_cases.py +16 -822
  31. deltacat/tests/compute/compactor/utils/test_io.py +4 -4
  32. deltacat/tests/compute/test_compact_partition_incremental.py +4 -0
  33. deltacat/tests/compute/test_compact_partition_params.py +5 -0
  34. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +32 -20
  35. deltacat/tests/compute/test_util_create_table_deltas_repo.py +28 -10
  36. deltacat/tests/io/test_memcached_object_store.py +19 -0
  37. deltacat/tests/local_deltacat_storage/__init__.py +3 -0
  38. deltacat/tests/test_utils/constants.py +1 -2
  39. deltacat/tests/test_utils/pyarrow.py +27 -10
  40. deltacat/utils/pandas.py +1 -1
  41. deltacat/utils/ray_utils/runtime.py +3 -3
  42. deltacat/utils/resources.py +7 -5
  43. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/METADATA +1 -1
  44. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/RECORD +47 -38
  45. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/LICENSE +0 -0
  46. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/WHEEL +0 -0
  47. {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1930 @@
1
+ import pyarrow as pa
2
+ from typing import List, Optional, Tuple
3
+ from deltacat.tests.compute.test_util_common import (
4
+ offer_iso8601_timestamp_list,
5
+ PartitionKey,
6
+ PartitionKeyType,
7
+ )
8
+ from deltacat.tests.compute.test_util_constant import (
9
+ DEFAULT_MAX_RECORDS_PER_FILE,
10
+ DEFAULT_HASH_BUCKET_COUNT,
11
+ )
12
+ import string
13
+ from dataclasses import dataclass
14
+
15
+
16
+ from deltacat.storage import (
17
+ DeltaType,
18
+ )
19
+ from deltacat.types.media import ContentType
20
+
21
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
22
+
23
+ from deltacat.storage.model.sort_key import SortKey
24
+
25
+ from deltacat.utils.pyarrow import (
26
+ ReadKwargsProviderPyArrowSchemaOverride,
27
+ content_type_to_reader_kwargs,
28
+ pyarrow_read_csv,
29
+ )
30
+ from deltacat.tests.compute.compact_partition_test_cases import (
31
+ BaseCompactorTestCase,
32
+ with_compactor_version_func_test_param,
33
+ ZERO_VALUED_SORT_KEY,
34
+ ZERO_VALUED_PARTITION_VALUES_PARAM,
35
+ ZERO_VALUED_PARTITION_KEYS_PARAM,
36
+ ZERO_VALUED_PRIMARY_KEY,
37
+ EMPTY_UTSV_PATH,
38
+ )
39
+ from deltacat.storage import DeleteParameters
40
+
41
+
42
+ @dataclass(frozen=True)
43
+ class RebaseThenIncrementalCompactionTestCaseParams(BaseCompactorTestCase):
44
+ """
45
+ A pytest parameterized test case for the `compact_partition` function with rebase and incremental compaction.
46
+
47
+ Args:
48
+ * (inherited from BaseCompactorTestCase): see BaseCompactorTestCase docstring for details
49
+ incremental_deltas: List[Tuple[pa.Table, DeltaType, Optional[DeleteParameters]]] - argument required for delta creation during the incremental phase of compact_partition test setup. Each incoming incremental delta is expressed as a tuple of a pyarrow Table, its DeltaType, and optional DeleteParameters (the table is None in the empty-incremental-delta test cases)
50
+ rebase_expected_compact_partition_result: pa.Table - expected table after rebase compaction runs. An output that is asserted on in Rebase then Incremental unit tests
51
+ """
52
+
53
+ incremental_deltas: List[Tuple[pa.Table, DeltaType, Optional[DeleteParameters]]]
54
+ rebase_expected_compact_partition_result: pa.Table
55
+
56
+
57
+ REBASE_THEN_INCREMENTAL_TEST_CASES = {
58
+ "1-rebase-then-incremental-sanity": RebaseThenIncrementalCompactionTestCaseParams(
59
+ primary_keys={"pk_col_1"},
60
+ sort_keys=[
61
+ SortKey.of(key_name="sk_col_1"),
62
+ SortKey.of(key_name="sk_col_2"),
63
+ ],
64
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
65
+ partition_values=["1"],
66
+ input_deltas=pa.Table.from_arrays(
67
+ [
68
+ pa.array([str(i) for i in range(10)]),
69
+ pa.array([i for i in range(0, 10)]),
70
+ pa.array(["foo"] * 10),
71
+ pa.array([i / 10 for i in range(10, 20)]),
72
+ ],
73
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
74
+ ),
75
+ input_deltas_delta_type=DeltaType.UPSERT,
76
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
77
+ [
78
+ pa.array([str(i) for i in range(10)]),
79
+ pa.array([i for i in range(0, 10)]),
80
+ pa.array(["foo"] * 10),
81
+ pa.array([i / 10 for i in range(10, 20)]),
82
+ ],
83
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
84
+ ),
85
+ incremental_deltas=[
86
+ (
87
+ pa.Table.from_arrays(
88
+ [
89
+ pa.array([str(i) for i in range(10)]),
90
+ pa.array([i for i in range(20, 30)]),
91
+ pa.array(["foo"] * 10),
92
+ pa.array([i / 10 for i in range(40, 50)]),
93
+ ],
94
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
95
+ ),
96
+ DeltaType.UPSERT,
97
+ None,
98
+ )
99
+ ],
100
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
101
+ [
102
+ pa.array([str(i) for i in range(10)]),
103
+ pa.array([i for i in range(20, 30)]),
104
+ pa.array(["foo"] * 10),
105
+ pa.array([i / 10 for i in range(40, 50)]),
106
+ ],
107
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
108
+ ),
109
+ expected_terminal_exception=None,
110
+ do_create_placement_group=False,
111
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
112
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
113
+ read_kwargs_provider=None,
114
+ drop_duplicates=True,
115
+ skip_enabled_compact_partition_drivers=None,
116
+ ),
117
+ "2-rebase-then-incremental-pk-multi": RebaseThenIncrementalCompactionTestCaseParams(
118
+ primary_keys={"pk_col_1", "pk_col_2"},
119
+ sort_keys=[
120
+ SortKey.of(key_name="sk_col_1"),
121
+ ],
122
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
123
+ partition_values=["1"],
124
+ input_deltas=pa.Table.from_arrays(
125
+ [
126
+ pa.array([str(i % 4) for i in range(10)]),
127
+ pa.array([(i % 4) / 10 for i in range(9, -1, -1)]),
128
+ pa.array(offer_iso8601_timestamp_list(10, "minutes")),
129
+ pa.array([i / 10 for i in range(10, 20)]),
130
+ ],
131
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
132
+ ),
133
+ input_deltas_delta_type=DeltaType.UPSERT,
134
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
135
+ [
136
+ pa.array(["0", "1", "2", "3"]),
137
+ pa.array([0.1, 0, 0.3, 0.2]),
138
+ pa.array(
139
+ [
140
+ "2023-05-03T10:00:00Z",
141
+ "2023-05-03T09:59:00Z",
142
+ "2023-05-03T09:58:00Z",
143
+ "2023-05-03T09:57:00Z",
144
+ ]
145
+ ),
146
+ pa.array([1, 1.1, 1.2, 1.3]),
147
+ ],
148
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
149
+ ),
150
+ incremental_deltas=[
151
+ (
152
+ pa.Table.from_arrays(
153
+ [
154
+ pa.array(["0", "1", "2", "3"]),
155
+ pa.array([0.1, 0, 0.3, 0.2]),
156
+ pa.array(
157
+ [
158
+ "2023-05-03T10:00:00Z",
159
+ "2023-05-03T09:59:00Z",
160
+ "2023-05-03T09:58:00Z",
161
+ "2023-05-03T09:57:00Z",
162
+ ]
163
+ ),
164
+ pa.array([1, 1.1, 1.2, 1.3]),
165
+ ],
166
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
167
+ ),
168
+ DeltaType.UPSERT,
169
+ {},
170
+ )
171
+ ],
172
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
173
+ [
174
+ pa.array(["0", "1", "2", "3"]),
175
+ pa.array([0.1, 0, 0.3, 0.2]),
176
+ pa.array(
177
+ [
178
+ "2023-05-03T10:00:00Z",
179
+ "2023-05-03T09:59:00Z",
180
+ "2023-05-03T09:58:00Z",
181
+ "2023-05-03T09:57:00Z",
182
+ ]
183
+ ),
184
+ pa.array([1, 1.1, 1.2, 1.3]),
185
+ ],
186
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
187
+ ),
188
+ expected_terminal_exception=None,
189
+ do_create_placement_group=False,
190
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
191
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
192
+ read_kwargs_provider=None,
193
+ drop_duplicates=True,
194
+ skip_enabled_compact_partition_drivers=None,
195
+ ),
196
+ "3-rebase-then-incremental-no-sk-no-partition-key": RebaseThenIncrementalCompactionTestCaseParams(
197
+ primary_keys={"pk_col_1"},
198
+ sort_keys=ZERO_VALUED_SORT_KEY,
199
+ partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
200
+ partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
201
+ input_deltas=pa.Table.from_arrays(
202
+ [
203
+ pa.array([str(i % 4) for i in range(12)]),
204
+ pa.array([i / 10 for i in range(10, 22)]),
205
+ ],
206
+ names=["pk_col_1", "col_1"],
207
+ ),
208
+ input_deltas_delta_type=DeltaType.UPSERT,
209
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
210
+ [
211
+ pa.array(["0", "1", "2", "3"]),
212
+ pa.array([1.8, 1.9, 2.0, 2.1]),
213
+ ],
214
+ names=["pk_col_1", "col_1"],
215
+ ),
216
+ incremental_deltas=[
217
+ (
218
+ pa.Table.from_arrays(
219
+ [
220
+ pa.array(["0", "1", "2", "3"]),
221
+ pa.array([18.0, 19.0, 20.0, 21.0]),
222
+ ],
223
+ names=["pk_col_1", "col_1"],
224
+ ),
225
+ DeltaType.UPSERT,
226
+ None,
227
+ )
228
+ ],
229
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
230
+ [
231
+ pa.array(["0", "1", "2", "3"]),
232
+ pa.array([18.0, 19.0, 20.0, 21.0]),
233
+ ],
234
+ names=["pk_col_1", "col_1"],
235
+ ),
236
+ expected_terminal_exception=None,
237
+ do_create_placement_group=False,
238
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
239
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
240
+ read_kwargs_provider=None,
241
+ drop_duplicates=True,
242
+ skip_enabled_compact_partition_drivers=None,
243
+ ),
244
+ "4-rebase-then-incremental-partial-deltas-on-incremental-deltas": RebaseThenIncrementalCompactionTestCaseParams(
245
+ primary_keys={"pk_col_1"},
246
+ sort_keys=ZERO_VALUED_SORT_KEY,
247
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
248
+ partition_values=["1"],
249
+ input_deltas=pa.Table.from_arrays(
250
+ [
251
+ pa.array([str(i) for i in range(10)]),
252
+ pa.array([i / 10 for i in range(10)]),
253
+ ],
254
+ names=["pk_col_1", "col_1"],
255
+ ),
256
+ input_deltas_delta_type=DeltaType.UPSERT,
257
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
258
+ [
259
+ pa.array([str(i) for i in range(10)]),
260
+ pa.array([i / 10 for i in range(10)]),
261
+ ],
262
+ names=["pk_col_1", "col_1"],
263
+ ),
264
+ incremental_deltas=[
265
+ (
266
+ pa.Table.from_arrays(
267
+ [
268
+ pa.array(["8", "9"]),
269
+ pa.array([200.0, 100.0]),
270
+ ],
271
+ names=["pk_col_1", "col_1"],
272
+ ),
273
+ DeltaType.UPSERT,
274
+ None,
275
+ )
276
+ ],
277
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
278
+ [
279
+ pa.array([str(i) for i in range(10)]),
280
+ pa.array([i / 10 for i in range(8)] + [200.0] + [100.0]),
281
+ ],
282
+ names=["pk_col_1", "col_1"],
283
+ ),
284
+ expected_terminal_exception=None,
285
+ do_create_placement_group=False,
286
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
287
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
288
+ read_kwargs_provider=None,
289
+ drop_duplicates=True,
290
+ skip_enabled_compact_partition_drivers=None,
291
+ ),
292
+ "5-rebase-then-incremental-partial-deltas-on-incremental-deltas-2": RebaseThenIncrementalCompactionTestCaseParams(
293
+ primary_keys={"pk_col_1"},
294
+ sort_keys=[
295
+ SortKey.of(key_name="sk_col_1"),
296
+ ],
297
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
298
+ partition_values=["1"],
299
+ input_deltas=pa.Table.from_arrays(
300
+ [
301
+ pa.array([i % 4 for i in range(12)]),
302
+ pa.array([(i / 10 * 10) % 4 for i in range(12)][::-1]),
303
+ pa.array(list(string.ascii_lowercase)[:12]),
304
+ ],
305
+ names=["pk_col_1", "sk_col_1", "col_1"],
306
+ ),
307
+ input_deltas_delta_type=DeltaType.UPSERT,
308
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
309
+ [
310
+ pa.array([0, 1, 2, 3]),
311
+ pa.array([3.0, 2.0, 1.0, 0.0]),
312
+ pa.array(["i", "j", "k", "l"]),
313
+ ],
314
+ names=["pk_col_1", "sk_col_1", "col_1"],
315
+ ),
316
+ incremental_deltas=[
317
+ (
318
+ pa.Table.from_arrays(
319
+ [
320
+ pa.array([1, 4]),
321
+ pa.array([4.0, 2.0]),
322
+ pa.array(["a", "b"]),
323
+ ],
324
+ names=["pk_col_1", "sk_col_1", "col_1"],
325
+ ),
326
+ DeltaType.UPSERT,
327
+ None,
328
+ )
329
+ ],
330
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
331
+ [
332
+ pa.array([0, 1, 2, 3, 4]),
333
+ pa.array([3.0, 4.0, 1.0, 0.0, 2.0]),
334
+ pa.array(["i", "a", "k", "l", "b"]),
335
+ ],
336
+ names=["pk_col_1", "sk_col_1", "col_1"],
337
+ ),
338
+ expected_terminal_exception=None,
339
+ do_create_placement_group=False,
340
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
341
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
342
+ read_kwargs_provider=None,
343
+ drop_duplicates=True,
344
+ skip_enabled_compact_partition_drivers=None,
345
+ ),
346
+ "6-rebase-then-incremental-hash-bucket-GT-records-per-compacted-file-v2-only": RebaseThenIncrementalCompactionTestCaseParams(
347
+ primary_keys={"pk_col_1"},
348
+ sort_keys=[
349
+ SortKey.of(key_name="sk_col_1"),
350
+ SortKey.of(key_name="sk_col_2"),
351
+ ],
352
+ partition_keys=[PartitionKey.of("day", PartitionKeyType.TIMESTAMP)],
353
+ partition_values=["2022-01-01T00:00:00.000Z"],
354
+ input_deltas=pa.Table.from_arrays(
355
+ [
356
+ pa.array([str(i) for i in range(12)]),
357
+ pa.array([i for i in range(0, 12)]),
358
+ pa.array(["foo"] * 12),
359
+ pa.array([i / 10 for i in range(10, 22)]),
360
+ ],
361
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
362
+ ),
363
+ input_deltas_delta_type=DeltaType.UPSERT,
364
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
365
+ [
366
+ pa.array([str(i) for i in range(12)]),
367
+ pa.array([i for i in range(0, 12)]),
368
+ pa.array(["foo"] * 12),
369
+ pa.array([i / 10 for i in range(10, 22)]),
370
+ ],
371
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
372
+ ),
373
+ incremental_deltas=[
374
+ (
375
+ pa.Table.from_arrays(
376
+ [
377
+ pa.array([str(i) for i in range(12)]),
378
+ pa.array([i for i in range(20, 32)]),
379
+ pa.array(["foo"] * 12),
380
+ pa.array([i / 10 for i in range(40, 52)]),
381
+ ],
382
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
383
+ ),
384
+ DeltaType.UPSERT,
385
+ None,
386
+ )
387
+ ],
388
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
389
+ [
390
+ pa.array([str(i) for i in range(12)]),
391
+ pa.array([i for i in range(20, 32)]),
392
+ pa.array(["foo"] * 12),
393
+ pa.array([i / 10 for i in range(40, 52)]),
394
+ ],
395
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
396
+ ),
397
+ expected_terminal_exception=None,
398
+ do_create_placement_group=False,
399
+ records_per_compacted_file=10,
400
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT + 10,
401
+ read_kwargs_provider=None,
402
+ drop_duplicates=True,
403
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
404
+ ),
405
+ "7-rebase-then-incremental-no-pk-compactor-v2-only": RebaseThenIncrementalCompactionTestCaseParams(
406
+ primary_keys=ZERO_VALUED_PRIMARY_KEY,
407
+ sort_keys=[
408
+ SortKey.of(key_name="sk_col_1"),
409
+ ],
410
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
411
+ partition_values=["1"],
412
+ input_deltas=pa.Table.from_arrays(
413
+ [
414
+ pa.array([1, 2, 3]),
415
+ pa.array([1.0, 2.0, 3.0]),
416
+ ],
417
+ names=["sk_col_1", "col_1"],
418
+ ),
419
+ input_deltas_delta_type=DeltaType.UPSERT,
420
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
421
+ [
422
+ pa.array([1, 1, 2, 2, 3, 3]),
423
+ pa.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0]),
424
+ ],
425
+ names=["sk_col_1", "col_1"],
426
+ ),
427
+ incremental_deltas=[
428
+ (
429
+ pa.Table.from_arrays(
430
+ [
431
+ pa.array([4, 5, 6]),
432
+ pa.array([10.0, 11.0, 12.0]),
433
+ ],
434
+ names=["sk_col_1", "col_1"],
435
+ ),
436
+ DeltaType.UPSERT,
437
+ None,
438
+ )
439
+ ],
440
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
441
+ [
442
+ pa.array([1, 1, 2, 2, 3, 3, 4, 5, 6]),
443
+ pa.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 10.0, 11.0, 12.0]),
444
+ ],
445
+ names=["sk_col_1", "col_1"],
446
+ ),
447
+ expected_terminal_exception=None,
448
+ do_create_placement_group=False,
449
+ records_per_compacted_file=10,
450
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
451
+ read_kwargs_provider=None,
452
+ drop_duplicates=True,
453
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
454
+ ),
455
+ "8-rebase-then-incremental-empty-csv-delta-case": RebaseThenIncrementalCompactionTestCaseParams(
456
+ primary_keys={"pk_col_1"},
457
+ sort_keys=ZERO_VALUED_SORT_KEY,
458
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
459
+ partition_values=["1"],
460
+ input_deltas=pa.Table.from_arrays(
461
+ [
462
+ pa.array([str(i) for i in range(10)]),
463
+ pa.array([i / 10 for i in range(10, 20)]),
464
+ ],
465
+ names=["pk_col_1", "col_1"],
466
+ ),
467
+ input_deltas_delta_type=DeltaType.UPSERT,
468
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
469
+ [
470
+ pa.array([str(i) for i in range(10)]),
471
+ pa.array([i / 10 for i in range(10, 20)]),
472
+ ],
473
+ names=["pk_col_1", "col_1"],
474
+ ),
475
+ incremental_deltas=[
476
+ (
477
+ pyarrow_read_csv(
478
+ EMPTY_UTSV_PATH,
479
+ **ReadKwargsProviderPyArrowSchemaOverride(
480
+ schema=pa.schema(
481
+ [
482
+ ("pk_col_1", pa.string()),
483
+ ("col_1", pa.float64()),
484
+ ]
485
+ )
486
+ )(
487
+ ContentType.UNESCAPED_TSV.value,
488
+ content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value),
489
+ ),
490
+ ),
491
+ DeltaType.UPSERT,
492
+ None,
493
+ )
494
+ ],
495
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
496
+ [
497
+ pa.array(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]),
498
+ pa.array([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
499
+ ],
500
+ names=["pk_col_1", "col_1"],
501
+ ),
502
+ expected_terminal_exception=None,
503
+ do_create_placement_group=False,
504
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
505
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
506
+ read_kwargs_provider=None,
507
+ drop_duplicates=True,
508
+ skip_enabled_compact_partition_drivers=None,
509
+ ),
510
+ "9-rebase-then-incremental-single-hash-bucket": RebaseThenIncrementalCompactionTestCaseParams(
511
+ primary_keys={"pk_col_1"},
512
+ sort_keys=[
513
+ SortKey.of(key_name="sk_col_1"),
514
+ SortKey.of(key_name="sk_col_2"),
515
+ ],
516
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
517
+ partition_values=["1"],
518
+ input_deltas=pa.Table.from_arrays(
519
+ [
520
+ pa.array([str(i) for i in range(10)]),
521
+ pa.array([i for i in range(0, 10)]),
522
+ pa.array(["foo"] * 10),
523
+ pa.array([i / 10 for i in range(10, 20)]),
524
+ ],
525
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
526
+ ),
527
+ input_deltas_delta_type=DeltaType.UPSERT,
528
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
529
+ [
530
+ pa.array([str(i) for i in range(10)]),
531
+ pa.array([i for i in range(0, 10)]),
532
+ pa.array(["foo"] * 10),
533
+ pa.array([i / 10 for i in range(10, 20)]),
534
+ ],
535
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
536
+ ),
537
+ incremental_deltas=[
538
+ (
539
+ pa.Table.from_arrays(
540
+ [
541
+ pa.array([str(i) for i in range(10)]),
542
+ pa.array([i for i in range(20, 30)]),
543
+ pa.array(["foo"] * 10),
544
+ pa.array([i / 10 for i in range(40, 50)]),
545
+ ],
546
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
547
+ ),
548
+ DeltaType.UPSERT,
549
+ None,
550
+ )
551
+ ],
552
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
553
+ [
554
+ pa.array([str(i) for i in range(10)]),
555
+ pa.array([i for i in range(20, 30)]),
556
+ pa.array(["foo"] * 10),
557
+ pa.array([i / 10 for i in range(40, 50)]),
558
+ ],
559
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
560
+ ),
561
+ expected_terminal_exception=None,
562
+ do_create_placement_group=False,
563
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
564
+ hash_bucket_count=1,
565
+ read_kwargs_provider=None,
566
+ drop_duplicates=True,
567
+ skip_enabled_compact_partition_drivers=None,
568
+ ),
569
+ "10-rebase-then-incremental-drop-duplicates-false-on-incremental-v2-only": RebaseThenIncrementalCompactionTestCaseParams(
570
+ primary_keys={"pk_col_1"},
571
+ sort_keys=[
572
+ SortKey.of(key_name="sk_col_1"),
573
+ ],
574
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
575
+ partition_values=["1"],
576
+ input_deltas=pa.Table.from_arrays(
577
+ [
578
+ pa.array([(i % 4) for i in range(8)]),
579
+ pa.array([(i % 2) for i in range(8)]),
580
+ pa.array([i / 10 for i in range(10, 18)]),
581
+ ],
582
+ names=["pk_col_1", "sk_col_1", "col_1"],
583
+ ),
584
+ input_deltas_delta_type=DeltaType.UPSERT,
585
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
586
+ [
587
+ pa.array([0, 1, 2, 3]),
588
+ pa.array([0, 1, 0, 1]),
589
+ pa.array([1.4, 1.5, 1.6, 1.7]),
590
+ ],
591
+ names=["pk_col_1", "sk_col_1", "col_1"],
592
+ ),
593
+ incremental_deltas=[
594
+ (
595
+ pa.Table.from_arrays(
596
+ [
597
+ pa.array([0, 1, 2, 3, 1]),
598
+ pa.array([0, 1, 0, 1, 0]),
599
+ pa.array([i / 10 for i in range(20, 25)]),
600
+ ],
601
+ names=["pk_col_1", "sk_col_1", "col_1"],
602
+ ),
603
+ DeltaType.UPSERT,
604
+ None,
605
+ )
606
+ ],
607
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
608
+ [
609
+ pa.array([0, 0, 1, 1, 1, 2, 2, 3, 3]),
610
+ pa.array([0, 0, 1, 0, 1, 0, 0, 1, 1]),
611
+ pa.array([1.4, 2, 1.5, 2.4, 2.1, 1.6, 2.2, 1.7, 2.3]),
612
+ ],
613
+ names=["pk_col_1", "sk_col_1", "col_1"],
614
+ ),
615
+ expected_terminal_exception=None,
616
+ do_create_placement_group=False,
617
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
618
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
619
+ read_kwargs_provider=None,
620
+ drop_duplicates=False,
621
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
622
+ ),
623
+ "11-rebase-then-empty-incremental-delta": RebaseThenIncrementalCompactionTestCaseParams(
624
+ primary_keys={"pk_col_1"},
625
+ sort_keys=[
626
+ SortKey.of(key_name="sk_col_1"),
627
+ SortKey.of(key_name="sk_col_2"),
628
+ ],
629
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
630
+ partition_values=["1"],
631
+ input_deltas=pa.Table.from_arrays(
632
+ [
633
+ pa.array([str(i) for i in range(10)]),
634
+ pa.array([i for i in range(0, 10)]),
635
+ pa.array(["foo"] * 10),
636
+ pa.array([i / 10 for i in range(10, 20)]),
637
+ ],
638
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
639
+ ),
640
+ input_deltas_delta_type=DeltaType.UPSERT,
641
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
642
+ [
643
+ pa.array([str(i) for i in range(10)]),
644
+ pa.array([i for i in range(0, 10)]),
645
+ pa.array(["foo"] * 10),
646
+ pa.array([i / 10 for i in range(10, 20)]),
647
+ ],
648
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
649
+ ),
650
+ incremental_deltas=[(None, DeltaType.UPSERT, None)],
651
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
652
+ [
653
+ pa.array([str(i) for i in range(10)]),
654
+ pa.array([i for i in range(0, 10)]),
655
+ pa.array(["foo"] * 10),
656
+ pa.array([i / 10 for i in range(10, 20)]),
657
+ ],
658
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
659
+ ),
660
+ expected_terminal_exception=None,
661
+ do_create_placement_group=False,
662
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
663
+ hash_bucket_count=3,
664
+ read_kwargs_provider=None,
665
+ drop_duplicates=True,
666
+ skip_enabled_compact_partition_drivers=None,
667
+ ),
668
+ "12-rebase-then-incremental-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
669
+ primary_keys={"pk_col_1"},
670
+ sort_keys=[
671
+ SortKey.of(key_name="sk_col_1"),
672
+ SortKey.of(key_name="sk_col_2"),
673
+ ],
674
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
675
+ partition_values=["1"],
676
+ input_deltas=pa.Table.from_arrays(
677
+ [
678
+ pa.array([str(i) for i in range(10)]),
679
+ pa.array([i for i in range(0, 10)]),
680
+ pa.array(["foo"] * 10),
681
+ pa.array([i / 10 for i in range(10, 20)]),
682
+ ],
683
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
684
+ ),
685
+ input_deltas_delta_type=DeltaType.UPSERT,
686
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
687
+ [
688
+ pa.array([str(i) for i in range(10)]),
689
+ pa.array([i for i in range(0, 10)]),
690
+ pa.array(["foo"] * 10),
691
+ pa.array([i / 10 for i in range(10, 20)]),
692
+ ],
693
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
694
+ ),
695
+ incremental_deltas=[
696
+ (
697
+ pa.Table.from_arrays(
698
+ [
699
+ pa.array([str(i) for i in range(10)]),
700
+ pa.array([i for i in range(20, 30)]),
701
+ pa.array(["foo"] * 10),
702
+ pa.array([i / 10 for i in range(40, 50)]),
703
+ ],
704
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
705
+ ),
706
+ DeltaType.UPSERT,
707
+ None,
708
+ )
709
+ ],
710
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
711
+ [
712
+ pa.array([str(i) for i in range(10)]),
713
+ pa.array([i for i in range(20, 30)]),
714
+ pa.array(["foo"] * 10),
715
+ pa.array([i / 10 for i in range(40, 50)]),
716
+ ],
717
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
718
+ ),
719
+ expected_terminal_exception=None,
720
+ do_create_placement_group=False,
721
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
722
+ hash_bucket_count=1,
723
+ read_kwargs_provider=None,
724
+ drop_duplicates=True,
725
+ skip_enabled_compact_partition_drivers=None,
726
+ ),
727
+ "13-rebase-then-empty-incremental-delta-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
728
+ primary_keys={"pk_col_1"},
729
+ sort_keys=[
730
+ SortKey.of(key_name="sk_col_1"),
731
+ SortKey.of(key_name="sk_col_2"),
732
+ ],
733
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
734
+ partition_values=["1"],
735
+ input_deltas=pa.Table.from_arrays(
736
+ [
737
+ pa.array([str(i) for i in range(10)]),
738
+ pa.array([i for i in range(0, 10)]),
739
+ pa.array(["foo"] * 10),
740
+ pa.array([i / 10 for i in range(10, 20)]),
741
+ ],
742
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
743
+ ),
744
+ input_deltas_delta_type=DeltaType.UPSERT,
745
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
746
+ [
747
+ pa.array([str(i) for i in range(10)]),
748
+ pa.array([i for i in range(0, 10)]),
749
+ pa.array(["foo"] * 10),
750
+ pa.array([i / 10 for i in range(10, 20)]),
751
+ ],
752
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
753
+ ),
754
+ incremental_deltas=[(None, DeltaType.UPSERT, None)],
755
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
756
+ [
757
+ pa.array([str(i) for i in range(10)]),
758
+ pa.array([i for i in range(0, 10)]),
759
+ pa.array(["foo"] * 10),
760
+ pa.array([i / 10 for i in range(10, 20)]),
761
+ ],
762
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
763
+ ),
764
+ expected_terminal_exception=None,
765
+ do_create_placement_group=False,
766
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
767
+ hash_bucket_count=1,
768
+ read_kwargs_provider=None,
769
+ drop_duplicates=True,
770
+ skip_enabled_compact_partition_drivers=None,
771
+ ),
772
+ }
773
+
774
+ REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
775
+ "14-rebase-then-incremental-delete-type-delta-on-incremental": RebaseThenIncrementalCompactionTestCaseParams(
776
+ primary_keys={"pk_col_1"},
777
+ sort_keys=ZERO_VALUED_SORT_KEY,
778
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
779
+ partition_values=["1"],
780
+ input_deltas=pa.Table.from_arrays(
781
+ [
782
+ pa.array([i for i in range(12)]),
783
+ pa.array([str(i) for i in range(0, 12)]),
784
+ ],
785
+ names=["pk_col_1", "col_1"],
786
+ ),
787
+ input_deltas_delta_type=DeltaType.UPSERT,
788
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
789
+ [
790
+ pa.array([i for i in range(12)]),
791
+ pa.array([str(i) for i in range(0, 12)]),
792
+ ],
793
+ names=["pk_col_1", "col_1"],
794
+ ),
795
+ incremental_deltas=[
796
+ (
797
+ pa.Table.from_arrays(
798
+ [
799
+ pa.array([10, 11, 12, 13]),
800
+ pa.array(["a", "b", "c", "d"]),
801
+ ],
802
+ names=["pk_col_1", "col_1"],
803
+ ),
804
+ DeltaType.UPSERT,
805
+ None,
806
+ ),
807
+ (
808
+ pa.Table.from_arrays(
809
+ [pa.array([10, 11]), pa.array(["a", "b"])],
810
+ names=["pk_col_1", "col_1"],
811
+ ),
812
+ DeltaType.DELETE,
813
+ DeleteParameters.of(["pk_col_1", "col_1"]),
814
+ ),
815
+ (
816
+ pa.Table.from_arrays(
817
+ [pa.array(["c"])],
818
+ names=["col_1"],
819
+ ),
820
+ DeltaType.DELETE,
821
+ DeleteParameters.of(["col_1"]),
822
+ ),
823
+ (
824
+ pa.Table.from_arrays(
825
+ [pa.array(["c"])],
826
+ names=["col_1"],
827
+ ),
828
+ DeltaType.DELETE,
829
+ DeleteParameters.of(["col_1"]),
830
+ ),
831
+ (
832
+ pa.Table.from_arrays(
833
+ [pa.array(["c"])],
834
+ names=["col_1"],
835
+ ),
836
+ DeltaType.DELETE,
837
+ DeleteParameters.of(["col_1"]),
838
+ ),
839
+ (
840
+ pa.Table.from_arrays(
841
+ [pa.array([10, 11]), pa.array(["a", "b"])],
842
+ names=["pk_col_1", "col_1"],
843
+ ),
844
+ DeltaType.DELETE,
845
+ DeleteParameters.of(["pk_col_1", "col_1"]),
846
+ ),
847
+ (
848
+ pa.Table.from_arrays(
849
+ [pa.array(["c"])],
850
+ names=["col_1"],
851
+ ),
852
+ DeltaType.DELETE,
853
+ DeleteParameters.of(["col_1"]),
854
+ ),
855
+ ],
856
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
857
+ [
858
+ pa.array([i for i in range(10)] + [13]),
859
+ pa.array([str(i) for i in range(0, 10)] + ["d"]),
860
+ ],
861
+ names=["pk_col_1", "col_1"],
862
+ ),
863
+ expected_terminal_exception=None,
864
+ do_create_placement_group=False,
865
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
866
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
867
+ read_kwargs_provider=None,
868
+ drop_duplicates=True,
869
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
870
+ ),
871
+ "15-rebase-then-incremental-delete-type-delta-on-incremental-multi-pk": RebaseThenIncrementalCompactionTestCaseParams(
872
+ primary_keys={"pk_col_1", "pk_col_2"},
873
+ sort_keys=ZERO_VALUED_SORT_KEY,
874
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
875
+ partition_values=["2022-01-01T00:00:00.000Z"],
876
+ input_deltas=pa.Table.from_arrays(
877
+ [
878
+ pa.array([(i % 4) for i in range(12)]),
879
+ pa.array([float(i % 4) for i in range(12, 0, -1)]),
880
+ pa.array([str(i) for i in range(0, 12)]),
881
+ ],
882
+ names=["pk_col_1", "pk_col_2", "col_1"],
883
+ ),
884
+ input_deltas_delta_type=DeltaType.UPSERT,
885
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
886
+ [
887
+ pa.array([0, 1, 2, 3]),
888
+ pa.array([0.0, 3.0, 2.0, 1.0]),
889
+ pa.array(["8", "9", "10", "11"]),
890
+ ],
891
+ names=["pk_col_1", "pk_col_2", "col_1"],
892
+ ),
893
+ incremental_deltas=[
894
+ (
895
+ pa.Table.from_arrays(
896
+ [ # delete last two primary keys
897
+ pa.array(["10", "11"]),
898
+ ],
899
+ names=["col_1"],
900
+ ),
901
+ DeltaType.DELETE,
902
+ DeleteParameters.of(["col_1"]),
903
+ )
904
+ ],
905
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
906
+ [
907
+ pa.array([0, 1]),
908
+ pa.array([0.0, 3.0]),
909
+ pa.array(["8", "9"]),
910
+ ],
911
+ names=["pk_col_1", "pk_col_2", "col_1"],
912
+ ),
913
+ expected_terminal_exception=None,
914
+ do_create_placement_group=False,
915
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
916
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
917
+ read_kwargs_provider=None,
918
+ drop_duplicates=True,
919
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
920
+ ),
921
+ "16-rebase-then-incremental-delete-type-delta-on-incremental-multi-pk-delete-all": RebaseThenIncrementalCompactionTestCaseParams(
922
+ primary_keys={"pk_col_1", "pk_col_2"},
923
+ sort_keys=ZERO_VALUED_SORT_KEY,
924
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
925
+ partition_values=["2022-01-01T00:00:00.000Z"],
926
+ input_deltas=pa.Table.from_arrays(
927
+ [
928
+ pa.array([(i % 4) for i in range(12)]),
929
+ pa.array([float(i % 4) for i in range(12, 0, -1)]),
930
+ pa.array([str(i) for i in range(0, 12)]),
931
+ ],
932
+ names=["pk_col_1", "pk_col_2", "col_1"],
933
+ ),
934
+ input_deltas_delta_type=DeltaType.UPSERT,
935
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
936
+ [
937
+ pa.array([0, 1, 2, 3]),
938
+ pa.array([0.0, 3.0, 2.0, 1.0]),
939
+ pa.array(["8", "9", "10", "11"]),
940
+ ],
941
+ names=["pk_col_1", "pk_col_2", "col_1"],
942
+ ),
943
+ incremental_deltas=[
944
+ (
945
+ pa.Table.from_arrays(
946
+ [
947
+ pa.array(["8", "9", "10", "11"]),
948
+ ],
949
+ names=["col_1"],
950
+ ),
951
+ DeltaType.DELETE,
952
+ DeleteParameters.of(["col_1"]),
953
+ )
954
+ ],
955
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
956
+ [
957
+ pa.array([]),
958
+ pa.array([]),
959
+ pa.array([]),
960
+ ],
961
+ schema=pa.schema(
962
+ [
963
+ ("pk_col_1", pa.int64()),
964
+ ("pk_col_2", pa.float64()),
965
+ ("col_1", pa.string()),
966
+ ]
967
+ ),
968
+ ),
969
+ expected_terminal_exception=None,
970
+ do_create_placement_group=False,
971
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
972
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
973
+ read_kwargs_provider=None,
974
+ drop_duplicates=True,
975
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
976
+ ),
977
+ "17-rebase-then-incremental-delete-type-delta-delete-entire-base-table": RebaseThenIncrementalCompactionTestCaseParams(
978
+ primary_keys={"pk_col_1"},
979
+ sort_keys=ZERO_VALUED_SORT_KEY,
980
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
981
+ partition_values=["2022-01-01T00:00:00.000Z"],
982
+ input_deltas=pa.Table.from_arrays(
983
+ [
984
+ pa.array([(i % 4) for i in range(1000)] + [4, 5]),
985
+ pa.array([str(i) for i in range(0, 1000)] + ["fiz", "buz"]),
986
+ ],
987
+ names=["pk_col_1", "col_1"],
988
+ ),
989
+ input_deltas_delta_type=DeltaType.UPSERT,
990
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
991
+ [
992
+ pa.array([0, 1, 2, 3, 4, 5]),
993
+ pa.array(["996", "997", "998", "999", "fiz", "buz"]),
994
+ ],
995
+ names=["pk_col_1", "col_1"],
996
+ ),
997
+ incremental_deltas=[
998
+ (
999
+ pa.Table.from_arrays(
1000
+ [
1001
+ pa.array(["996", "997", "998", "999", "fiz", "buz"]),
1002
+ ],
1003
+ names=["col_1"],
1004
+ ),
1005
+ DeltaType.DELETE,
1006
+ DeleteParameters.of(["col_1"]),
1007
+ )
1008
+ ],
1009
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1010
+ [
1011
+ pa.array([]),
1012
+ pa.array([]),
1013
+ ],
1014
+ schema=pa.schema(
1015
+ [
1016
+ ("pk_col_1", pa.int64()),
1017
+ ("col_1", pa.string()),
1018
+ ]
1019
+ ),
1020
+ ),
1021
+ expected_terminal_exception=None,
1022
+ do_create_placement_group=True,
1023
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1024
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1025
+ read_kwargs_provider=None,
1026
+ drop_duplicates=True,
1027
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1028
+ ),
1029
+ "18-rebase-then-incremental-delete-type-delta-keep-base-table-drop-all-incremental": RebaseThenIncrementalCompactionTestCaseParams(
1030
+ primary_keys={"pk_col_1"},
1031
+ sort_keys=ZERO_VALUED_SORT_KEY,
1032
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1033
+ partition_values=["2022-01-01T00:00:00.000Z"],
1034
+ input_deltas=pa.Table.from_arrays(
1035
+ [
1036
+ pa.array([(i % 4) for i in range(1000)] + [4, 5]),
1037
+ pa.array([str(i) for i in range(0, 1000)] + ["fiz", "buz"]),
1038
+ ],
1039
+ names=["pk_col_1", "col_1"],
1040
+ ),
1041
+ input_deltas_delta_type=DeltaType.UPSERT,
1042
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1043
+ [
1044
+ pa.array([0, 1, 2, 3, 4, 5]),
1045
+ pa.array(["996", "997", "998", "999", "fiz", "buz"]),
1046
+ ],
1047
+ names=["pk_col_1", "col_1"],
1048
+ ),
1049
+ incremental_deltas=[
1050
+ (
1051
+ pa.Table.from_arrays(
1052
+ [
1053
+ pa.array([0, 1, 2]),
1054
+ pa.array(["0", "1", "2"]),
1055
+ ],
1056
+ names=["pk_col_1", "col_1"],
1057
+ ),
1058
+ DeltaType.UPSERT,
1059
+ None,
1060
+ ),
1061
+ (
1062
+ pa.Table.from_arrays(
1063
+ [
1064
+ pa.array([2, 3]),
1065
+ pa.array(["abc", "def"]),
1066
+ ],
1067
+ names=["pk_col_1", "col_1"],
1068
+ ),
1069
+ DeltaType.UPSERT,
1070
+ None,
1071
+ ),
1072
+ (
1073
+ pa.Table.from_arrays(
1074
+ [
1075
+ pa.array([4]),
1076
+ pa.array(["ghi"]),
1077
+ ],
1078
+ names=["pk_col_1", "col_1"],
1079
+ ),
1080
+ DeltaType.UPSERT,
1081
+ None,
1082
+ ),
1083
+ (
1084
+ pa.Table.from_arrays(
1085
+ [
1086
+ pa.array(["0", "1", "2", "abc", "def", "ghi"]),
1087
+ ],
1088
+ names=["col_1"],
1089
+ ),
1090
+ DeltaType.DELETE,
1091
+ DeleteParameters.of(["col_1"]),
1092
+ ),
1093
+ ],
1094
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1095
+ [
1096
+ pa.array([5]),
1097
+ pa.array(["buz"]),
1098
+ ],
1099
+ names=["pk_col_1", "col_1"],
1100
+ ),
1101
+ expected_terminal_exception=None,
1102
+ do_create_placement_group=True,
1103
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1104
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1105
+ read_kwargs_provider=None,
1106
+ drop_duplicates=True,
1107
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1108
+ ),
1109
+ "19-rebase-then-incremental-delete-type-delta-drop-only-from-base-table-keep-all-incremental": RebaseThenIncrementalCompactionTestCaseParams(
1110
+ primary_keys={"pk_col_1"},
1111
+ sort_keys=ZERO_VALUED_SORT_KEY,
1112
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1113
+ partition_values=["2022-01-01T00:00:00.000Z"],
1114
+ input_deltas=pa.Table.from_arrays(
1115
+ [
1116
+ pa.array([(i % 4) for i in range(1000)] + [4, 5]),
1117
+ pa.array([str(i) for i in range(0, 1000)] + ["fiz", "buz"]),
1118
+ ],
1119
+ names=["pk_col_1", "col_1"],
1120
+ ),
1121
+ input_deltas_delta_type=DeltaType.UPSERT,
1122
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1123
+ [
1124
+ pa.array([0, 1, 2, 3, 4, 5]),
1125
+ pa.array(["996", "997", "998", "999", "fiz", "buz"]),
1126
+ ],
1127
+ names=["pk_col_1", "col_1"],
1128
+ ),
1129
+ incremental_deltas=[
1130
+ (
1131
+ pa.Table.from_arrays(
1132
+ [
1133
+ pa.array([0, 1, 2, 3, 4, 5]),
1134
+ pa.array(["0", "1", "2", "3", "4", "5"]),
1135
+ ],
1136
+ names=["pk_col_1", "col_1"],
1137
+ ),
1138
+ DeltaType.UPSERT,
1139
+ None,
1140
+ ),
1141
+ (
1142
+ pa.Table.from_arrays(
1143
+ [
1144
+ pa.array([0]),
1145
+ pa.array(["foo"]),
1146
+ ],
1147
+ names=["pk_col_1", "col_1"],
1148
+ ),
1149
+ DeltaType.UPSERT,
1150
+ None,
1151
+ ),
1152
+ (
1153
+ pa.Table.from_arrays(
1154
+ [
1155
+ pa.array(["996", "997", "998", "999", "fiz", "buz"]),
1156
+ ],
1157
+ names=["col_1"],
1158
+ ),
1159
+ DeltaType.DELETE,
1160
+ DeleteParameters.of(["col_1"]),
1161
+ ),
1162
+ ],
1163
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1164
+ [
1165
+ pa.array([0, 1, 2, 3, 4, 5]),
1166
+ pa.array(["foo", "1", "2", "3", "4", "5"]),
1167
+ ],
1168
+ names=["pk_col_1", "col_1"],
1169
+ ),
1170
+ expected_terminal_exception=None,
1171
+ do_create_placement_group=True,
1172
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1173
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1174
+ read_kwargs_provider=None,
1175
+ drop_duplicates=True,
1176
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1177
+ ),
1178
+ "20-rebase-then-incremental-delete-type-delta-drop-all-base-table-drop-all-incremental": RebaseThenIncrementalCompactionTestCaseParams(
1179
+ primary_keys={"pk_col_1"},
1180
+ sort_keys=ZERO_VALUED_SORT_KEY,
1181
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1182
+ partition_values=["2022-01-01T00:00:00.000Z"],
1183
+ input_deltas=pa.Table.from_arrays(
1184
+ [
1185
+ pa.array([(i % 4) for i in range(1000)] + [4, 5]),
1186
+ pa.array([str(i) for i in range(0, 1000)] + ["fiz", "buz"]),
1187
+ ],
1188
+ names=["pk_col_1", "col_1"],
1189
+ ),
1190
+ input_deltas_delta_type=DeltaType.UPSERT,
1191
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1192
+ [
1193
+ pa.array([0, 1, 2, 3, 4, 5]),
1194
+ pa.array(["996", "997", "998", "999", "fiz", "buz"]),
1195
+ ],
1196
+ names=["pk_col_1", "col_1"],
1197
+ ),
1198
+ incremental_deltas=[
1199
+ (
1200
+ pa.Table.from_arrays(
1201
+ [
1202
+ pa.array([6]),
1203
+ pa.array(["foo"]),
1204
+ ],
1205
+ names=["pk_col_1", "col_1"],
1206
+ ),
1207
+ DeltaType.UPSERT,
1208
+ None,
1209
+ ),
1210
+ (
1211
+ pa.Table.from_arrays(
1212
+ [
1213
+ pa.array(["996", "997", "998", "999", "fiz", "buz", "foo"]),
1214
+ ],
1215
+ names=["col_1"],
1216
+ ),
1217
+ DeltaType.DELETE,
1218
+ DeleteParameters.of(["col_1"]),
1219
+ ),
1220
+ ],
1221
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1222
+ [
1223
+ pa.array([]),
1224
+ pa.array([]),
1225
+ ],
1226
+ schema=pa.schema(
1227
+ [
1228
+ ("pk_col_1", pa.int64()),
1229
+ ("col_1", pa.string()),
1230
+ ]
1231
+ ),
1232
+ ),
1233
+ expected_terminal_exception=None,
1234
+ do_create_placement_group=True,
1235
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1236
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1237
+ read_kwargs_provider=None,
1238
+ drop_duplicates=True,
1239
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1240
+ ),
1241
+ "21-rebase-then-incremental-delete-type-delta-UDDUUDD": RebaseThenIncrementalCompactionTestCaseParams(  # NOTE(review): the key suffix reads as seven deltas (U,D,D,U,U,D,D) but incremental_deltas below lists five: UPSERT, DELETE, DELETE, UPSERT, DELETE -- confirm whether the name or the fixture is the intended one
1242
+ primary_keys={"pk_col_1"},
1243
+ sort_keys=ZERO_VALUED_SORT_KEY,
1244
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1245
+ partition_values=["2022-01-01T00:00:00.000Z"],
1246
+ input_deltas=pa.Table.from_arrays(
1247
+ [
1248
+ pa.array([(i % 4) for i in range(1000)] + [4, 5]),
1249
+ pa.array([str(i) for i in range(0, 1000)] + ["fiz", "buz"]),
1250
+ ],
1251
+ names=["pk_col_1", "col_1"],
1252
+ ),
1253
+ input_deltas_delta_type=DeltaType.UPSERT,
1254
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1255
+ [
1256
+ pa.array([0, 1, 2, 3, 4, 5]),
1257
+ pa.array(["996", "997", "998", "999", "fiz", "buz"]),
1258
+ ],
1259
+ names=["pk_col_1", "col_1"],
1260
+ ),
1261
+ incremental_deltas=[
1262
+ (
1263
+ pa.Table.from_arrays(
1264
+ [
1265
+ pa.array([0, 1, 2, 3, 4, 5]),
1266
+ pa.array(["0", "1", "2", "3", "4", "5"]),
1267
+ ],
1268
+ names=["pk_col_1", "col_1"],
1269
+ ),
1270
+ DeltaType.UPSERT,
1271
+ None,
1272
+ ),
1273
+ (
1274
+ pa.Table.from_arrays(
1275
+ [
1276
+ pa.array(["DOESNOTEXIST"]),
1277
+ ],
1278
+ names=["col_1"],
1279
+ ),
1280
+ DeltaType.DELETE,
1281
+ DeleteParameters.of(["col_1"]),
1282
+ ),
1283
+ (
1284
+ pa.Table.from_arrays(
1285
+ [
1286
+ pa.array([1]),
1287
+ pa.array(["1"]),
1288
+ ],
1289
+ names=["pk_col_1", "col_1"],
1290
+ ),
1291
+ DeltaType.DELETE,
1292
+ DeleteParameters.of(["col_1"]),
1293
+ ),
1294
+ (
1295
+ pa.Table.from_arrays(
1296
+ [
1297
+ pa.array([2, 3, 6, 7]),
1298
+ pa.array(["boo", "bar", "fiz", "aaa"]),
1299
+ ],
1300
+ names=["pk_col_1", "col_1"],
1301
+ ),
1302
+ DeltaType.UPSERT,
1303
+ None,
1304
+ ),
1305
+ (
1306
+ pa.Table.from_arrays(
1307
+ [
1308
+ pa.array(["fiz", "bar", "boo"]),
1309
+ ],
1310
+ names=["col_1"],
1311
+ ),
1312
+ DeltaType.DELETE,
1313
+ DeleteParameters.of(["col_1"]),
1314
+ ),
1315
+ ],
1316
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1317
+ [
1318
+ pa.array([0, 4, 5, 7]),
1319
+ pa.array(["0", "4", "5", "aaa"]),
1320
+ ],
1321
+ names=["pk_col_1", "col_1"],
1322
+ ),
1323
+ expected_terminal_exception=None,
1324
+ do_create_placement_group=True,
1325
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1326
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1327
+ read_kwargs_provider=None,
1328
+ drop_duplicates=True,
1329
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1330
+ ),
1331
+ "22-rebase-then-incremental-delete-type-delta-UD-affects-compacted-and-incremental": RebaseThenIncrementalCompactionTestCaseParams(
1332
+ primary_keys={"pk_col_1"},
1333
+ sort_keys=ZERO_VALUED_SORT_KEY,
1334
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1335
+ partition_values=["2022-01-01T00:00:00.000Z"],
1336
+ input_deltas=pa.Table.from_arrays(
1337
+ [
1338
+ pa.array([0, 1]),
1339
+ pa.array(["0", "1"]),
1340
+ ],
1341
+ names=["pk_col_1", "col_1"],
1342
+ ),
1343
+ input_deltas_delta_type=DeltaType.UPSERT,
1344
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1345
+ [
1346
+ pa.array([0, 1]),
1347
+ pa.array(["0", "1"]),
1348
+ ],
1349
+ names=["pk_col_1", "col_1"],
1350
+ ),
1351
+ incremental_deltas=[
1352
+ (
1353
+ pa.Table.from_arrays(
1354
+ [
1355
+ pa.array([0]),
1356
+ pa.array(["1"]),
1357
+ ],
1358
+ names=["pk_col_1", "col_1"],
1359
+ ),
1360
+ DeltaType.UPSERT,
1361
+ None,
1362
+ ),
1363
+ (
1364
+ pa.Table.from_arrays(
1365
+ [
1366
+ pa.array(["1"]),
1367
+ ],
1368
+ names=["col_1"],
1369
+ ),
1370
+ DeltaType.DELETE,
1371
+ DeleteParameters.of(["col_1"]),
1372
+ ),
1373
+ ],
1374
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1375
+ [
1376
+ pa.array([]),
1377
+ pa.array([]),
1378
+ ],
1379
+ schema=pa.schema(
1380
+ [
1381
+ ("pk_col_1", pa.int64()),
1382
+ ("col_1", pa.string()),
1383
+ ]
1384
+ ),
1385
+ ),
1386
+ expected_terminal_exception=None,
1387
+ do_create_placement_group=True,
1388
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1389
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1390
+ read_kwargs_provider=None,
1391
+ drop_duplicates=True,
1392
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1393
+ ),
1394
+ "23-rebase-then-incremental-delete-type-delta-UDU-upsert-again": RebaseThenIncrementalCompactionTestCaseParams(
1395
+ primary_keys={"pk_col_1"},
1396
+ sort_keys=ZERO_VALUED_SORT_KEY,
1397
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1398
+ partition_values=["2022-01-01T00:00:00.000Z"],
1399
+ input_deltas=pa.Table.from_arrays(
1400
+ [
1401
+ pa.array([0, 1]),
1402
+ pa.array(["0", "1"]),
1403
+ ],
1404
+ names=["pk_col_1", "col_1"],
1405
+ ),
1406
+ input_deltas_delta_type=DeltaType.UPSERT,
1407
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1408
+ [
1409
+ pa.array([0, 1]),
1410
+ pa.array(["0", "1"]),
1411
+ ],
1412
+ names=["pk_col_1", "col_1"],
1413
+ ),
1414
+ incremental_deltas=[
1415
+ (
1416
+ pa.Table.from_arrays(
1417
+ [
1418
+ pa.array([0]),
1419
+ pa.array(["1"]),
1420
+ ],
1421
+ names=["pk_col_1", "col_1"],
1422
+ ),
1423
+ DeltaType.UPSERT,
1424
+ None,
1425
+ ),
1426
+ (
1427
+ pa.Table.from_arrays(
1428
+ [
1429
+ pa.array(["1"]),
1430
+ ],
1431
+ names=["col_1"],
1432
+ ),
1433
+ DeltaType.DELETE,
1434
+ DeleteParameters.of(["col_1"]),
1435
+ ),
1436
+ (
1437
+ pa.Table.from_arrays(
1438
+ [
1439
+ pa.array([0]),
1440
+ pa.array(["1"]),
1441
+ ],
1442
+ names=["pk_col_1", "col_1"],
1443
+ ),
1444
+ DeltaType.UPSERT,
1445
+ None,
1446
+ ),
1447
+ ],
1448
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1449
+ [
1450
+ pa.array([0]),
1451
+ pa.array(["1"]),
1452
+ ],
1453
+ names=["pk_col_1", "col_1"],
1454
+ ),
1455
+ expected_terminal_exception=None,
1456
+ do_create_placement_group=True,
1457
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1458
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1459
+ read_kwargs_provider=None,
1460
+ drop_duplicates=True,
1461
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1462
+ ),
1463
+ "24-rebase-then-incremental-delete-type-no-delete-column-has-delete-deltas-expected-exception": RebaseThenIncrementalCompactionTestCaseParams(
1464
+ primary_keys={"pk_col_1"},
1465
+ sort_keys=ZERO_VALUED_SORT_KEY,
1466
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1467
+ partition_values=["2022-01-01T00:00:00.000Z"],
1468
+ input_deltas=pa.Table.from_arrays(
1469
+ [
1470
+ pa.array([0, 1]),
1471
+ pa.array(["0", "1"]),
1472
+ ],
1473
+ names=["pk_col_1", "col_1"],
1474
+ ),
1475
+ input_deltas_delta_type=DeltaType.UPSERT,
1476
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1477
+ [
1478
+ pa.array([0, 1]),
1479
+ pa.array(["0", "1"]),
1480
+ ],
1481
+ names=["pk_col_1", "col_1"],
1482
+ ),
1483
+ incremental_deltas=[
1484
+ (
1485
+ pa.Table.from_arrays(
1486
+ [
1487
+ pa.array([0]),
1488
+ pa.array(["1"]),
1489
+ ],
1490
+ names=["pk_col_1", "col_1"],
1491
+ ),
1492
+ DeltaType.UPSERT,
1493
+ None,
1494
+ ),
1495
+ (
1496
+ pa.Table.from_arrays(
1497
+ [
1498
+ pa.array(["1"]),
1499
+ ],
1500
+ names=["col_1"],
1501
+ ),
1502
+ DeltaType.DELETE,
1503
+ None,
1504
+ ),
1505
+ ],
1506
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1507
+ [
1508
+ pa.array([]),
1509
+ pa.array([]),
1510
+ ],
1511
+ schema=pa.schema(
1512
+ [
1513
+ ("pk_col_1", pa.int64()),
1514
+ ("col_1", pa.string()),
1515
+ ]
1516
+ ),
1517
+ ),
1518
+ expected_terminal_exception=AssertionError,
1519
+ do_create_placement_group=True,
1520
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1521
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1522
+ read_kwargs_provider=None,
1523
+ drop_duplicates=True,
1524
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1525
+ ),
1526
+ "25-rebase-then-incremental-delete-type-delta-has-delete-column-no-delete-records": RebaseThenIncrementalCompactionTestCaseParams(
1527
+ primary_keys={"pk_col_1"},
1528
+ sort_keys=ZERO_VALUED_SORT_KEY,
1529
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1530
+ partition_values=["2022-01-01T00:00:00.000Z"],
1531
+ input_deltas=pa.Table.from_arrays(
1532
+ [
1533
+ pa.array([0, 1, 2]),
1534
+ pa.array(["0", "1", "2"]),
1535
+ ],
1536
+ names=["pk_col_1", "col_1"],
1537
+ ),
1538
+ input_deltas_delta_type=DeltaType.UPSERT,
1539
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1540
+ [
1541
+ pa.array([0, 1, 2]),
1542
+ pa.array(["0", "1", "2"]),
1543
+ ],
1544
+ names=["pk_col_1", "col_1"],
1545
+ ),
1546
+ incremental_deltas=[
1547
+ (
1548
+ pa.Table.from_arrays(
1549
+ [
1550
+ pa.array([0]),
1551
+ pa.array(["1"]),
1552
+ ],
1553
+ names=["pk_col_1", "col_1"],
1554
+ ),
1555
+ DeltaType.UPSERT,
1556
+ None,
1557
+ ),
1558
+ (
1559
+ pa.Table.from_arrays(
1560
+ [
1561
+ pa.array([]),
1562
+ pa.array([]),
1563
+ ],
1564
+ schema=pa.schema(
1565
+ [
1566
+ ("pk_col_1", pa.int64()),
1567
+ ("col_1", pa.string()),
1568
+ ]
1569
+ ),
1570
+ ),
1571
+ DeltaType.DELETE,
1572
+ DeleteParameters.of(["col_1"]),
1573
+ ),
1574
+ ],
1575
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1576
+ [
1577
+ pa.array([0, 1, 2]),
1578
+ pa.array(["1", "1", "2"]),
1579
+ ],
1580
+ names=["pk_col_1", "col_1"],
1581
+ ),
1582
+ expected_terminal_exception=None,
1583
+ do_create_placement_group=True,
1584
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1585
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1586
+ read_kwargs_provider=None,
1587
+ drop_duplicates=True,
1588
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1589
+ ),
1590
+ "26-rebase-then-incremental-delete-type-delta-UDU-duplicate-delete-records": RebaseThenIncrementalCompactionTestCaseParams(  # NOTE(review): the key says "UDU" but incremental_deltas below holds only two deltas (UPSERT, then a DELETE with repeated values) -- confirm whether a trailing UPSERT is missing or the name should read "UD"
1591
+ primary_keys={"pk_col_1"},
1592
+ sort_keys=ZERO_VALUED_SORT_KEY,
1593
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1594
+ partition_values=["2022-01-01T00:00:00.000Z"],
1595
+ input_deltas=pa.Table.from_arrays(
1596
+ [
1597
+ pa.array([0, 1, 2, 3]),
1598
+ pa.array(["0", "1", "2", "3"]),
1599
+ ],
1600
+ names=["pk_col_1", "col_1"],
1601
+ ),
1602
+ input_deltas_delta_type=DeltaType.UPSERT,
1603
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1604
+ [
1605
+ pa.array([0, 1, 2, 3]),
1606
+ pa.array(["0", "1", "2", "3"]),
1607
+ ],
1608
+ names=["pk_col_1", "col_1"],
1609
+ ),
1610
+ incremental_deltas=[
1611
+ (
1612
+ pa.Table.from_arrays(
1613
+ [
1614
+ pa.array([0]),
1615
+ pa.array(["1"]),
1616
+ ],
1617
+ names=["pk_col_1", "col_1"],
1618
+ ),
1619
+ DeltaType.UPSERT,
1620
+ None,
1621
+ ),
1622
+ (
1623
+ pa.Table.from_arrays(
1624
+ [
1625
+ pa.array(["1", "1", "1", "2", "2", "2"]),
1626
+ ],
1627
+ names=["col_1"],
1628
+ ),
1629
+ DeltaType.DELETE,
1630
+ DeleteParameters.of(["col_1"]),
1631
+ ),
1632
+ ],
1633
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1634
+ [
1635
+ pa.array([3]),
1636
+ pa.array(["3"]),
1637
+ ],
1638
+ names=["pk_col_1", "col_1"],
1639
+ ),
1640
+ expected_terminal_exception=None,
1641
+ do_create_placement_group=True,
1642
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1643
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1644
+ read_kwargs_provider=None,
1645
+ drop_duplicates=True,
1646
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1647
+ ),
1648
+ "27-rebase-then-incremental-delete-type-delta-DDU-deletes-then-upserts": RebaseThenIncrementalCompactionTestCaseParams(
1649
+ primary_keys={"pk_col_1"},
1650
+ sort_keys=ZERO_VALUED_SORT_KEY,
1651
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.TIMESTAMP)],
1652
+ partition_values=["2022-01-01T00:00:00.000Z"],
1653
+ input_deltas=pa.Table.from_arrays(
1654
+ [
1655
+ pa.array([0, 1, 2, 3, 4]),
1656
+ pa.array(["0", "1", "2", "3", "4"]),
1657
+ ],
1658
+ names=["pk_col_1", "col_1"],
1659
+ ),
1660
+ input_deltas_delta_type=DeltaType.UPSERT,
1661
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1662
+ [
1663
+ pa.array([0, 1, 2, 3, 4]),
1664
+ pa.array(["0", "1", "2", "3", "4"]),
1665
+ ],
1666
+ names=["pk_col_1", "col_1"],
1667
+ ),
1668
+ incremental_deltas=[
1669
+ (
1670
+ pa.Table.from_arrays(
1671
+ [
1672
+ pa.array(["1", "1", "2"]),
1673
+ ],
1674
+ names=["col_1"],
1675
+ ),
1676
+ DeltaType.DELETE,
1677
+ DeleteParameters.of(["col_1"]),
1678
+ ),
1679
+ (
1680
+ pa.Table.from_arrays(
1681
+ [
1682
+ pa.array([0]),
1683
+ ],
1684
+ names=["pk_col_1"],
1685
+ ),
1686
+ DeltaType.DELETE,
1687
+ DeleteParameters.of(["pk_col_1"]),
1688
+ ),
1689
+ (
1690
+ pa.Table.from_arrays(
1691
+ [
1692
+ pa.array([0, 3]),
1693
+ pa.array(["a", "b"]),
1694
+ ],
1695
+ names=["pk_col_1", "col_1"],
1696
+ ),
1697
+ DeltaType.UPSERT,
1698
+ None,
1699
+ ),
1700
+ ],
1701
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1702
+ [
1703
+ pa.array([0, 3, 4]),
1704
+ pa.array(["a", "b", "4"]),
1705
+ ],
1706
+ names=["pk_col_1", "col_1"],
1707
+ ),
1708
+ expected_terminal_exception=None,
1709
+ do_create_placement_group=True,
1710
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1711
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1712
+ read_kwargs_provider=None,
1713
+ drop_duplicates=True,
1714
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1715
+ ),
1716
+ "28-rebase-then-incremental-delete-type-delta-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
1717
+ primary_keys={"pk_col_1"},
1718
+ sort_keys=[
1719
+ SortKey.of(key_name="sk_col_1"),
1720
+ SortKey.of(key_name="sk_col_2"),
1721
+ ],
1722
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1723
+ partition_values=["1"],
1724
+ input_deltas=pa.Table.from_arrays(
1725
+ [
1726
+ pa.array([str(i) for i in range(10)]),
1727
+ pa.array([i for i in range(0, 10)]),
1728
+ pa.array(["foo"] * 10),
1729
+ pa.array([i / 10 for i in range(10, 20)]),
1730
+ ],
1731
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1732
+ ),
1733
+ input_deltas_delta_type=DeltaType.UPSERT,
1734
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1735
+ [
1736
+ pa.array([str(i) for i in range(10)]),
1737
+ pa.array([i for i in range(0, 10)]),
1738
+ pa.array(["foo"] * 10),
1739
+ pa.array([i / 10 for i in range(10, 20)]),
1740
+ ],
1741
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1742
+ ),
1743
+ incremental_deltas=[
1744
+ (
1745
+ pa.Table.from_arrays(
1746
+ [
1747
+ pa.array([str(i) for i in range(10)]),
1748
+ pa.array([i for i in range(20, 30)]),
1749
+ pa.array(["foo"] * 10),
1750
+ pa.array([i / 10 for i in range(40, 50)]),
1751
+ ],
1752
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1753
+ ),
1754
+ DeltaType.UPSERT,
1755
+ None,
1756
+ ),
1757
+ (
1758
+ pa.Table.from_arrays(
1759
+ [
1760
+ pa.array([i / 10 for i in range(0, 3)]),
1761
+ ],
1762
+ names=["col_1"],
1763
+ ),
1764
+ DeltaType.DELETE,
1765
+ DeleteParameters.of(["col_1"]),
1766
+ ),
1767
+ (
1768
+ pa.Table.from_arrays(
1769
+ [
1770
+ pa.array([i / 10 for i in range(43, 45)]),
1771
+ ],
1772
+ names=["col_1"],
1773
+ ),
1774
+ DeltaType.DELETE,
1775
+ DeleteParameters.of(["col_1"]),
1776
+ ),
1777
+ (
1778
+ pa.Table.from_arrays(
1779
+ [
1780
+ pa.array([i for i in range(20, 25)]),
1781
+ ],
1782
+ names=["sk_col_1"],
1783
+ ),
1784
+ DeltaType.DELETE,
1785
+ DeleteParameters.of(["sk_col_1"]),
1786
+ ),
1787
+ ],
1788
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1789
+ [
1790
+ pa.array([str(i) for i in range(5, 10)]),
1791
+ pa.array([i for i in range(25, 30)]),
1792
+ pa.array(["foo"] * 5),
1793
+ pa.array([i / 10 for i in range(45, 50)]),
1794
+ ],
1795
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
1796
+ ),
1797
+ expected_terminal_exception=None,
1798
+ do_create_placement_group=False,
1799
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1800
+ hash_bucket_count=1,
1801
+ read_kwargs_provider=None,
1802
+ drop_duplicates=True,
1803
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1804
+ ),
1805
+ "29-rebase-then-incremental-delete-type-delta-no-pk-compactor": RebaseThenIncrementalCompactionTestCaseParams(  # NOTE(review): input_deltas has three rows while rebase_expected_compact_partition_result lists each of them twice; presumably the harness commits the input more than once and the no-primary-key path keeps every copy -- TODO confirm against the test driver
1806
+ primary_keys=ZERO_VALUED_PRIMARY_KEY,
1807
+ sort_keys=[
1808
+ SortKey.of(key_name="sk_col_1"),
1809
+ ],
1810
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1811
+ partition_values=["1"],
1812
+ input_deltas=pa.Table.from_arrays(
1813
+ [
1814
+ pa.array([1, 2, 3]),
1815
+ pa.array([1.0, 2.0, 3.0]),
1816
+ ],
1817
+ names=["sk_col_1", "col_1"],
1818
+ ),
1819
+ input_deltas_delta_type=DeltaType.UPSERT,
1820
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1821
+ [
1822
+ pa.array([1, 1, 2, 2, 3, 3]),
1823
+ pa.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0]),
1824
+ ],
1825
+ names=["sk_col_1", "col_1"],
1826
+ ),
1827
+ incremental_deltas=[
1828
+ (
1829
+ pa.Table.from_arrays(
1830
+ [
1831
+ pa.array([3.0]),
1832
+ ],
1833
+ names=["col_1"],
1834
+ ),
1835
+ DeltaType.DELETE,
1836
+ DeleteParameters.of(["col_1"]),
1837
+ ),
1838
+ (
1839
+ pa.Table.from_arrays(
1840
+ [
1841
+ pa.array([4, 5, 6]),
1842
+ pa.array([10.0, 11.0, 12.0]),
1843
+ ],
1844
+ names=["sk_col_1", "col_1"],
1845
+ ),
1846
+ DeltaType.UPSERT,
1847
+ None,
1848
+ ),
1849
+ (
1850
+ pa.Table.from_arrays(
1851
+ [
1852
+ pa.array([6]),
1853
+ ],
1854
+ names=["sk_col_1"],
1855
+ ),
1856
+ DeltaType.DELETE,
1857
+ DeleteParameters.of(["sk_col_1"]),
1858
+ ),
1859
+ ],
1860
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1861
+ [
1862
+ pa.array([1, 1, 2, 2, 4, 5]),
1863
+ pa.array([1.0, 1.0, 2.0, 2.0, 10.0, 11.0]),
1864
+ ],
1865
+ names=["sk_col_1", "col_1"],
1866
+ ),
1867
+ expected_terminal_exception=None,
1868
+ do_create_placement_group=False,
1869
+ records_per_compacted_file=10,
1870
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1871
+ read_kwargs_provider=None,
1872
+ drop_duplicates=True,
1873
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
1874
+ ),
1875
+ "30-rebase-then-incremental-delete-type-delta-on-incremental-compactor-v1-v2": RebaseThenIncrementalCompactionTestCaseParams(
1876
+ primary_keys={"pk_col_1"},
1877
+ sort_keys=ZERO_VALUED_SORT_KEY,
1878
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
1879
+ partition_values=["1"],
1880
+ input_deltas=pa.Table.from_arrays(
1881
+ [
1882
+ pa.array([i for i in range(12)]),
1883
+ pa.array([str(i) for i in range(0, 12)]),
1884
+ ],
1885
+ names=["pk_col_1", "col_1"],
1886
+ ),
1887
+ input_deltas_delta_type=DeltaType.UPSERT,
1888
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
1889
+ [
1890
+ pa.array([i for i in range(12)]),
1891
+ pa.array([str(i) for i in range(0, 12)]),
1892
+ ],
1893
+ names=["pk_col_1", "col_1"],
1894
+ ),
1895
+ incremental_deltas=[
1896
+ (
1897
+ pa.Table.from_arrays(
1898
+ [
1899
+ pa.array([10, 11]),
1900
+ pa.array(["10", "11"]),
1901
+ ],
1902
+ names=["pk_col_1", "col_1"],
1903
+ ),
1904
+ DeltaType.DELETE,
1905
+ DeleteParameters.of(["pk_col_1", "col_1"]),
1906
+ ),
1907
+ ],
1908
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
1909
+ [
1910
+ pa.array([i for i in range(10)]),
1911
+ pa.array([str(i) for i in range(0, 10)]),
1912
+ ],
1913
+ names=["pk_col_1", "col_1"],
1914
+ ),
1915
+ expected_terminal_exception=None,
1916
+ do_create_placement_group=False,
1917
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
1918
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
1919
+ read_kwargs_provider=None,
1920
+ drop_duplicates=True,
1921
+ skip_enabled_compact_partition_drivers=None,
1922
+ ),
1923
+ }
1924
+
1925
+ REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(  # rebinds the name to whatever the helper returns; the helper is defined outside this chunk, presumably expanding each case per compactor version -- TODO confirm
1926
+ {
1927
+ **REBASE_THEN_INCREMENTAL_TEST_CASES,  # existing cases are unpacked first
1928
+ **REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES,  # unpacked last, so on a duplicate key the delete-delta-type case replaces the earlier one
1929
+ },
1930
+ )