deltacat 1.1.14__py3-none-any.whl → 1.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +3 -2
  3. deltacat/compute/compactor/model/compact_partition_params.py +11 -1
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +2 -2
  5. deltacat/compute/compactor/model/delta_annotated.py +2 -4
  6. deltacat/compute/compactor/steps/hash_bucket.py +2 -3
  7. deltacat/compute/compactor_v2/compaction_session.py +26 -27
  8. deltacat/compute/compactor_v2/constants.py +4 -0
  9. deltacat/compute/compactor_v2/private/compaction_utils.py +103 -66
  10. deltacat/compute/compactor_v2/steps/merge.py +0 -3
  11. deltacat/compute/compactor_v2/utils/delta.py +2 -3
  12. deltacat/compute/compactor_v2/utils/io.py +0 -2
  13. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  14. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +855 -0
  15. deltacat/tests/compute/compactor_v2/test_compaction_session.py +1 -1
  16. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +330 -0
  17. deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
  18. deltacat/tests/compute/test_util_create_table_deltas_repo.py +118 -0
  19. deltacat/tests/local_deltacat_storage/__init__.py +8 -5
  20. {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/METADATA +1 -1
  21. {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/RECORD +24 -22
  22. {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/LICENSE +0 -0
  23. {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/WHEEL +0 -0
  24. {deltacat-1.1.14.dist-info → deltacat-1.1.15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,855 @@
1
+ import pyarrow as pa
2
+ from typing import Callable, List, Optional, Set, Union
3
+ from deltacat.utils.common import ReadKwargsProvider
4
+ from deltacat.tests.compute.test_util_common import (
5
+ PartitionKey,
6
+ PartitionKeyType,
7
+ )
8
+ from deltacat.tests.compute.test_util_constant import (
9
+ DEFAULT_MAX_RECORDS_PER_FILE,
10
+ DEFAULT_HASH_BUCKET_COUNT,
11
+ )
12
+ from dataclasses import dataclass, fields
13
+
14
+ from deltacat.exceptions import ValidationError
15
+
16
+ from deltacat.storage import (
17
+ DeltaType,
18
+ DeleteParameters,
19
+ )
20
+
21
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
22
+
23
+ from deltacat.storage.model.sort_key import SortKey
24
+
25
+ from deltacat.tests.compute.compact_partition_test_cases import (
26
+ with_compactor_version_func_test_param,
27
+ ZERO_VALUED_SORT_KEY,
28
+ )
29
+
30
+
31
+ @dataclass(frozen=True)
32
+ class MultipleRoundsTestCaseParams:
33
+ """
34
+ A pytest parameterized test case for running the `compact_partition` function over multiple rounds.
35
+
36
+ Args:
37
+ primary_keys: Set[str] - argument for the primary_keys parameter in compact_partition. Also needed for table/delta creation
38
+ sort_keys: List[Optional[SortKey]] - argument for the sort_keys parameter in compact_partition. Also needed for table/delta creation
39
+ partition_keys: Optional[List[PartitionKey]] - argument for the partition_keys parameter. Needed for table/delta creation
40
+ partition_values: List[Optional[str]] - argument for the partition_values parameter. Needed for table/delta creation
41
+ input_deltas: Union[List[pa.Array], DeltaType, DeleteParameters] - argument required for delta creation during compact_partition test setup. The cases defined in this file pass a list of (pa.Table, DeltaType, Optional[DeleteParameters]) tuples, one tuple per incoming delta
42
+ expected_terminal_compact_partition_result: pa.Table - expected PyArrow table after compaction (i.e., the state of the table after applying all row UPDATES/DELETES/INSERTS)
43
+ expected_terminal_exception: BaseException - exception expected during compaction. The cases defined in this file pass an exception class, or None when no exception is expected
44
+ expected_terminal_exception_message: Optional[str] - expected exception message if present.
45
+ do_create_placement_group: bool - toggles whether to create a placement group (https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html) or not
46
+ records_per_compacted_file: int - argument for the records_per_compacted_file parameter in compact_partition
47
+ hash_bucket_count: int - argument for the hash_bucket_count parameter in compact_partition
48
+ read_kwargs_provider: Optional[ReadKwargsProvider] - argument for read_kwargs_provider parameter in compact_partition. If None then no ReadKwargsProvider is provided to compact_partition_params
49
+ drop_duplicates: bool - argument for drop_duplicates parameter in compact_partition. Only recognized by compactor v2.
50
+ skip_enabled_compact_partition_drivers: List[CompactorVersion] - skip whatever enabled_compact_partition_drivers are included in this list
51
+ assert_compaction_audit: Optional[Callable] - argument that asserts compaction_audit is updated only if compactor_version is v2.
52
+ rebase_expected_compact_partition_result: pa.Table - expected table after rebase compaction runs. An output that is asserted on in Rebase unit tests
53
+ num_rounds: int - parameter that specifies the number of rounds of compaction (how many batches of uniform deltas to make). This dataclass defines no default, so every case must set it explicitly
54
+ """
55
+
56
+ primary_keys: Set[str]
57
+ sort_keys: List[Optional[SortKey]]
58
+ partition_keys: Optional[List[PartitionKey]]
59
+ partition_values: List[Optional[str]]
60
+ input_deltas: Union[List[pa.Array], DeltaType, DeleteParameters]
61
+ expected_terminal_compact_partition_result: pa.Table
62
+ expected_terminal_exception: BaseException
63
+ expected_terminal_exception_message: Optional[str]
64
+ do_create_placement_group: bool
65
+ records_per_compacted_file: int
66
+ hash_bucket_count: int
67
+ read_kwargs_provider: Optional[ReadKwargsProvider]
68
+ drop_duplicates: bool
69
+ skip_enabled_compact_partition_drivers: List[CompactorVersion]
70
+ assert_compaction_audit: Optional[Callable]
71
+ rebase_expected_compact_partition_result: pa.Table
72
+ num_rounds: int
73
+
74
+ # makes MultipleRoundsTestCaseParams iterable (yielding the field values in declaration order), which is required to build the list of pytest.param values to pass to pytest.mark.parametrize
75
+ def __iter__(self):
76
+ return (getattr(self, field.name) for field in fields(self))
77
+
78
+
79
+ MULTIPLE_ROUNDS_TEST_CASES = {
80
+ # 4 input deltas that are identical, 2 rounds requested.
81
+ # Expect to see a table that aggregates 40 records across the 2 rounds
82
+ # (dropDuplicates = False)
83
+ "1-multiple-rounds-sanity": MultipleRoundsTestCaseParams(
84
+ primary_keys={"pk_col_1"},
85
+ sort_keys=[
86
+ SortKey.of(key_name="sk_col_1"),
87
+ SortKey.of(key_name="sk_col_2"),
88
+ ],
89
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
90
+ partition_values=["1"],
91
+ input_deltas=[
92
+ (
93
+ pa.Table.from_arrays(
94
+ [
95
+ pa.array([str(i) for i in range(10)]),
96
+ pa.array([i for i in range(0, 10)]),
97
+ pa.array(["foo"] * 10),
98
+ pa.array([i / 10 for i in range(0, 10)]),
99
+ ],
100
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
101
+ ),
102
+ DeltaType.UPSERT,
103
+ None,
104
+ ),
105
+ (
106
+ pa.Table.from_arrays(
107
+ [
108
+ pa.array([str(i) for i in range(10)]),
109
+ pa.array([i for i in range(0, 10)]),
110
+ pa.array(["foo"] * 10),
111
+ pa.array([i / 10 for i in range(0, 10)]),
112
+ ],
113
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
114
+ ),
115
+ DeltaType.UPSERT,
116
+ None,
117
+ ),
118
+ (
119
+ pa.Table.from_arrays(
120
+ [
121
+ pa.array([str(i) for i in range(10)]),
122
+ pa.array([i for i in range(0, 10)]),
123
+ pa.array(["foo"] * 10),
124
+ pa.array([i / 10 for i in range(0, 10)]),
125
+ ],
126
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
127
+ ),
128
+ DeltaType.UPSERT,
129
+ None,
130
+ ),
131
+ (
132
+ pa.Table.from_arrays(
133
+ [
134
+ pa.array([str(i) for i in range(10)]),
135
+ pa.array([i for i in range(0, 10)]),
136
+ pa.array(["foo"] * 10),
137
+ pa.array([i / 10 for i in range(0, 10)]),
138
+ ],
139
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
140
+ ),
141
+ DeltaType.UPSERT,
142
+ None,
143
+ ),
144
+ ],
145
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
146
+ [
147
+ pa.array([str(i) for i in range(0, 10)] * 4),
148
+ pa.array([i for i in range(0, 10)] * 4),
149
+ pa.array(["foo"] * 40),
150
+ pa.array([i / 10 for i in range(0, 10)] * 4),
151
+ ],
152
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
153
+ ),
154
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
155
+ [
156
+ pa.array([str(i) for i in range(0, 10)] * 4),
157
+ pa.array([i for i in range(0, 10)] * 4),
158
+ pa.array(["foo"] * 40),
159
+ pa.array([i / 10 for i in range(0, 10)] * 4),
160
+ ],
161
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
162
+ ),
163
+ expected_terminal_exception=None,
164
+ expected_terminal_exception_message=None,
165
+ do_create_placement_group=False,
166
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
167
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
168
+ read_kwargs_provider=None,
169
+ drop_duplicates=False,
170
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
171
+ assert_compaction_audit=None,
172
+ num_rounds=2,
173
+ ),
174
+ # 4 input deltas that are unique, 2 rounds requested.
175
+ # Expect to see a table that aggregates 40 unique records across the 2 rounds
176
+ # (dropDuplicates = False)
177
+ "2-multiple-rounds-unique-values": MultipleRoundsTestCaseParams(
178
+ primary_keys={"pk_col_1"},
179
+ sort_keys=[
180
+ SortKey.of(key_name="sk_col_1"),
181
+ SortKey.of(key_name="sk_col_2"),
182
+ ],
183
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
184
+ partition_values=["1"],
185
+ input_deltas=[
186
+ (
187
+ pa.Table.from_arrays(
188
+ [
189
+ pa.array([str(i) for i in range(10)]),
190
+ pa.array([i for i in range(0, 10)]),
191
+ pa.array(["foo"] * 10),
192
+ pa.array([i / 10 for i in range(0, 10)]),
193
+ ],
194
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
195
+ ),
196
+ DeltaType.UPSERT,
197
+ None,
198
+ ),
199
+ (
200
+ pa.Table.from_arrays(
201
+ [
202
+ pa.array([str(i) for i in range(10, 20)]),
203
+ pa.array([i for i in range(0, 10)]),
204
+ pa.array(["bar"] * 10),
205
+ pa.array([i / 10 for i in range(10, 20)]),
206
+ ],
207
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
208
+ ),
209
+ DeltaType.UPSERT,
210
+ None,
211
+ ),
212
+ (
213
+ pa.Table.from_arrays(
214
+ [
215
+ pa.array([str(i) for i in range(20, 30)]),
216
+ pa.array([i for i in range(0, 10)]),
217
+ pa.array(["foo"] * 10),
218
+ pa.array([i / 10 for i in range(20, 30)]),
219
+ ],
220
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
221
+ ),
222
+ DeltaType.UPSERT,
223
+ None,
224
+ ),
225
+ (
226
+ pa.Table.from_arrays(
227
+ [
228
+ pa.array([str(i) for i in range(30, 40)]),
229
+ pa.array([i for i in range(0, 10)]),
230
+ pa.array(["foo"] * 10),
231
+ pa.array([i / 10 for i in range(30, 40)]),
232
+ ],
233
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
234
+ ),
235
+ DeltaType.UPSERT,
236
+ None,
237
+ ),
238
+ ],
239
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
240
+ [
241
+ pa.array([str(i) for i in range(0, 40)]),
242
+ pa.array([i for i in range(0, 10)] * 4),
243
+ pa.array(["foo"] * 10 + ["bar"] * 10 + ["foo"] * 20),
244
+ pa.array([i / 10 for i in range(0, 40)]),
245
+ ],
246
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
247
+ ),
248
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
249
+ [
250
+ pa.array([str(i) for i in range(0, 40)]),
251
+ pa.array([i for i in range(0, 10)] * 4),
252
+ pa.array(["foo"] * 10 + ["bar"] * 10 + ["foo"] * 20),
253
+ pa.array([i / 10 for i in range(0, 40)]),
254
+ ],
255
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
256
+ ),
257
+ expected_terminal_exception=None,
258
+ expected_terminal_exception_message=None,
259
+ do_create_placement_group=False,
260
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
261
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
262
+ read_kwargs_provider=None,
263
+ drop_duplicates=False,
264
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
265
+ assert_compaction_audit=None,
266
+ num_rounds=2,
267
+ ),
268
+ # Tests the assertion that the num_rounds passed in is less than
269
+ # len(uniform_deltas): 15 rounds are requested for 4 input deltas, so a ValidationError is expected.
270
+ "3-num-rounds-greater-than-deltas-count": MultipleRoundsTestCaseParams(
271
+ primary_keys={"pk_col_1"},
272
+ sort_keys=[
273
+ SortKey.of(key_name="sk_col_1"),
274
+ SortKey.of(key_name="sk_col_2"),
275
+ ],
276
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
277
+ partition_values=["1"],
278
+ input_deltas=[
279
+ (
280
+ pa.Table.from_arrays(
281
+ [
282
+ pa.array([str(i) for i in range(10)]),
283
+ pa.array([i for i in range(0, 10)]),
284
+ pa.array(["foo"] * 10),
285
+ pa.array([i / 10 for i in range(0, 10)]),
286
+ ],
287
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
288
+ ),
289
+ DeltaType.UPSERT,
290
+ None,
291
+ ),
292
+ (
293
+ pa.Table.from_arrays(
294
+ [
295
+ pa.array([str(i) for i in range(10, 20)]),
296
+ pa.array([i for i in range(0, 10)]),
297
+ pa.array(["foo"] * 10),
298
+ pa.array([i / 10 for i in range(10, 20)]),
299
+ ],
300
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
301
+ ),
302
+ DeltaType.UPSERT,
303
+ None,
304
+ ),
305
+ (
306
+ pa.Table.from_arrays(
307
+ [
308
+ pa.array([str(i) for i in range(20, 30)]),
309
+ pa.array([i for i in range(0, 10)]),
310
+ pa.array(["foo"] * 10),
311
+ pa.array([i / 10 for i in range(20, 30)]),
312
+ ],
313
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
314
+ ),
315
+ DeltaType.UPSERT,
316
+ None,
317
+ ),
318
+ (
319
+ pa.Table.from_arrays(
320
+ [
321
+ pa.array([str(i) for i in range(30, 40)]),
322
+ pa.array([i for i in range(0, 10)]),
323
+ pa.array(["foo"] * 10),
324
+ pa.array([i / 10 for i in range(30, 40)]),
325
+ ],
326
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
327
+ ),
328
+ DeltaType.UPSERT,
329
+ None,
330
+ ),
331
+ ],
332
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
333
+ [
334
+ pa.array([str(i) for i in range(0, 40)]),
335
+ pa.array([i for i in range(0, 10)] * 4),
336
+ pa.array(["foo"] * 40),
337
+ pa.array([i / 10 for i in range(0, 40)]),
338
+ ],
339
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
340
+ ),
341
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
342
+ [
343
+ pa.array([str(i) for i in range(0, 40)]),
344
+ pa.array([i for i in range(0, 10)] * 4),
345
+ pa.array(["foo"] * 40),
346
+ pa.array([i / 10 for i in range(0, 40)]),
347
+ ],
348
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
349
+ ),
350
+ expected_terminal_exception=ValidationError,
351
+ expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
352
+ do_create_placement_group=False,
353
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
354
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
355
+ read_kwargs_provider=None,
356
+ drop_duplicates=False,
357
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
358
+ assert_compaction_audit=None,
359
+ num_rounds=15,
360
+ ),
361
+ # 4 input deltas that are unique, 2 rounds requested.
362
+ # Expect to see a table that aggregates 40 records across the 2 rounds
363
+ # (dropDuplicates = False), hb_count = 1
364
+ "4-multiple-rounds-hb-count-equals-1": MultipleRoundsTestCaseParams(
365
+ primary_keys={"pk_col_1"},
366
+ sort_keys=[
367
+ SortKey.of(key_name="sk_col_1"),
368
+ SortKey.of(key_name="sk_col_2"),
369
+ ],
370
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
371
+ partition_values=["1"],
372
+ input_deltas=[
373
+ (
374
+ pa.Table.from_arrays(
375
+ [
376
+ pa.array([str(i) for i in range(10)]),
377
+ pa.array([i for i in range(0, 10)]),
378
+ pa.array(["foo"] * 10),
379
+ pa.array([i / 10 for i in range(0, 10)]),
380
+ ],
381
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
382
+ ),
383
+ DeltaType.UPSERT,
384
+ None,
385
+ ),
386
+ (
387
+ pa.Table.from_arrays(
388
+ [
389
+ pa.array([str(i) for i in range(10, 20)]),
390
+ pa.array([i for i in range(0, 10)]),
391
+ pa.array(["foo"] * 10),
392
+ pa.array([i / 10 for i in range(10, 20)]),
393
+ ],
394
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
395
+ ),
396
+ DeltaType.UPSERT,
397
+ None,
398
+ ),
399
+ (
400
+ pa.Table.from_arrays(
401
+ [
402
+ pa.array([str(i) for i in range(20, 30)]),
403
+ pa.array([i for i in range(0, 10)]),
404
+ pa.array(["foo"] * 10),
405
+ pa.array([i / 10 for i in range(20, 30)]),
406
+ ],
407
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
408
+ ),
409
+ DeltaType.UPSERT,
410
+ None,
411
+ ),
412
+ (
413
+ pa.Table.from_arrays(
414
+ [
415
+ pa.array([str(i) for i in range(30, 40)]),
416
+ pa.array([i for i in range(0, 10)]),
417
+ pa.array(["foo"] * 10),
418
+ pa.array([i / 10 for i in range(30, 40)]),
419
+ ],
420
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
421
+ ),
422
+ DeltaType.UPSERT,
423
+ None,
424
+ ),
425
+ ],
426
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
427
+ [
428
+ pa.array([str(i) for i in range(0, 40)]),
429
+ pa.array([i for i in range(0, 10)] * 4),
430
+ pa.array(["foo"] * 40),
431
+ pa.array([i / 10 for i in range(0, 40)]),
432
+ ],
433
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
434
+ ),
435
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
436
+ [
437
+ pa.array([str(i) for i in range(0, 40)]),
438
+ pa.array([i for i in range(0, 10)] * 4),
439
+ pa.array(["foo"] * 40),
440
+ pa.array([i / 10 for i in range(0, 40)]),
441
+ ],
442
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
443
+ ),
444
+ expected_terminal_exception=None,
445
+ expected_terminal_exception_message=None,
446
+ do_create_placement_group=False,
447
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
448
+ hash_bucket_count=1,
449
+ read_kwargs_provider=None,
450
+ drop_duplicates=False,
451
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
452
+ assert_compaction_audit=None,
453
+ num_rounds=2,
454
+ ),
455
+ # Tests the assertion that ensures we are running multiple rounds only when
456
+ # drop_duplicates is False (rebase). Running backfill on multiple rounds
457
+ # is currently not supported.
458
+ "5-multiple-rounds-only-supports-rebase": MultipleRoundsTestCaseParams(
459
+ primary_keys={"pk_col_1"},
460
+ sort_keys=[
461
+ SortKey.of(key_name="sk_col_1"),
462
+ SortKey.of(key_name="sk_col_2"),
463
+ ],
464
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
465
+ partition_values=["1"],
466
+ input_deltas=[
467
+ (
468
+ pa.Table.from_arrays(
469
+ [
470
+ pa.array([str(i) for i in range(10)]),
471
+ pa.array([i for i in range(0, 10)]),
472
+ pa.array(["foo"] * 10),
473
+ pa.array([i / 10 for i in range(0, 10)]),
474
+ ],
475
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
476
+ ),
477
+ DeltaType.UPSERT,
478
+ None,
479
+ ),
480
+ (
481
+ pa.Table.from_arrays(
482
+ [
483
+ pa.array([str(i) for i in range(10, 20)]),
484
+ pa.array([i for i in range(0, 10)]),
485
+ pa.array(["foo"] * 10),
486
+ pa.array([i / 10 for i in range(10, 20)]),
487
+ ],
488
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
489
+ ),
490
+ DeltaType.UPSERT,
491
+ None,
492
+ ),
493
+ (
494
+ pa.Table.from_arrays(
495
+ [
496
+ pa.array([str(i) for i in range(20, 30)]),
497
+ pa.array([i for i in range(0, 10)]),
498
+ pa.array(["foo"] * 10),
499
+ pa.array([i / 10 for i in range(20, 30)]),
500
+ ],
501
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
502
+ ),
503
+ DeltaType.UPSERT,
504
+ None,
505
+ ),
506
+ (
507
+ pa.Table.from_arrays(
508
+ [
509
+ pa.array([str(i) for i in range(30, 40)]),
510
+ pa.array([i for i in range(0, 10)]),
511
+ pa.array(["foo"] * 10),
512
+ pa.array([i / 10 for i in range(30, 40)]),
513
+ ],
514
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
515
+ ),
516
+ DeltaType.UPSERT,
517
+ None,
518
+ ),
519
+ ],
520
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
521
+ [
522
+ pa.array([str(i) for i in range(0, 40)]),
523
+ pa.array([i for i in range(0, 10)] * 4),
524
+ pa.array(["foo"] * 40),
525
+ pa.array([i / 10 for i in range(0, 40)]),
526
+ ],
527
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
528
+ ),
529
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
530
+ [
531
+ pa.array([str(i) for i in range(0, 40)]),
532
+ pa.array([i for i in range(0, 10)] * 4),
533
+ pa.array(["foo"] * 40),
534
+ pa.array([i / 10 for i in range(0, 40)]),
535
+ ],
536
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
537
+ ),
538
+ expected_terminal_exception=ValidationError,
539
+ expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
540
+ do_create_placement_group=False,
541
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
542
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
543
+ read_kwargs_provider=None,
544
+ drop_duplicates=True,
545
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
546
+ assert_compaction_audit=None,
547
+ num_rounds=2,
548
+ ),
549
+ # 4 input deltas that are identical, 2 rounds requested.
550
+ # Expect to see a table that aggregates 40 records across the 2 rounds
551
+ # (dropDuplicates = False), tests placement group parameter functionality
552
+ # (do_create_placement_group = True)
553
+ "6-multiple-rounds-test-pgm": MultipleRoundsTestCaseParams(
554
+ primary_keys={"pk_col_1"},
555
+ sort_keys=[
556
+ SortKey.of(key_name="sk_col_1"),
557
+ SortKey.of(key_name="sk_col_2"),
558
+ ],
559
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
560
+ partition_values=["1"],
561
+ input_deltas=[
562
+ (
563
+ pa.Table.from_arrays(
564
+ [
565
+ pa.array([str(i) for i in range(10)]),
566
+ pa.array([i for i in range(0, 10)]),
567
+ pa.array(["foo"] * 10),
568
+ pa.array([i / 10 for i in range(0, 10)]),
569
+ ],
570
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
571
+ ),
572
+ DeltaType.UPSERT,
573
+ None,
574
+ ),
575
+ (
576
+ pa.Table.from_arrays(
577
+ [
578
+ pa.array([str(i) for i in range(10)]),
579
+ pa.array([i for i in range(0, 10)]),
580
+ pa.array(["foo"] * 10),
581
+ pa.array([i / 10 for i in range(0, 10)]),
582
+ ],
583
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
584
+ ),
585
+ DeltaType.UPSERT,
586
+ None,
587
+ ),
588
+ (
589
+ pa.Table.from_arrays(
590
+ [
591
+ pa.array([str(i) for i in range(10)]),
592
+ pa.array([i for i in range(0, 10)]),
593
+ pa.array(["foo"] * 10),
594
+ pa.array([i / 10 for i in range(0, 10)]),
595
+ ],
596
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
597
+ ),
598
+ DeltaType.UPSERT,
599
+ None,
600
+ ),
601
+ (
602
+ pa.Table.from_arrays(
603
+ [
604
+ pa.array([str(i) for i in range(10)]),
605
+ pa.array([i for i in range(0, 10)]),
606
+ pa.array(["foo"] * 10),
607
+ pa.array([i / 10 for i in range(0, 10)]),
608
+ ],
609
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
610
+ ),
611
+ DeltaType.UPSERT,
612
+ None,
613
+ ),
614
+ ],
615
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
616
+ [
617
+ pa.array([str(i) for i in range(0, 10)] * 4),
618
+ pa.array([i for i in range(0, 10)] * 4),
619
+ pa.array(["foo"] * 40),
620
+ pa.array([i / 10 for i in range(0, 10)] * 4),
621
+ ],
622
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
623
+ ),
624
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
625
+ [
626
+ pa.array([str(i) for i in range(0, 10)] * 4),
627
+ pa.array([i for i in range(0, 10)] * 4),
628
+ pa.array(["foo"] * 40),
629
+ pa.array([i / 10 for i in range(0, 10)] * 4),
630
+ ],
631
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
632
+ ),
633
+ expected_terminal_exception=None,
634
+ expected_terminal_exception_message=None,
635
+ do_create_placement_group=True,
636
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
637
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
638
+ read_kwargs_provider=None,
639
+ drop_duplicates=False,
640
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
641
+ assert_compaction_audit=None,
642
+ num_rounds=2,
643
+ ),
644
+ # 4 input deltas (3 upsert, 1 delete delta), 2 rounds requested
645
+ # Expect to see a table that aggregates 10 records total
646
+ # (12 upserts - 2 deletes = 10 records)
647
+ # (dropDuplicates = False)
648
+ "7-multiple-rounds-delete-deltas": MultipleRoundsTestCaseParams(
649
+ primary_keys={"pk_col_1"},
650
+ sort_keys=ZERO_VALUED_SORT_KEY,
651
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
652
+ partition_values=["1"],
653
+ input_deltas=[
654
+ (
655
+ pa.Table.from_arrays(
656
+ [
657
+ pa.array([10, 11, 12, 13]),
658
+ pa.array(["a", "b", "c", "d"]),
659
+ ],
660
+ names=["pk_col_1", "col_1"],
661
+ ),
662
+ DeltaType.UPSERT,
663
+ None,
664
+ ),
665
+ (
666
+ pa.Table.from_arrays(
667
+ [
668
+ pa.array([14, 15, 16, 17]),
669
+ pa.array(["e", "f", "g", "h"]),
670
+ ],
671
+ names=["pk_col_1", "col_1"],
672
+ ),
673
+ DeltaType.UPSERT,
674
+ None,
675
+ ),
676
+ (
677
+ pa.Table.from_arrays(
678
+ [
679
+ pa.array([18, 19, 20, 21]),
680
+ pa.array(["i", "j", "k", "l"]),
681
+ ],
682
+ names=["pk_col_1", "col_1"],
683
+ ),
684
+ DeltaType.UPSERT,
685
+ None,
686
+ ),
687
+ (
688
+ pa.Table.from_arrays(
689
+ [pa.array([10, 11]), pa.array(["a", "b"])],
690
+ names=["pk_col_1", "col_1"],
691
+ ),
692
+ DeltaType.DELETE,
693
+ DeleteParameters.of(["pk_col_1", "col_1"]),
694
+ ),
695
+ ],
696
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
697
+ [
698
+ pa.array([i for i in range(12, 22)]),
699
+ pa.array(["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]),
700
+ ],
701
+ names=["pk_col_1", "col_1"],
702
+ ),
703
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
704
+ [
705
+ pa.array([i for i in range(12, 22)]),
706
+ pa.array(["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]),
707
+ ],
708
+ names=["pk_col_1", "col_1"],
709
+ ),
710
+ expected_terminal_exception=None,
711
+ expected_terminal_exception_message=None,
712
+ do_create_placement_group=False,
713
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
714
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
715
+ read_kwargs_provider=None,
716
+ drop_duplicates=False,
717
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
718
+ assert_compaction_audit=None,
719
+ num_rounds=2,
720
+ ),
721
+ # 6 input deltas (4 upsert, 2 delete deltas), 3 rounds requested
722
+ # Testing multiple delete deltas in between upserts with odd
723
+ # number of rounds requested
724
+ # (dropDuplicates = False)
725
+ "8-multiple-rounds-multiple-delete-deltas": MultipleRoundsTestCaseParams(
726
+ primary_keys={"pk_col_1"},
727
+ sort_keys=ZERO_VALUED_SORT_KEY,
728
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
729
+ partition_values=["1"],
730
+ input_deltas=[
731
+ (
732
+ pa.Table.from_arrays(
733
+ [
734
+ pa.array([1, 2, 3, 4]),
735
+ pa.array(
736
+ ["iron man", "captain america", "black widow", "hulk"]
737
+ ),
738
+ ],
739
+ names=["pk_col_1", "col_1"],
740
+ ),
741
+ DeltaType.UPSERT,
742
+ None,
743
+ ),
744
+ (
745
+ pa.Table.from_arrays(
746
+ [
747
+ pa.array([5, 6, 7, 8]),
748
+ pa.array(["hawkeye", "thor", "star lord", "gamora"]),
749
+ ],
750
+ names=["pk_col_1", "col_1"],
751
+ ),
752
+ DeltaType.UPSERT,
753
+ None,
754
+ ),
755
+ (
756
+ pa.Table.from_arrays(
757
+ [pa.array([1, 3]), pa.array(["iron man", "black widow"])],
758
+ names=["pk_col_1", "col_1"],
759
+ ),
760
+ DeltaType.DELETE,
761
+ DeleteParameters.of(["pk_col_1", "col_1"]),
762
+ ),
763
+ (
764
+ pa.Table.from_arrays(
765
+ [pa.array([8]), pa.array(["gamora"])],
766
+ names=["pk_col_1", "col_1"],
767
+ ),
768
+ DeltaType.DELETE,
769
+ DeleteParameters.of(["pk_col_1", "col_1"]),
770
+ ),
771
+ (
772
+ pa.Table.from_arrays(
773
+ [
774
+ pa.array([9, 10, 11, 12]),
775
+ pa.array(["war machine", "scarlet witch", "vision", "falcon"]),
776
+ ],
777
+ names=["pk_col_1", "col_1"],
778
+ ),
779
+ DeltaType.UPSERT,
780
+ None,
781
+ ),
782
+ (
783
+ pa.Table.from_arrays(
784
+ [
785
+ pa.array([13, 14, 15, 16]),
786
+ pa.array(["ant man", "wasp", "rocket raccoon", "groot"]),
787
+ ],
788
+ names=["pk_col_1", "col_1"],
789
+ ),
790
+ DeltaType.UPSERT,
791
+ None,
792
+ ),
793
+ ],
794
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
795
+ [
796
+ pa.array([2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16]),
797
+ pa.array(
798
+ [
799
+ "captain america",
800
+ "hulk",
801
+ "hawkeye",
802
+ "thor",
803
+ "star lord",
804
+ "war machine",
805
+ "scarlet witch",
806
+ "vision",
807
+ "falcon",
808
+ "ant man",
809
+ "wasp",
810
+ "rocket raccoon",
811
+ "groot",
812
+ ]
813
+ ),
814
+ ],
815
+ names=["pk_col_1", "col_1"],
816
+ ),
817
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
818
+ [
819
+ pa.array([2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16]),
820
+ pa.array(
821
+ [
822
+ "captain america",
823
+ "hulk",
824
+ "hawkeye",
825
+ "thor",
826
+ "star lord",
827
+ "war machine",
828
+ "scarlet witch",
829
+ "vision",
830
+ "falcon",
831
+ "ant man",
832
+ "wasp",
833
+ "rocket raccoon",
834
+ "groot",
835
+ ]
836
+ ),
837
+ ],
838
+ names=["pk_col_1", "col_1"],
839
+ ),
840
+ expected_terminal_exception=None,
841
+ expected_terminal_exception_message=None,
842
+ do_create_placement_group=False,
843
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
844
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
845
+ read_kwargs_provider=None,
846
+ drop_duplicates=False,
847
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
848
+ assert_compaction_audit=None,
849
+ num_rounds=3,
850
+ ),
851
+ }
852
+
853
+ MULTIPLE_ROUNDS_TEST_CASES = with_compactor_version_func_test_param(  # rebinds the name to the helper's return value; presumably expands each case per compactor version -- confirm in compact_partition_test_cases
854
+ MULTIPLE_ROUNDS_TEST_CASES
855
+ )