deltacat 1.1.13__py3-none-any.whl → 1.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +3 -2
- deltacat/compute/compactor/model/compact_partition_params.py +11 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +2 -2
- deltacat/compute/compactor/model/delta_annotated.py +2 -4
- deltacat/compute/compactor/steps/hash_bucket.py +2 -3
- deltacat/compute/compactor_v2/compaction_session.py +26 -27
- deltacat/compute/compactor_v2/constants.py +4 -0
- deltacat/compute/compactor_v2/private/__init__.py +0 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +753 -0
- deltacat/compute/compactor_v2/steps/merge.py +0 -3
- deltacat/compute/compactor_v2/utils/delta.py +2 -3
- deltacat/compute/compactor_v2/utils/io.py +0 -2
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +855 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +1 -1
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +330 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -1
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +118 -0
- deltacat/tests/local_deltacat_storage/__init__.py +8 -5
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/METADATA +1 -1
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/RECORD +25 -21
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/LICENSE +0 -0
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/WHEEL +0 -0
- {deltacat-1.1.13.dist-info → deltacat-1.1.15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,855 @@
|
|
1
|
+
import pyarrow as pa
|
2
|
+
from typing import Callable, List, Optional, Set, Type, Union
|
3
|
+
from deltacat.utils.common import ReadKwargsProvider
|
4
|
+
from deltacat.tests.compute.test_util_common import (
|
5
|
+
PartitionKey,
|
6
|
+
PartitionKeyType,
|
7
|
+
)
|
8
|
+
from deltacat.tests.compute.test_util_constant import (
|
9
|
+
DEFAULT_MAX_RECORDS_PER_FILE,
|
10
|
+
DEFAULT_HASH_BUCKET_COUNT,
|
11
|
+
)
|
12
|
+
from dataclasses import dataclass, fields
|
13
|
+
|
14
|
+
from deltacat.exceptions import ValidationError
|
15
|
+
|
16
|
+
from deltacat.storage import (
|
17
|
+
DeltaType,
|
18
|
+
DeleteParameters,
|
19
|
+
)
|
20
|
+
|
21
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
22
|
+
|
23
|
+
from deltacat.storage.model.sort_key import SortKey
|
24
|
+
|
25
|
+
from deltacat.tests.compute.compact_partition_test_cases import (
|
26
|
+
with_compactor_version_func_test_param,
|
27
|
+
ZERO_VALUED_SORT_KEY,
|
28
|
+
)
|
29
|
+
|
30
|
+
|
31
|
+
@dataclass(frozen=True)
class MultipleRoundsTestCaseParams:
    """
    A pytest parameterized test case for running `compact_partition` over multiple rounds.

    Instances are immutable and iterable: iterating yields the field values in
    declaration order, so do not reorder the fields below without updating the
    consumers that unpack them positionally.

    Args:
        primary_keys: Set[str] - argument for the primary_keys parameter in compact_partition. Also needed for table/delta creation
        sort_keys: List[Optional[SortKey]] - argument for the sort_keys parameter in compact_partition. Also needed for table/delta creation
        partition_keys: Optional[List[PartitionKey]] - argument for the partition_keys parameter. Needed for table/delta creation
        partition_values: List[Optional[str]] - argument for the partition_values parameter. Needed for table/delta creation
        input_deltas: the incoming deltas required for delta creation during compact_partition test setup. Every test case in this module supplies a list of (pa.Table, DeltaType, Optional[DeleteParameters]) tuples
        expected_terminal_compact_partition_result: pa.Table - expected PyArrow table after compaction (i.e., the state of the table after applying all row UPDATES/DELETES/INSERTS)
        expected_terminal_exception: Optional[Type[BaseException]] - exception class expected to be raised during compaction, or None when compaction should succeed
        expected_terminal_exception_message: Optional[str] - expected exception message if present.
        do_create_placement_group: bool - toggles whether to create a placement group (https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html) or not
        records_per_compacted_file: int - argument for the records_per_compacted_file parameter in compact_partition
        hash_bucket_count: int - argument for the hash_bucket_count parameter in compact_partition
        read_kwargs_provider: Optional[ReadKwargsProvider] - argument for read_kwargs_provider parameter in compact_partition. If None then no ReadKwargsProvider is provided to compact_partition_params
        drop_duplicates: bool - argument for drop_duplicates parameter in compact_partition. Only recognized by compactor v2.
        skip_enabled_compact_partition_drivers: List[CompactorVersion] - skip whatever enabled_compact_partition_drivers are included in this list
        assert_compaction_audit: Optional[Callable] - argument that asserts compaction_audit is updated only if compactor_version is v2.
        rebase_expected_compact_partition_result: pa.Table - expected table after rebase compaction runs. An output that is asserted on in Rebase unit tests
        num_rounds: int - parameter that specifies the number of rounds of compaction (how many batches of uniform deltas to make). This dataclass declares no default, so every test case must set it explicitly
    """

    primary_keys: Set[str]
    sort_keys: List[Optional[SortKey]]
    partition_keys: Optional[List[PartitionKey]]
    partition_values: List[Optional[str]]
    # NOTE(review): the test cases in this module pass a list of
    # (pa.Table, DeltaType, Optional[DeleteParameters]) tuples, which this
    # annotation does not describe precisely - confirm before tightening it.
    input_deltas: Union[List[pa.Array], DeltaType, DeleteParameters]
    expected_terminal_compact_partition_result: pa.Table
    # Test cases pass either None or an exception *class* (e.g. ValidationError),
    # never an exception instance.
    expected_terminal_exception: Optional[Type[BaseException]]
    # None whenever no exception is expected.
    expected_terminal_exception_message: Optional[str]
    do_create_placement_group: bool
    records_per_compacted_file: int
    hash_bucket_count: int
    read_kwargs_provider: Optional[ReadKwargsProvider]
    drop_duplicates: bool
    skip_enabled_compact_partition_drivers: List[CompactorVersion]
    assert_compaction_audit: Optional[Callable]
    rebase_expected_compact_partition_result: pa.Table
    num_rounds: int

    # makes MultipleRoundsTestCaseParams iterable which is required to build the list of pytest.param values to pass to pytest.mark.parametrize
    def __iter__(self):
        """Return an iterator over the field values, in field declaration order."""
        return (getattr(self, field.name) for field in fields(self))
|
77
|
+
|
78
|
+
|
79
|
+
def _sorted_table(pk_values, sk_col_1_values, sk_col_2_values, col_1_values):
    """Build a table with a primary key, two sort-key columns and one value column."""
    return pa.Table.from_arrays(
        [
            pa.array(pk_values),
            pa.array(sk_col_1_values),
            pa.array(sk_col_2_values),
            pa.array(col_1_values),
        ],
        names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
    )


def _ten_row_table(first_pk, sk_col_2_value="foo"):
    """Build a 10-row sorted table whose primary keys are str(first_pk) .. str(first_pk + 9)."""
    pk_range = range(first_pk, first_pk + 10)
    return _sorted_table(
        [str(pk) for pk in pk_range],
        list(range(0, 10)),
        [sk_col_2_value] * 10,
        [pk / 10 for pk in pk_range],
    )


def _four_identical_batches_table():
    """Build the 40-row table made of the same 10 rows (primary keys "0".."9") repeated 4 times."""
    return _sorted_table(
        [str(pk) for pk in range(0, 10)] * 4,
        list(range(0, 10)) * 4,
        ["foo"] * 40,
        [pk / 10 for pk in range(0, 10)] * 4,
    )


def _forty_unique_rows_table(sk_col_2_values):
    """Build the 40-row table with primary keys "0".."39" and the given sk_col_2 column."""
    return _sorted_table(
        [str(pk) for pk in range(0, 40)],
        list(range(0, 10)) * 4,
        sk_col_2_values,
        [pk / 10 for pk in range(0, 40)],
    )


def _keyed_table(pk_values, col_1_values):
    """Build a two-column table (primary key and value) with no sort-key columns."""
    return pa.Table.from_arrays(
        [pa.array(pk_values), pa.array(col_1_values)],
        names=["pk_col_1", "col_1"],
    )


def _remaining_heroes_table():
    """Build the table expected by case 8 once both delete deltas have been applied."""
    return _keyed_table(
        [2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16],
        [
            "captain america",
            "hulk",
            "hawkeye",
            "thor",
            "star lord",
            "war machine",
            "scarlet witch",
            "vision",
            "falcon",
            "ant man",
            "wasp",
            "rocket raccoon",
            "groot",
        ],
    )


def _upsert(table):
    """Wrap a table as an UPSERT input delta (no delete parameters)."""
    return (table, DeltaType.UPSERT, None)


def _delete(table):
    """Wrap a table as a DELETE input delta matching on both pk_col_1 and col_1."""
    return (table, DeltaType.DELETE, DeleteParameters.of(["pk_col_1", "col_1"]))


MULTIPLE_ROUNDS_TEST_CASES = {
    # 4 identical input deltas (primary keys "0".."9" each), 2 rounds requested.
    # With drop_duplicates=False the expected table keeps all 40 records.
    "1-multiple-rounds-sanity": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=[
            SortKey.of(key_name="sk_col_1"),
            SortKey.of(key_name="sk_col_2"),
        ],
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[_upsert(_ten_row_table(0)) for _ in range(4)],
        rebase_expected_compact_partition_result=_four_identical_batches_table(),
        expected_terminal_compact_partition_result=_four_identical_batches_table(),
        expected_terminal_exception=None,
        expected_terminal_exception_message=None,
        do_create_placement_group=False,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
        read_kwargs_provider=None,
        drop_duplicates=False,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=2,
    ),
    # 4 input deltas with disjoint primary keys ("0".."39" overall), 2 rounds
    # requested. The second delta uses "bar" for sk_col_2, the others "foo".
    # With drop_duplicates=False the expected table holds all 40 unique records.
    "2-multiple-rounds-unique-values": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=[
            SortKey.of(key_name="sk_col_1"),
            SortKey.of(key_name="sk_col_2"),
        ],
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[
            _upsert(_ten_row_table(0)),
            _upsert(_ten_row_table(10, "bar")),
            _upsert(_ten_row_table(20)),
            _upsert(_ten_row_table(30)),
        ],
        rebase_expected_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 10 + ["bar"] * 10 + ["foo"] * 20
        ),
        expected_terminal_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 10 + ["bar"] * 10 + ["foo"] * 20
        ),
        expected_terminal_exception=None,
        expected_terminal_exception_message=None,
        do_create_placement_group=False,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
        read_kwargs_provider=None,
        drop_duplicates=False,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=2,
    ),
    # Testing the assert that checks num_rounds against len(uniform_deltas):
    # 15 rounds are requested for only 4 input deltas, so a ValidationError
    # is expected.
    "3-num-rounds-greater-than-deltas-count": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=[
            SortKey.of(key_name="sk_col_1"),
            SortKey.of(key_name="sk_col_2"),
        ],
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[
            _upsert(_ten_row_table(0)),
            _upsert(_ten_row_table(10)),
            _upsert(_ten_row_table(20)),
            _upsert(_ten_row_table(30)),
        ],
        rebase_expected_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 40
        ),
        expected_terminal_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 40
        ),
        expected_terminal_exception=ValidationError,
        expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
        do_create_placement_group=False,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
        read_kwargs_provider=None,
        drop_duplicates=False,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=15,
    ),
    # 4 input deltas with disjoint primary keys, 2 rounds requested, with
    # hash_bucket_count forced to 1.
    # With drop_duplicates=False the expected table holds all 40 records.
    "4-multiple-rounds-hb-count-equals-1": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=[
            SortKey.of(key_name="sk_col_1"),
            SortKey.of(key_name="sk_col_2"),
        ],
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[
            _upsert(_ten_row_table(0)),
            _upsert(_ten_row_table(10)),
            _upsert(_ten_row_table(20)),
            _upsert(_ten_row_table(30)),
        ],
        rebase_expected_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 40
        ),
        expected_terminal_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 40
        ),
        expected_terminal_exception=None,
        expected_terminal_exception_message=None,
        do_create_placement_group=False,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=1,
        read_kwargs_provider=None,
        drop_duplicates=False,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=2,
    ),
    # Testing the assert that ensures multiple rounds only run when
    # drop_duplicates is False (rebase). Running backfill on multiple rounds
    # is currently not supported, so drop_duplicates=True expects a
    # ValidationError.
    "5-multiple-rounds-only-supports-rebase": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=[
            SortKey.of(key_name="sk_col_1"),
            SortKey.of(key_name="sk_col_2"),
        ],
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[
            _upsert(_ten_row_table(0)),
            _upsert(_ten_row_table(10)),
            _upsert(_ten_row_table(20)),
            _upsert(_ten_row_table(30)),
        ],
        rebase_expected_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 40
        ),
        expected_terminal_compact_partition_result=_forty_unique_rows_table(
            ["foo"] * 40
        ),
        expected_terminal_exception=ValidationError,
        expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
        do_create_placement_group=False,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
        read_kwargs_provider=None,
        drop_duplicates=True,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=2,
    ),
    # 4 identical input deltas, 2 rounds requested, exercising the placement
    # group parameter (do_create_placement_group=True).
    # With drop_duplicates=False the expected table keeps all 40 records.
    "6-multiple-rounds-test-pgm": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=[
            SortKey.of(key_name="sk_col_1"),
            SortKey.of(key_name="sk_col_2"),
        ],
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[_upsert(_ten_row_table(0)) for _ in range(4)],
        rebase_expected_compact_partition_result=_four_identical_batches_table(),
        expected_terminal_compact_partition_result=_four_identical_batches_table(),
        expected_terminal_exception=None,
        expected_terminal_exception_message=None,
        do_create_placement_group=True,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
        read_kwargs_provider=None,
        drop_duplicates=False,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=2,
    ),
    # 4 input deltas (3 upserts followed by 1 delete), 2 rounds requested.
    # 12 upserted records - 2 deleted records = 10 expected records.
    # (drop_duplicates=False)
    "7-multiple-rounds-delete-deltas": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=ZERO_VALUED_SORT_KEY,
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[
            _upsert(_keyed_table([10, 11, 12, 13], ["a", "b", "c", "d"])),
            _upsert(_keyed_table([14, 15, 16, 17], ["e", "f", "g", "h"])),
            _upsert(_keyed_table([18, 19, 20, 21], ["i", "j", "k", "l"])),
            _delete(_keyed_table([10, 11], ["a", "b"])),
        ],
        rebase_expected_compact_partition_result=_keyed_table(
            list(range(12, 22)),
            ["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"],
        ),
        expected_terminal_compact_partition_result=_keyed_table(
            list(range(12, 22)),
            ["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"],
        ),
        expected_terminal_exception=None,
        expected_terminal_exception_message=None,
        do_create_placement_group=False,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
        read_kwargs_provider=None,
        drop_duplicates=False,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=2,
    ),
    # 6 input deltas (4 upserts, 2 deletes), 3 rounds requested.
    # Exercises multiple delete deltas sitting between upserts with an odd
    # number of rounds: 16 upserted records - 3 deleted records = 13 expected.
    # (drop_duplicates=False)
    "8-multiple-rounds-multiple-delete-deltas": MultipleRoundsTestCaseParams(
        primary_keys={"pk_col_1"},
        sort_keys=ZERO_VALUED_SORT_KEY,
        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
        partition_values=["1"],
        input_deltas=[
            _upsert(
                _keyed_table(
                    [1, 2, 3, 4],
                    ["iron man", "captain america", "black widow", "hulk"],
                )
            ),
            _upsert(
                _keyed_table(
                    [5, 6, 7, 8],
                    ["hawkeye", "thor", "star lord", "gamora"],
                )
            ),
            _delete(_keyed_table([1, 3], ["iron man", "black widow"])),
            _delete(_keyed_table([8], ["gamora"])),
            _upsert(
                _keyed_table(
                    [9, 10, 11, 12],
                    ["war machine", "scarlet witch", "vision", "falcon"],
                )
            ),
            _upsert(
                _keyed_table(
                    [13, 14, 15, 16],
                    ["ant man", "wasp", "rocket raccoon", "groot"],
                )
            ),
        ],
        rebase_expected_compact_partition_result=_remaining_heroes_table(),
        expected_terminal_compact_partition_result=_remaining_heroes_table(),
        expected_terminal_exception=None,
        expected_terminal_exception_message=None,
        do_create_placement_group=False,
        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
        read_kwargs_provider=None,
        drop_duplicates=False,
        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
        assert_compaction_audit=None,
        num_rounds=3,
    ),
}

# Rebind the name to the result of with_compactor_version_func_test_param
# applied to the raw case dictionary.
MULTIPLE_ROUNDS_TEST_CASES = with_compactor_version_func_test_param(
    MULTIPLE_ROUNDS_TEST_CASES
)
|