deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
@@ -84,6 +84,314 @@ REBASE_TEST_CASES = {
|
|
84
84
|
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
85
85
|
assert_compaction_audit=None,
|
86
86
|
),
|
87
|
+
"2-rebase-with-null-pk": RebaseCompactionTestCaseParams(
|
88
|
+
primary_keys={"pk_col_1"},
|
89
|
+
sort_keys=[
|
90
|
+
SortKey.of(key_name="sk_col_1"),
|
91
|
+
SortKey.of(key_name="sk_col_2"),
|
92
|
+
],
|
93
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
94
|
+
partition_values=["1"],
|
95
|
+
input_deltas=pa.Table.from_arrays(
|
96
|
+
[
|
97
|
+
pa.array([1, 2, None, 2, None, 1]),
|
98
|
+
pa.array([1, 2, 3, 4, 5, 6]),
|
99
|
+
pa.array(["foo"] * 6),
|
100
|
+
pa.array([5, 6, 7, 8, 9, 10]),
|
101
|
+
],
|
102
|
+
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
|
103
|
+
),
|
104
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
105
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
106
|
+
[
|
107
|
+
pa.array([None, 1, 2]),
|
108
|
+
pa.array([5, 6, 4]),
|
109
|
+
pa.array(["foo"] * 3),
|
110
|
+
pa.array([9, 10, 8]),
|
111
|
+
],
|
112
|
+
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
|
113
|
+
),
|
114
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
115
|
+
[
|
116
|
+
pa.array([None, 1, 2]),
|
117
|
+
pa.array([5, 6, 4]),
|
118
|
+
pa.array(["foo"] * 3),
|
119
|
+
pa.array([7, 10, 8]),
|
120
|
+
],
|
121
|
+
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
|
122
|
+
),
|
123
|
+
expected_terminal_exception=None,
|
124
|
+
expected_terminal_exception_message=None,
|
125
|
+
do_create_placement_group=False,
|
126
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
127
|
+
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
|
128
|
+
read_kwargs_provider=None,
|
129
|
+
drop_duplicates=True,
|
130
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
131
|
+
assert_compaction_audit=None,
|
132
|
+
),
|
133
|
+
"3-rebase-with-null-two-pk": RebaseCompactionTestCaseParams(
|
134
|
+
primary_keys={"pk_col_1", "pk_col_2"},
|
135
|
+
sort_keys=[
|
136
|
+
SortKey.of(key_name="sk_col_1"),
|
137
|
+
],
|
138
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
139
|
+
partition_values=["1"],
|
140
|
+
input_deltas=pa.Table.from_arrays(
|
141
|
+
[
|
142
|
+
pa.array([1, 2, None, 2, None, 1, 5]),
|
143
|
+
pa.array([1, None, 3, None, None, 1, 5]),
|
144
|
+
pa.array(["foo"] * 7),
|
145
|
+
pa.array([5, 6, 7, 8, 9, 10, 11]),
|
146
|
+
],
|
147
|
+
names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
|
148
|
+
),
|
149
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
150
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
151
|
+
[
|
152
|
+
pa.array([1, 2, None, 5, None]),
|
153
|
+
pa.array([1, None, 3, 5, None]),
|
154
|
+
pa.array(["foo"] * 5),
|
155
|
+
pa.array([10, 8, 7, 11, 9]),
|
156
|
+
],
|
157
|
+
names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
|
158
|
+
),
|
159
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
160
|
+
[
|
161
|
+
pa.array([1, 2, None, 5, None]),
|
162
|
+
pa.array([1, None, 3, 5, None]),
|
163
|
+
pa.array(["foo"] * 5),
|
164
|
+
pa.array([10, 8, 7, 11, 9]),
|
165
|
+
],
|
166
|
+
names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
|
167
|
+
),
|
168
|
+
expected_terminal_exception=None,
|
169
|
+
expected_terminal_exception_message=None,
|
170
|
+
do_create_placement_group=False,
|
171
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
172
|
+
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
|
173
|
+
read_kwargs_provider=None,
|
174
|
+
drop_duplicates=True,
|
175
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
176
|
+
assert_compaction_audit=None,
|
177
|
+
),
|
178
|
+
"4-rebase-with-null-multiple-pk-different-types": RebaseCompactionTestCaseParams(
|
179
|
+
primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
|
180
|
+
sort_keys=[],
|
181
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
182
|
+
partition_values=["1"],
|
183
|
+
input_deltas=pa.Table.from_arrays(
|
184
|
+
[
|
185
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
186
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
187
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
188
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
189
|
+
],
|
190
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
191
|
+
),
|
192
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
193
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
194
|
+
[
|
195
|
+
pa.array([1, 2, None, 5, None, None]),
|
196
|
+
pa.array([1, None, 3, 5, None, None]),
|
197
|
+
pa.array(["a", "b", "c", "g", "e", None]),
|
198
|
+
pa.array([10, 8, 7, 11, 12, 14]),
|
199
|
+
],
|
200
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
201
|
+
),
|
202
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
203
|
+
[
|
204
|
+
pa.array([1, 2, None, 5, None, None]),
|
205
|
+
pa.array([1, None, 3, 5, None, None]),
|
206
|
+
pa.array(["a", "b", "c", "g", "e", None]),
|
207
|
+
pa.array([10, 8, 7, 11, 12, 14]),
|
208
|
+
],
|
209
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
210
|
+
),
|
211
|
+
expected_terminal_exception=None,
|
212
|
+
expected_terminal_exception_message=None,
|
213
|
+
do_create_placement_group=False,
|
214
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
215
|
+
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
|
216
|
+
read_kwargs_provider=None,
|
217
|
+
drop_duplicates=True,
|
218
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
219
|
+
assert_compaction_audit=None,
|
220
|
+
),
|
221
|
+
"5-rebase-with-null-multiple-pk-one-hash-bucket": RebaseCompactionTestCaseParams(
|
222
|
+
primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
|
223
|
+
sort_keys=[],
|
224
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
225
|
+
partition_values=["1"],
|
226
|
+
input_deltas=pa.Table.from_arrays(
|
227
|
+
[
|
228
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
229
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
230
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
231
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
232
|
+
],
|
233
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
234
|
+
),
|
235
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
236
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
237
|
+
[
|
238
|
+
pa.array([1, 2, None, 5, None, None]),
|
239
|
+
pa.array([1, None, 3, 5, None, None]),
|
240
|
+
pa.array(["a", "b", "c", "g", "e", None]),
|
241
|
+
pa.array([10, 8, 7, 11, 12, 14]),
|
242
|
+
],
|
243
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
244
|
+
),
|
245
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
246
|
+
[
|
247
|
+
pa.array([1, 2, None, 5, None, None]),
|
248
|
+
pa.array([1, None, 3, 5, None, None]),
|
249
|
+
pa.array(["a", "b", "c", "g", "e", None]),
|
250
|
+
pa.array([10, 8, 7, 11, 12, 14]),
|
251
|
+
],
|
252
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
253
|
+
),
|
254
|
+
expected_terminal_exception=None,
|
255
|
+
expected_terminal_exception_message=None,
|
256
|
+
do_create_placement_group=False,
|
257
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
258
|
+
hash_bucket_count=1,
|
259
|
+
read_kwargs_provider=None,
|
260
|
+
drop_duplicates=True,
|
261
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
262
|
+
assert_compaction_audit=None,
|
263
|
+
),
|
264
|
+
"6-rebase-with-null-multiple-pk-drop-duplicates-false": RebaseCompactionTestCaseParams(
|
265
|
+
primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
|
266
|
+
sort_keys=[],
|
267
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
268
|
+
partition_values=["1"],
|
269
|
+
input_deltas=pa.Table.from_arrays(
|
270
|
+
[
|
271
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
272
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
273
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
274
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
275
|
+
],
|
276
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
277
|
+
),
|
278
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
279
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
280
|
+
[
|
281
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
282
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
283
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
284
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
285
|
+
],
|
286
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
287
|
+
),
|
288
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
289
|
+
[
|
290
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
291
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
292
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
293
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
294
|
+
],
|
295
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
296
|
+
),
|
297
|
+
expected_terminal_exception=None,
|
298
|
+
expected_terminal_exception_message=None,
|
299
|
+
do_create_placement_group=False,
|
300
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
301
|
+
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
|
302
|
+
read_kwargs_provider=None,
|
303
|
+
drop_duplicates=False,
|
304
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
305
|
+
assert_compaction_audit=None,
|
306
|
+
),
|
307
|
+
"7-rebase-drop-duplicates-false": RebaseCompactionTestCaseParams(
|
308
|
+
primary_keys={"pk_col_1"},
|
309
|
+
sort_keys=[
|
310
|
+
SortKey.of(key_name="sk_col_1"),
|
311
|
+
],
|
312
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
313
|
+
partition_values=["1"],
|
314
|
+
input_deltas=pa.Table.from_arrays(
|
315
|
+
[
|
316
|
+
pa.array([1, 2, 2, 3, 3, 1]),
|
317
|
+
pa.array([1, 2, 3, 4, 5, 6]),
|
318
|
+
pa.array(["a", "b", "c", "b", "e", "a"]),
|
319
|
+
pa.array([5, 6, 7, 8, 9, 10]),
|
320
|
+
],
|
321
|
+
names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
|
322
|
+
),
|
323
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
324
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
325
|
+
[
|
326
|
+
pa.array([1, 2, 2, 3, 3, 1]),
|
327
|
+
pa.array([1, 2, 3, 4, 5, 6]),
|
328
|
+
pa.array(["a", "b", "c", "b", "e", "a"]),
|
329
|
+
pa.array([5, 6, 7, 8, 9, 10]),
|
330
|
+
],
|
331
|
+
names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
|
332
|
+
),
|
333
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
334
|
+
[
|
335
|
+
pa.array([1, 2, 2, 3, 3, 1]),
|
336
|
+
pa.array([1, 2, 3, 4, 5, 6]),
|
337
|
+
pa.array(["a", "b", "c", "b", "e", "a"]),
|
338
|
+
pa.array([5, 6, 7, 8, 9, 10]),
|
339
|
+
],
|
340
|
+
names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
|
341
|
+
),
|
342
|
+
expected_terminal_exception=None,
|
343
|
+
expected_terminal_exception_message=None,
|
344
|
+
do_create_placement_group=False,
|
345
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
346
|
+
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
|
347
|
+
read_kwargs_provider=None,
|
348
|
+
drop_duplicates=False,
|
349
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
350
|
+
assert_compaction_audit=None,
|
351
|
+
),
|
352
|
+
"8-rebase-with-with-null-pk-duplicates-false-hash-bucket-1": RebaseCompactionTestCaseParams(
|
353
|
+
primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
|
354
|
+
sort_keys=[],
|
355
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
356
|
+
partition_values=["1"],
|
357
|
+
input_deltas=pa.Table.from_arrays(
|
358
|
+
[
|
359
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
360
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
361
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
362
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
363
|
+
],
|
364
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
365
|
+
),
|
366
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
367
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
368
|
+
[
|
369
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
370
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
371
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
372
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
373
|
+
],
|
374
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
375
|
+
),
|
376
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
377
|
+
[
|
378
|
+
pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
|
379
|
+
pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
|
380
|
+
pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
|
381
|
+
pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
|
382
|
+
],
|
383
|
+
names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
|
384
|
+
),
|
385
|
+
expected_terminal_exception=None,
|
386
|
+
expected_terminal_exception_message=None,
|
387
|
+
do_create_placement_group=False,
|
388
|
+
records_per_compacted_file=1,
|
389
|
+
hash_bucket_count=1,
|
390
|
+
read_kwargs_provider=None,
|
391
|
+
drop_duplicates=False,
|
392
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
393
|
+
assert_compaction_audit=None,
|
394
|
+
),
|
87
395
|
}
|
88
396
|
|
89
397
|
REBASE_TEST_CASES = with_compactor_version_func_test_param(REBASE_TEST_CASES)
|
@@ -798,6 +798,67 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
|
|
798
798
|
skip_enabled_compact_partition_drivers=None,
|
799
799
|
assert_compaction_audit=None,
|
800
800
|
),
|
801
|
+
"14-rebase-then-incremental-with-null-pk": RebaseThenIncrementalCompactionTestCaseParams(
|
802
|
+
primary_keys={"pk_col_1"},
|
803
|
+
sort_keys=[
|
804
|
+
SortKey.of(key_name="sk_col_1"),
|
805
|
+
SortKey.of(key_name="sk_col_2"),
|
806
|
+
],
|
807
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
808
|
+
partition_values=["1"],
|
809
|
+
input_deltas=pa.Table.from_arrays(
|
810
|
+
[
|
811
|
+
pa.array([str(i) for i in range(9)] + [None]),
|
812
|
+
pa.array([i for i in range(0, 10)]),
|
813
|
+
pa.array(["foo"] * 10),
|
814
|
+
pa.array([i / 10 for i in range(10, 20)]),
|
815
|
+
],
|
816
|
+
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
|
817
|
+
),
|
818
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
819
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
820
|
+
[
|
821
|
+
pa.array([str(i) for i in range(9)] + [None]),
|
822
|
+
pa.array([i for i in range(0, 10)]),
|
823
|
+
pa.array(["foo"] * 10),
|
824
|
+
pa.array([i / 10 for i in range(10, 20)]),
|
825
|
+
],
|
826
|
+
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
|
827
|
+
),
|
828
|
+
incremental_deltas=[
|
829
|
+
(
|
830
|
+
pa.Table.from_arrays(
|
831
|
+
[
|
832
|
+
pa.array([str(i) for i in range(9)] + [None]),
|
833
|
+
pa.array([i for i in range(20, 30)]),
|
834
|
+
pa.array(["foo"] * 10),
|
835
|
+
pa.array([i / 10 for i in range(40, 50)]),
|
836
|
+
],
|
837
|
+
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
|
838
|
+
),
|
839
|
+
DeltaType.UPSERT,
|
840
|
+
None,
|
841
|
+
)
|
842
|
+
],
|
843
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
844
|
+
[
|
845
|
+
pa.array([str(i) for i in range(9)] + [None]),
|
846
|
+
pa.array([i for i in range(20, 30)]),
|
847
|
+
pa.array(["foo"] * 10),
|
848
|
+
pa.array([i / 10 for i in range(40, 50)]),
|
849
|
+
],
|
850
|
+
names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
|
851
|
+
),
|
852
|
+
expected_terminal_exception=None,
|
853
|
+
expected_terminal_exception_message=None,
|
854
|
+
do_create_placement_group=False,
|
855
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
856
|
+
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
|
857
|
+
read_kwargs_provider=None,
|
858
|
+
drop_duplicates=True,
|
859
|
+
skip_enabled_compact_partition_drivers=None,
|
860
|
+
assert_compaction_audit=assert_compaction_audit,
|
861
|
+
),
|
801
862
|
}
|
802
863
|
|
803
864
|
REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
|
@@ -1983,6 +2044,104 @@ REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
|
|
1983
2044
|
skip_enabled_compact_partition_drivers=None,
|
1984
2045
|
assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
|
1985
2046
|
),
|
2047
|
+
"31-rebase-then-incremental-delete-delta-on-incremental-null-pk-delete-null": RebaseThenIncrementalCompactionTestCaseParams(
|
2048
|
+
primary_keys={"pk_col_1"},
|
2049
|
+
sort_keys=ZERO_VALUED_SORT_KEY,
|
2050
|
+
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
2051
|
+
partition_values=["1"],
|
2052
|
+
input_deltas=pa.Table.from_arrays(
|
2053
|
+
[
|
2054
|
+
pa.array([i for i in range(11)] + [None]),
|
2055
|
+
pa.array([str(i) for i in range(0, 12)]),
|
2056
|
+
],
|
2057
|
+
names=["pk_col_1", "col_1"],
|
2058
|
+
),
|
2059
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
2060
|
+
rebase_expected_compact_partition_result=pa.Table.from_arrays(
|
2061
|
+
[
|
2062
|
+
pa.array([i for i in range(11)] + [None]),
|
2063
|
+
pa.array([str(i) for i in range(0, 12)]),
|
2064
|
+
],
|
2065
|
+
names=["pk_col_1", "col_1"],
|
2066
|
+
),
|
2067
|
+
incremental_deltas=[
|
2068
|
+
(
|
2069
|
+
pa.Table.from_arrays(
|
2070
|
+
[
|
2071
|
+
pa.array([10, 11, None, 13]),
|
2072
|
+
pa.array(["a", "b", "c", "d"]),
|
2073
|
+
],
|
2074
|
+
names=["pk_col_1", "col_1"],
|
2075
|
+
),
|
2076
|
+
DeltaType.UPSERT,
|
2077
|
+
None,
|
2078
|
+
),
|
2079
|
+
(
|
2080
|
+
pa.Table.from_arrays(
|
2081
|
+
[pa.array([10, 11]), pa.array(["a", "b"])],
|
2082
|
+
names=["pk_col_1", "col_1"],
|
2083
|
+
),
|
2084
|
+
DeltaType.DELETE,
|
2085
|
+
DeleteParameters.of(["pk_col_1", "col_1"]),
|
2086
|
+
),
|
2087
|
+
(
|
2088
|
+
pa.Table.from_arrays(
|
2089
|
+
[pa.array([None])], # Support deleting null PK records
|
2090
|
+
names=["pk_col_1"],
|
2091
|
+
),
|
2092
|
+
DeltaType.DELETE,
|
2093
|
+
DeleteParameters.of(["pk_col_1"]),
|
2094
|
+
),
|
2095
|
+
(
|
2096
|
+
pa.Table.from_arrays(
|
2097
|
+
[pa.array(["c"])],
|
2098
|
+
names=["col_1"],
|
2099
|
+
),
|
2100
|
+
DeltaType.DELETE,
|
2101
|
+
DeleteParameters.of(["col_1"]),
|
2102
|
+
),
|
2103
|
+
(
|
2104
|
+
pa.Table.from_arrays(
|
2105
|
+
[pa.array(["c"])],
|
2106
|
+
names=["col_1"],
|
2107
|
+
),
|
2108
|
+
DeltaType.DELETE,
|
2109
|
+
DeleteParameters.of(["col_1"]),
|
2110
|
+
),
|
2111
|
+
(
|
2112
|
+
pa.Table.from_arrays(
|
2113
|
+
[pa.array([10, 11]), pa.array(["a", "b"])],
|
2114
|
+
names=["pk_col_1", "col_1"],
|
2115
|
+
),
|
2116
|
+
DeltaType.DELETE,
|
2117
|
+
DeleteParameters.of(["pk_col_1", "col_1"]),
|
2118
|
+
),
|
2119
|
+
(
|
2120
|
+
pa.Table.from_arrays(
|
2121
|
+
[pa.array(["c"])],
|
2122
|
+
names=["col_1"],
|
2123
|
+
),
|
2124
|
+
DeltaType.DELETE,
|
2125
|
+
DeleteParameters.of(["col_1"]),
|
2126
|
+
),
|
2127
|
+
],
|
2128
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
2129
|
+
[
|
2130
|
+
pa.array([i for i in range(10)] + [13]),
|
2131
|
+
pa.array([str(i) for i in range(0, 10)] + ["d"]),
|
2132
|
+
],
|
2133
|
+
names=["pk_col_1", "col_1"],
|
2134
|
+
),
|
2135
|
+
expected_terminal_exception=None,
|
2136
|
+
expected_terminal_exception_message=None,
|
2137
|
+
do_create_placement_group=False,
|
2138
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
2139
|
+
hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
|
2140
|
+
read_kwargs_provider=None,
|
2141
|
+
drop_duplicates=True,
|
2142
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
2143
|
+
assert_compaction_audit=assert_compaction_audit,
|
2144
|
+
),
|
1986
2145
|
}
|
1987
2146
|
|
1988
2147
|
REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
|
@@ -19,6 +19,7 @@ from deltacat.tests.test_utils.utils import read_s3_contents
|
|
19
19
|
from deltacat.tests.compute.test_util_constant import (
|
20
20
|
TEST_S3_RCF_BUCKET_NAME,
|
21
21
|
)
|
22
|
+
from deltacat.compute.resource_estimation import ResourceEstimationMethod
|
22
23
|
from deltacat.tests.compute.test_util_common import get_rcf
|
23
24
|
from deltacat.tests.test_utils.pyarrow import (
|
24
25
|
stage_partition_from_file_paths,
|
@@ -399,3 +400,159 @@ class TestCompactionSession:
|
|
399
400
|
assert compaction_audit.output_file_count == 2
|
400
401
|
assert abs(compaction_audit.output_size_bytes - 1843) / 1843 <= self.ERROR_RATE
|
401
402
|
assert abs(compaction_audit.input_size_bytes - 2748) / 2748 <= self.ERROR_RATE
|
403
|
+
|
404
|
+
def test_compact_partition_when_incremental_then_intelligent_estimation_sanity(
|
405
|
+
self, s3_resource, local_deltacat_storage_kwargs
|
406
|
+
):
|
407
|
+
"""
|
408
|
+
A sanity test case which only checks that compact_partition runs without
|
409
|
+
raising when resource_estimation_method is INTELLIGENT_ESTIMATION.
|
410
|
+
"""
|
411
|
+
|
412
|
+
# setup
|
413
|
+
staged_source = stage_partition_from_file_paths(
|
414
|
+
self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
|
415
|
+
)
|
416
|
+
|
417
|
+
source_delta = commit_delta_to_staged_partition(
|
418
|
+
staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
|
419
|
+
)
|
420
|
+
|
421
|
+
staged_dest = stage_partition_from_file_paths(
|
422
|
+
self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
|
423
|
+
)
|
424
|
+
dest_partition = ds.commit_partition(
|
425
|
+
staged_dest, **local_deltacat_storage_kwargs
|
426
|
+
)
|
427
|
+
|
428
|
+
# action
|
429
|
+
compact_partition(
|
430
|
+
CompactPartitionParams.of(
|
431
|
+
{
|
432
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
433
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
434
|
+
"dd_max_parallelism_ratio": 1.0,
|
435
|
+
"deltacat_storage": ds,
|
436
|
+
"deltacat_storage_kwargs": local_deltacat_storage_kwargs,
|
437
|
+
"destination_partition_locator": dest_partition.locator,
|
438
|
+
"drop_duplicates": True,
|
439
|
+
"hash_bucket_count": 2,
|
440
|
+
"last_stream_position_to_compact": source_delta.stream_position,
|
441
|
+
"list_deltas_kwargs": {
|
442
|
+
**local_deltacat_storage_kwargs,
|
443
|
+
**{"equivalent_table_types": []},
|
444
|
+
},
|
445
|
+
"primary_keys": ["pk"],
|
446
|
+
"rebase_source_partition_locator": source_delta.partition_locator,
|
447
|
+
"rebase_source_partition_high_watermark": source_delta.stream_position,
|
448
|
+
"records_per_compacted_file": 4000,
|
449
|
+
"s3_client_kwargs": {},
|
450
|
+
"source_partition_locator": source_delta.partition_locator,
|
451
|
+
"resource_estimation_method": ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
|
452
|
+
}
|
453
|
+
)
|
454
|
+
)
|
455
|
+
|
456
|
+
def test_compact_partition_when_incremental_then_content_type_meta_estimation_sanity(
|
457
|
+
self, s3_resource, local_deltacat_storage_kwargs
|
458
|
+
):
|
459
|
+
"""
|
460
|
+
A sanity test case which only checks that compact_partition runs without
|
461
|
+
raising when resource_estimation_method is CONTENT_TYPE_META.
|
462
|
+
"""
|
463
|
+
|
464
|
+
# setup
|
465
|
+
staged_source = stage_partition_from_file_paths(
|
466
|
+
self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
|
467
|
+
)
|
468
|
+
|
469
|
+
source_delta = commit_delta_to_staged_partition(
|
470
|
+
staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
|
471
|
+
)
|
472
|
+
|
473
|
+
staged_dest = stage_partition_from_file_paths(
|
474
|
+
self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
|
475
|
+
)
|
476
|
+
dest_partition = ds.commit_partition(
|
477
|
+
staged_dest, **local_deltacat_storage_kwargs
|
478
|
+
)
|
479
|
+
|
480
|
+
# action
|
481
|
+
compact_partition(
|
482
|
+
CompactPartitionParams.of(
|
483
|
+
{
|
484
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
485
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
486
|
+
"dd_max_parallelism_ratio": 1.0,
|
487
|
+
"deltacat_storage": ds,
|
488
|
+
"deltacat_storage_kwargs": local_deltacat_storage_kwargs,
|
489
|
+
"destination_partition_locator": dest_partition.locator,
|
490
|
+
"drop_duplicates": True,
|
491
|
+
"hash_bucket_count": 2,
|
492
|
+
"last_stream_position_to_compact": source_delta.stream_position,
|
493
|
+
"list_deltas_kwargs": {
|
494
|
+
**local_deltacat_storage_kwargs,
|
495
|
+
**{"equivalent_table_types": []},
|
496
|
+
},
|
497
|
+
"primary_keys": ["pk"],
|
498
|
+
"rebase_source_partition_locator": source_delta.partition_locator,
|
499
|
+
"rebase_source_partition_high_watermark": source_delta.stream_position,
|
500
|
+
"records_per_compacted_file": 4000,
|
501
|
+
"s3_client_kwargs": {},
|
502
|
+
"source_partition_locator": source_delta.partition_locator,
|
503
|
+
"resource_estimation_method": ResourceEstimationMethod.CONTENT_TYPE_META,
|
504
|
+
}
|
505
|
+
)
|
506
|
+
)
|
507
|
+
|
508
|
+
def test_compact_partition_when_incremental_then_previous_inflation_estimation_sanity(
|
509
|
+
self, s3_resource, local_deltacat_storage_kwargs
|
510
|
+
):
|
511
|
+
"""
|
512
|
+
A sanity test case which only checks that compact_partition runs without
|
513
|
+
raising when resource_estimation_method is PREVIOUS_INFLATION.
|
514
|
+
"""
|
515
|
+
|
516
|
+
# setup
|
517
|
+
staged_source = stage_partition_from_file_paths(
|
518
|
+
self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
|
519
|
+
)
|
520
|
+
|
521
|
+
source_delta = commit_delta_to_staged_partition(
|
522
|
+
staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
|
523
|
+
)
|
524
|
+
|
525
|
+
staged_dest = stage_partition_from_file_paths(
|
526
|
+
self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
|
527
|
+
)
|
528
|
+
dest_partition = ds.commit_partition(
|
529
|
+
staged_dest, **local_deltacat_storage_kwargs
|
530
|
+
)
|
531
|
+
|
532
|
+
# action
|
533
|
+
compact_partition(
|
534
|
+
CompactPartitionParams.of(
|
535
|
+
{
|
536
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
537
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
538
|
+
"dd_max_parallelism_ratio": 1.0,
|
539
|
+
"deltacat_storage": ds,
|
540
|
+
"deltacat_storage_kwargs": local_deltacat_storage_kwargs,
|
541
|
+
"destination_partition_locator": dest_partition.locator,
|
542
|
+
"drop_duplicates": True,
|
543
|
+
"hash_bucket_count": 2,
|
544
|
+
"last_stream_position_to_compact": source_delta.stream_position,
|
545
|
+
"list_deltas_kwargs": {
|
546
|
+
**local_deltacat_storage_kwargs,
|
547
|
+
**{"equivalent_table_types": []},
|
548
|
+
},
|
549
|
+
"primary_keys": ["pk"],
|
550
|
+
"rebase_source_partition_locator": source_delta.partition_locator,
|
551
|
+
"rebase_source_partition_high_watermark": source_delta.stream_position,
|
552
|
+
"records_per_compacted_file": 4000,
|
553
|
+
"s3_client_kwargs": {},
|
554
|
+
"source_partition_locator": source_delta.partition_locator,
|
555
|
+
"resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
|
556
|
+
}
|
557
|
+
)
|
558
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import unittest
|
2
2
|
import ray
|
3
|
-
from deltacat.compute.compactor_v2.utils.task_options import
|
3
|
+
from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
|
4
4
|
|
5
5
|
|
6
6
|
@ray.remote
|
@@ -20,14 +20,14 @@ class TestTaskOptions(unittest.TestCase):
|
|
20
20
|
super().setUpClass()
|
21
21
|
|
22
22
|
def test_get_task_options_sanity(self):
|
23
|
-
opts =
|
23
|
+
opts = _get_task_options(0.01, 0.01)
|
24
24
|
result_ref = valid_func.options(**opts).remote()
|
25
25
|
result = ray.get(result_ref)
|
26
26
|
|
27
27
|
self.assertEqual(result, 2)
|
28
28
|
|
29
29
|
def test_get_task_options_when_exception_is_thrown(self):
|
30
|
-
opts =
|
30
|
+
opts = _get_task_options(0.01, 0.01)
|
31
31
|
result_ref = throwing_func.options(**opts).remote()
|
32
32
|
|
33
33
|
self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
|