deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
@@ -84,6 +84,314 @@ REBASE_TEST_CASES = {
84
84
  skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
85
85
  assert_compaction_audit=None,
86
86
  ),
87
+ "2-rebase-with-null-pk": RebaseCompactionTestCaseParams(
88
+ primary_keys={"pk_col_1"},
89
+ sort_keys=[
90
+ SortKey.of(key_name="sk_col_1"),
91
+ SortKey.of(key_name="sk_col_2"),
92
+ ],
93
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
94
+ partition_values=["1"],
95
+ input_deltas=pa.Table.from_arrays(
96
+ [
97
+ pa.array([1, 2, None, 2, None, 1]),
98
+ pa.array([1, 2, 3, 4, 5, 6]),
99
+ pa.array(["foo"] * 6),
100
+ pa.array([5, 6, 7, 8, 9, 10]),
101
+ ],
102
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
103
+ ),
104
+ input_deltas_delta_type=DeltaType.UPSERT,
105
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
106
+ [
107
+ pa.array([None, 1, 2]),
108
+ pa.array([5, 6, 4]),
109
+ pa.array(["foo"] * 3),
110
+ pa.array([9, 10, 8]),
111
+ ],
112
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
113
+ ),
114
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
115
+ [
116
+ pa.array([None, 1, 2]),
117
+ pa.array([5, 6, 4]),
118
+ pa.array(["foo"] * 3),
119
+ pa.array([7, 10, 8]),
120
+ ],
121
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
122
+ ),
123
+ expected_terminal_exception=None,
124
+ expected_terminal_exception_message=None,
125
+ do_create_placement_group=False,
126
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
127
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
128
+ read_kwargs_provider=None,
129
+ drop_duplicates=True,
130
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
131
+ assert_compaction_audit=None,
132
+ ),
133
+ "3-rebase-with-null-two-pk": RebaseCompactionTestCaseParams(
134
+ primary_keys={"pk_col_1", "pk_col_2"},
135
+ sort_keys=[
136
+ SortKey.of(key_name="sk_col_1"),
137
+ ],
138
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
139
+ partition_values=["1"],
140
+ input_deltas=pa.Table.from_arrays(
141
+ [
142
+ pa.array([1, 2, None, 2, None, 1, 5]),
143
+ pa.array([1, None, 3, None, None, 1, 5]),
144
+ pa.array(["foo"] * 7),
145
+ pa.array([5, 6, 7, 8, 9, 10, 11]),
146
+ ],
147
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
148
+ ),
149
+ input_deltas_delta_type=DeltaType.UPSERT,
150
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
151
+ [
152
+ pa.array([1, 2, None, 5, None]),
153
+ pa.array([1, None, 3, 5, None]),
154
+ pa.array(["foo"] * 5),
155
+ pa.array([10, 8, 7, 11, 9]),
156
+ ],
157
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
158
+ ),
159
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
160
+ [
161
+ pa.array([1, 2, None, 5, None]),
162
+ pa.array([1, None, 3, 5, None]),
163
+ pa.array(["foo"] * 5),
164
+ pa.array([10, 8, 7, 11, 9]),
165
+ ],
166
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
167
+ ),
168
+ expected_terminal_exception=None,
169
+ expected_terminal_exception_message=None,
170
+ do_create_placement_group=False,
171
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
172
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
173
+ read_kwargs_provider=None,
174
+ drop_duplicates=True,
175
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
176
+ assert_compaction_audit=None,
177
+ ),
178
+ "4-rebase-with-null-multiple-pk-different-types": RebaseCompactionTestCaseParams(
179
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
180
+ sort_keys=[],
181
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
182
+ partition_values=["1"],
183
+ input_deltas=pa.Table.from_arrays(
184
+ [
185
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
186
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
187
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
188
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
189
+ ],
190
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
191
+ ),
192
+ input_deltas_delta_type=DeltaType.UPSERT,
193
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
194
+ [
195
+ pa.array([1, 2, None, 5, None, None]),
196
+ pa.array([1, None, 3, 5, None, None]),
197
+ pa.array(["a", "b", "c", "g", "e", None]),
198
+ pa.array([10, 8, 7, 11, 12, 14]),
199
+ ],
200
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
201
+ ),
202
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
203
+ [
204
+ pa.array([1, 2, None, 5, None, None]),
205
+ pa.array([1, None, 3, 5, None, None]),
206
+ pa.array(["a", "b", "c", "g", "e", None]),
207
+ pa.array([10, 8, 7, 11, 12, 14]),
208
+ ],
209
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
210
+ ),
211
+ expected_terminal_exception=None,
212
+ expected_terminal_exception_message=None,
213
+ do_create_placement_group=False,
214
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
215
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
216
+ read_kwargs_provider=None,
217
+ drop_duplicates=True,
218
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
219
+ assert_compaction_audit=None,
220
+ ),
221
+ "5-rebase-with-null-multiple-pk-one-hash-bucket": RebaseCompactionTestCaseParams(
222
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
223
+ sort_keys=[],
224
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
225
+ partition_values=["1"],
226
+ input_deltas=pa.Table.from_arrays(
227
+ [
228
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
229
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
230
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
231
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
232
+ ],
233
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
234
+ ),
235
+ input_deltas_delta_type=DeltaType.UPSERT,
236
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
237
+ [
238
+ pa.array([1, 2, None, 5, None, None]),
239
+ pa.array([1, None, 3, 5, None, None]),
240
+ pa.array(["a", "b", "c", "g", "e", None]),
241
+ pa.array([10, 8, 7, 11, 12, 14]),
242
+ ],
243
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
244
+ ),
245
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
246
+ [
247
+ pa.array([1, 2, None, 5, None, None]),
248
+ pa.array([1, None, 3, 5, None, None]),
249
+ pa.array(["a", "b", "c", "g", "e", None]),
250
+ pa.array([10, 8, 7, 11, 12, 14]),
251
+ ],
252
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
253
+ ),
254
+ expected_terminal_exception=None,
255
+ expected_terminal_exception_message=None,
256
+ do_create_placement_group=False,
257
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
258
+ hash_bucket_count=1,
259
+ read_kwargs_provider=None,
260
+ drop_duplicates=True,
261
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
262
+ assert_compaction_audit=None,
263
+ ),
264
+ "6-rebase-with-null-multiple-pk-drop-duplicates-false": RebaseCompactionTestCaseParams(
265
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
266
+ sort_keys=[],
267
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
268
+ partition_values=["1"],
269
+ input_deltas=pa.Table.from_arrays(
270
+ [
271
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
272
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
273
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
274
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
275
+ ],
276
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
277
+ ),
278
+ input_deltas_delta_type=DeltaType.UPSERT,
279
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
280
+ [
281
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
282
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
283
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
284
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
285
+ ],
286
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
287
+ ),
288
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
289
+ [
290
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
291
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
292
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
293
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
294
+ ],
295
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
296
+ ),
297
+ expected_terminal_exception=None,
298
+ expected_terminal_exception_message=None,
299
+ do_create_placement_group=False,
300
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
301
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
302
+ read_kwargs_provider=None,
303
+ drop_duplicates=False,
304
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
305
+ assert_compaction_audit=None,
306
+ ),
307
+ "7-rebase-drop-duplicates-false": RebaseCompactionTestCaseParams(
308
+ primary_keys={"pk_col_1"},
309
+ sort_keys=[
310
+ SortKey.of(key_name="sk_col_1"),
311
+ ],
312
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
313
+ partition_values=["1"],
314
+ input_deltas=pa.Table.from_arrays(
315
+ [
316
+ pa.array([1, 2, 2, 3, 3, 1]),
317
+ pa.array([1, 2, 3, 4, 5, 6]),
318
+ pa.array(["a", "b", "c", "b", "e", "a"]),
319
+ pa.array([5, 6, 7, 8, 9, 10]),
320
+ ],
321
+ names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
322
+ ),
323
+ input_deltas_delta_type=DeltaType.UPSERT,
324
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
325
+ [
326
+ pa.array([1, 2, 2, 3, 3, 1]),
327
+ pa.array([1, 2, 3, 4, 5, 6]),
328
+ pa.array(["a", "b", "c", "b", "e", "a"]),
329
+ pa.array([5, 6, 7, 8, 9, 10]),
330
+ ],
331
+ names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
332
+ ),
333
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
334
+ [
335
+ pa.array([1, 2, 2, 3, 3, 1]),
336
+ pa.array([1, 2, 3, 4, 5, 6]),
337
+ pa.array(["a", "b", "c", "b", "e", "a"]),
338
+ pa.array([5, 6, 7, 8, 9, 10]),
339
+ ],
340
+ names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
341
+ ),
342
+ expected_terminal_exception=None,
343
+ expected_terminal_exception_message=None,
344
+ do_create_placement_group=False,
345
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
346
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
347
+ read_kwargs_provider=None,
348
+ drop_duplicates=False,
349
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
350
+ assert_compaction_audit=None,
351
+ ),
352
+ "8-rebase-with-with-null-pk-duplicates-false-hash-bucket-1": RebaseCompactionTestCaseParams(
353
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
354
+ sort_keys=[],
355
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
356
+ partition_values=["1"],
357
+ input_deltas=pa.Table.from_arrays(
358
+ [
359
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
360
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
361
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
362
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
363
+ ],
364
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
365
+ ),
366
+ input_deltas_delta_type=DeltaType.UPSERT,
367
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
368
+ [
369
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
370
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
371
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
372
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
373
+ ],
374
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
375
+ ),
376
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
377
+ [
378
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
379
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
380
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
381
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
382
+ ],
383
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
384
+ ),
385
+ expected_terminal_exception=None,
386
+ expected_terminal_exception_message=None,
387
+ do_create_placement_group=False,
388
+ records_per_compacted_file=1,
389
+ hash_bucket_count=1,
390
+ read_kwargs_provider=None,
391
+ drop_duplicates=False,
392
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
393
+ assert_compaction_audit=None,
394
+ ),
87
395
  }
88
396
 
89
397
  REBASE_TEST_CASES = with_compactor_version_func_test_param(REBASE_TEST_CASES)
@@ -798,6 +798,67 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
798
798
  skip_enabled_compact_partition_drivers=None,
799
799
  assert_compaction_audit=None,
800
800
  ),
801
+ "14-rebase-then-incremental-with-null-pk": RebaseThenIncrementalCompactionTestCaseParams(
802
+ primary_keys={"pk_col_1"},
803
+ sort_keys=[
804
+ SortKey.of(key_name="sk_col_1"),
805
+ SortKey.of(key_name="sk_col_2"),
806
+ ],
807
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
808
+ partition_values=["1"],
809
+ input_deltas=pa.Table.from_arrays(
810
+ [
811
+ pa.array([str(i) for i in range(9)] + [None]),
812
+ pa.array([i for i in range(0, 10)]),
813
+ pa.array(["foo"] * 10),
814
+ pa.array([i / 10 for i in range(10, 20)]),
815
+ ],
816
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
817
+ ),
818
+ input_deltas_delta_type=DeltaType.UPSERT,
819
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
820
+ [
821
+ pa.array([str(i) for i in range(9)] + [None]),
822
+ pa.array([i for i in range(0, 10)]),
823
+ pa.array(["foo"] * 10),
824
+ pa.array([i / 10 for i in range(10, 20)]),
825
+ ],
826
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
827
+ ),
828
+ incremental_deltas=[
829
+ (
830
+ pa.Table.from_arrays(
831
+ [
832
+ pa.array([str(i) for i in range(9)] + [None]),
833
+ pa.array([i for i in range(20, 30)]),
834
+ pa.array(["foo"] * 10),
835
+ pa.array([i / 10 for i in range(40, 50)]),
836
+ ],
837
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
838
+ ),
839
+ DeltaType.UPSERT,
840
+ None,
841
+ )
842
+ ],
843
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
844
+ [
845
+ pa.array([str(i) for i in range(9)] + [None]),
846
+ pa.array([i for i in range(20, 30)]),
847
+ pa.array(["foo"] * 10),
848
+ pa.array([i / 10 for i in range(40, 50)]),
849
+ ],
850
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
851
+ ),
852
+ expected_terminal_exception=None,
853
+ expected_terminal_exception_message=None,
854
+ do_create_placement_group=False,
855
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
856
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
857
+ read_kwargs_provider=None,
858
+ drop_duplicates=True,
859
+ skip_enabled_compact_partition_drivers=None,
860
+ assert_compaction_audit=assert_compaction_audit,
861
+ ),
801
862
  }
802
863
 
803
864
  REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
@@ -1983,6 +2044,104 @@ REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
1983
2044
  skip_enabled_compact_partition_drivers=None,
1984
2045
  assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
1985
2046
  ),
2047
+ "31-rebase-then-incremental-delete-delta-on-incremental-null-pk-delete-null": RebaseThenIncrementalCompactionTestCaseParams(
2048
+ primary_keys={"pk_col_1"},
2049
+ sort_keys=ZERO_VALUED_SORT_KEY,
2050
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
2051
+ partition_values=["1"],
2052
+ input_deltas=pa.Table.from_arrays(
2053
+ [
2054
+ pa.array([i for i in range(11)] + [None]),
2055
+ pa.array([str(i) for i in range(0, 12)]),
2056
+ ],
2057
+ names=["pk_col_1", "col_1"],
2058
+ ),
2059
+ input_deltas_delta_type=DeltaType.UPSERT,
2060
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
2061
+ [
2062
+ pa.array([i for i in range(11)] + [None]),
2063
+ pa.array([str(i) for i in range(0, 12)]),
2064
+ ],
2065
+ names=["pk_col_1", "col_1"],
2066
+ ),
2067
+ incremental_deltas=[
2068
+ (
2069
+ pa.Table.from_arrays(
2070
+ [
2071
+ pa.array([10, 11, None, 13]),
2072
+ pa.array(["a", "b", "c", "d"]),
2073
+ ],
2074
+ names=["pk_col_1", "col_1"],
2075
+ ),
2076
+ DeltaType.UPSERT,
2077
+ None,
2078
+ ),
2079
+ (
2080
+ pa.Table.from_arrays(
2081
+ [pa.array([10, 11]), pa.array(["a", "b"])],
2082
+ names=["pk_col_1", "col_1"],
2083
+ ),
2084
+ DeltaType.DELETE,
2085
+ DeleteParameters.of(["pk_col_1", "col_1"]),
2086
+ ),
2087
+ (
2088
+ pa.Table.from_arrays(
2089
+ [pa.array([None])], # Support deleting null PK records
2090
+ names=["pk_col_1"],
2091
+ ),
2092
+ DeltaType.DELETE,
2093
+ DeleteParameters.of(["pk_col_1"]),
2094
+ ),
2095
+ (
2096
+ pa.Table.from_arrays(
2097
+ [pa.array(["c"])],
2098
+ names=["col_1"],
2099
+ ),
2100
+ DeltaType.DELETE,
2101
+ DeleteParameters.of(["col_1"]),
2102
+ ),
2103
+ (
2104
+ pa.Table.from_arrays(
2105
+ [pa.array(["c"])],
2106
+ names=["col_1"],
2107
+ ),
2108
+ DeltaType.DELETE,
2109
+ DeleteParameters.of(["col_1"]),
2110
+ ),
2111
+ (
2112
+ pa.Table.from_arrays(
2113
+ [pa.array([10, 11]), pa.array(["a", "b"])],
2114
+ names=["pk_col_1", "col_1"],
2115
+ ),
2116
+ DeltaType.DELETE,
2117
+ DeleteParameters.of(["pk_col_1", "col_1"]),
2118
+ ),
2119
+ (
2120
+ pa.Table.from_arrays(
2121
+ [pa.array(["c"])],
2122
+ names=["col_1"],
2123
+ ),
2124
+ DeltaType.DELETE,
2125
+ DeleteParameters.of(["col_1"]),
2126
+ ),
2127
+ ],
2128
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
2129
+ [
2130
+ pa.array([i for i in range(10)] + [13]),
2131
+ pa.array([str(i) for i in range(0, 10)] + ["d"]),
2132
+ ],
2133
+ names=["pk_col_1", "col_1"],
2134
+ ),
2135
+ expected_terminal_exception=None,
2136
+ expected_terminal_exception_message=None,
2137
+ do_create_placement_group=False,
2138
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
2139
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
2140
+ read_kwargs_provider=None,
2141
+ drop_duplicates=True,
2142
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
2143
+ assert_compaction_audit=assert_compaction_audit,
2144
+ ),
1986
2145
  }
1987
2146
 
1988
2147
  REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
@@ -19,6 +19,7 @@ from deltacat.tests.test_utils.utils import read_s3_contents
19
19
  from deltacat.tests.compute.test_util_constant import (
20
20
  TEST_S3_RCF_BUCKET_NAME,
21
21
  )
22
+ from deltacat.compute.resource_estimation import ResourceEstimationMethod
22
23
  from deltacat.tests.compute.test_util_common import get_rcf
23
24
  from deltacat.tests.test_utils.pyarrow import (
24
25
  stage_partition_from_file_paths,
@@ -399,3 +400,159 @@ class TestCompactionSession:
399
400
  assert compaction_audit.output_file_count == 2
400
401
  assert abs(compaction_audit.output_size_bytes - 1843) / 1843 <= self.ERROR_RATE
401
402
  assert abs(compaction_audit.input_size_bytes - 2748) / 2748 <= self.ERROR_RATE
403
+
404
+ def test_compact_partition_when_incremental_then_intelligent_estimation_sanity(
405
+ self, s3_resource, local_deltacat_storage_kwargs
406
+ ):
407
+ """
408
+ A test case which asserts the RCF stats are correctly generated for
409
+ a rebase and incremental use-case.
410
+ """
411
+
412
+ # setup
413
+ staged_source = stage_partition_from_file_paths(
414
+ self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
415
+ )
416
+
417
+ source_delta = commit_delta_to_staged_partition(
418
+ staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
419
+ )
420
+
421
+ staged_dest = stage_partition_from_file_paths(
422
+ self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
423
+ )
424
+ dest_partition = ds.commit_partition(
425
+ staged_dest, **local_deltacat_storage_kwargs
426
+ )
427
+
428
+ # action
429
+ compact_partition(
430
+ CompactPartitionParams.of(
431
+ {
432
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
433
+ "compacted_file_content_type": ContentType.PARQUET,
434
+ "dd_max_parallelism_ratio": 1.0,
435
+ "deltacat_storage": ds,
436
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
437
+ "destination_partition_locator": dest_partition.locator,
438
+ "drop_duplicates": True,
439
+ "hash_bucket_count": 2,
440
+ "last_stream_position_to_compact": source_delta.stream_position,
441
+ "list_deltas_kwargs": {
442
+ **local_deltacat_storage_kwargs,
443
+ **{"equivalent_table_types": []},
444
+ },
445
+ "primary_keys": ["pk"],
446
+ "rebase_source_partition_locator": source_delta.partition_locator,
447
+ "rebase_source_partition_high_watermark": source_delta.stream_position,
448
+ "records_per_compacted_file": 4000,
449
+ "s3_client_kwargs": {},
450
+ "source_partition_locator": source_delta.partition_locator,
451
+ "resource_estimation_method": ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
452
+ }
453
+ )
454
+ )
455
+
456
+ def test_compact_partition_when_incremental_then_content_type_meta_estimation_sanity(
457
+ self, s3_resource, local_deltacat_storage_kwargs
458
+ ):
459
+ """
460
+ A test case which asserts the RCF stats are correctly generated for
461
+ a rebase and incremental use-case.
462
+ """
463
+
464
+ # setup
465
+ staged_source = stage_partition_from_file_paths(
466
+ self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
467
+ )
468
+
469
+ source_delta = commit_delta_to_staged_partition(
470
+ staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
471
+ )
472
+
473
+ staged_dest = stage_partition_from_file_paths(
474
+ self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
475
+ )
476
+ dest_partition = ds.commit_partition(
477
+ staged_dest, **local_deltacat_storage_kwargs
478
+ )
479
+
480
+ # action
481
+ compact_partition(
482
+ CompactPartitionParams.of(
483
+ {
484
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
485
+ "compacted_file_content_type": ContentType.PARQUET,
486
+ "dd_max_parallelism_ratio": 1.0,
487
+ "deltacat_storage": ds,
488
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
489
+ "destination_partition_locator": dest_partition.locator,
490
+ "drop_duplicates": True,
491
+ "hash_bucket_count": 2,
492
+ "last_stream_position_to_compact": source_delta.stream_position,
493
+ "list_deltas_kwargs": {
494
+ **local_deltacat_storage_kwargs,
495
+ **{"equivalent_table_types": []},
496
+ },
497
+ "primary_keys": ["pk"],
498
+ "rebase_source_partition_locator": source_delta.partition_locator,
499
+ "rebase_source_partition_high_watermark": source_delta.stream_position,
500
+ "records_per_compacted_file": 4000,
501
+ "s3_client_kwargs": {},
502
+ "source_partition_locator": source_delta.partition_locator,
503
+ "resource_estimation_method": ResourceEstimationMethod.CONTENT_TYPE_META,
504
+ }
505
+ )
506
+ )
507
+
508
+ def test_compact_partition_when_incremental_then_previous_inflation_estimation_sanity(
509
+ self, s3_resource, local_deltacat_storage_kwargs
510
+ ):
511
+ """
512
+ A test case which asserts the RCF stats are correctly generated for
513
+ a rebase and incremental use-case.
514
+ """
515
+
516
+ # setup
517
+ staged_source = stage_partition_from_file_paths(
518
+ self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
519
+ )
520
+
521
+ source_delta = commit_delta_to_staged_partition(
522
+ staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
523
+ )
524
+
525
+ staged_dest = stage_partition_from_file_paths(
526
+ self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
527
+ )
528
+ dest_partition = ds.commit_partition(
529
+ staged_dest, **local_deltacat_storage_kwargs
530
+ )
531
+
532
+ # action
533
+ compact_partition(
534
+ CompactPartitionParams.of(
535
+ {
536
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
537
+ "compacted_file_content_type": ContentType.PARQUET,
538
+ "dd_max_parallelism_ratio": 1.0,
539
+ "deltacat_storage": ds,
540
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
541
+ "destination_partition_locator": dest_partition.locator,
542
+ "drop_duplicates": True,
543
+ "hash_bucket_count": 2,
544
+ "last_stream_position_to_compact": source_delta.stream_position,
545
+ "list_deltas_kwargs": {
546
+ **local_deltacat_storage_kwargs,
547
+ **{"equivalent_table_types": []},
548
+ },
549
+ "primary_keys": ["pk"],
550
+ "rebase_source_partition_locator": source_delta.partition_locator,
551
+ "rebase_source_partition_high_watermark": source_delta.stream_position,
552
+ "records_per_compacted_file": 4000,
553
+ "s3_client_kwargs": {},
554
+ "source_partition_locator": source_delta.partition_locator,
555
+ "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
556
+ }
557
+ )
558
+ )
@@ -1,6 +1,6 @@
1
1
  import unittest
2
2
  import ray
3
- from deltacat.compute.compactor_v2.utils.task_options import get_task_options
3
+ from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
4
4
 
5
5
 
6
6
  @ray.remote
@@ -20,14 +20,14 @@ class TestTaskOptions(unittest.TestCase):
20
20
  super().setUpClass()
21
21
 
22
22
  def test_get_task_options_sanity(self):
23
- opts = get_task_options(0.01, 0.01)
23
+ opts = _get_task_options(0.01, 0.01)
24
24
  result_ref = valid_func.options(**opts).remote()
25
25
  result = ray.get(result_ref)
26
26
 
27
27
  self.assertEqual(result, 2)
28
28
 
29
29
  def test_get_task_options_when_exception_is_thrown(self):
30
- opts = get_task_options(0.01, 0.01)
30
+ opts = _get_task_options(0.01, 0.01)
31
31
  result_ref = throwing_func.options(**opts).remote()
32
32
 
33
33
  self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))