deltacat 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.17"
47
+ __version__ = "1.1.18"
48
48
 
49
49
 
50
50
  __all__ = [
deltacat/aws/constants.py CHANGED
@@ -1,7 +1,6 @@
1
1
  import botocore
2
2
  from typing import Set
3
3
  from daft.exceptions import DaftTransientError
4
-
5
4
  from deltacat.utils.common import env_integer, env_string
6
5
 
7
6
 
@@ -27,8 +27,11 @@ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Tab
27
27
 
28
28
  result = []
29
29
  for hash_value in hash_column_np:
30
- assert hash_value is not None, f"Expected non-null primary key"
31
- result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
30
+ if hash_value is None:
31
+ result.append(None)
32
+ logger.info("A primary key hash is null")
33
+ else:
34
+ result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
32
35
 
33
36
  return sc.append_pk_hash_string_column(table, result)
34
37
 
@@ -191,7 +194,7 @@ def generate_pk_hash_column(
191
194
  pk_columns.append(sliced_string_cast(table[pk_name]))
192
195
 
193
196
  pk_columns.append(PK_DELIMITER)
194
- hash_column = pc.binary_join_element_wise(*pk_columns)
197
+ hash_column = pc.binary_join_element_wise(*pk_columns, null_handling="replace")
195
198
  return hash_column
196
199
 
197
200
  def _generate_uuid(table: pa.Table) -> pa.Array:
@@ -345,8 +348,10 @@ def hash_group_index_to_hash_bucket_indices(
345
348
  return range(hb_group, num_buckets, num_groups)
346
349
 
347
350
 
348
- def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
351
+ def pk_digest_to_hash_bucket_index(digest: Optional[str], num_buckets: int) -> int:
349
352
  """
350
353
  Generates the hash bucket index from the given digest.
351
354
  """
355
+ if digest is None:
356
+ return 0
352
357
  return int(digest, 16) % num_buckets
deltacat/exceptions.py CHANGED
@@ -299,7 +299,7 @@ def _categorize_tenacity_error(e: tenacity.RetryError):
299
299
  def _categorize_dependency_pyarrow_error(e: ArrowException):
300
300
  if isinstance(e, ArrowInvalid):
301
301
  raise DependencyPyarrowInvalidError(
302
- f"Pyarrow Invalid error occurred. Reason: {e}"
302
+ f"Pyarrow Invalid error occurred. {e}"
303
303
  ) from e
304
304
  elif isinstance(e, ArrowCapacityError):
305
305
  raise DependencyPyarrowCapacityError("Pyarrow Capacity error occurred.") from e
@@ -308,9 +308,7 @@ def _categorize_dependency_pyarrow_error(e: ArrowException):
308
308
 
309
309
 
310
310
  def _categorize_assertion_error(e: BaseException):
311
- raise ValidationError(
312
- f"One of the assertions in DeltaCAT has failed. Reason: {e}"
313
- ) from e
311
+ raise ValidationError(f"One of the assertions in DeltaCAT has failed. {e}") from e
314
312
 
315
313
 
316
314
  def _categorize_daft_error(e: DaftCoreException):
@@ -848,6 +848,83 @@ MULTIPLE_ROUNDS_TEST_CASES = {
848
848
  assert_compaction_audit=None,
849
849
  num_rounds=3,
850
850
  ),
851
+ # 4 input deltas (3 upsert, 1 delete delta), 2 rounds requested
852
+ # Expect to see a table that aggregates 10 records total
853
+ # (12 upserts - 2 deletes (null PK) = 10 records)
854
+ # (dropDuplicates = False)
855
+ "9-multiple-rounds-delete-deltas-with-null-pk": MultipleRoundsTestCaseParams(
856
+ primary_keys={"pk_col_1"},
857
+ sort_keys=ZERO_VALUED_SORT_KEY,
858
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
859
+ partition_values=["1"],
860
+ input_deltas=[
861
+ (
862
+ pa.Table.from_arrays(
863
+ [
864
+ pa.array([None, 11, 12, 13]),
865
+ pa.array(["a", "b", "c", "d"]),
866
+ ],
867
+ names=["pk_col_1", "col_1"],
868
+ ),
869
+ DeltaType.UPSERT,
870
+ None,
871
+ ),
872
+ (
873
+ pa.Table.from_arrays(
874
+ [
875
+ pa.array([14, 15, 16, 17]),
876
+ pa.array(["e", "f", "g", "h"]),
877
+ ],
878
+ names=["pk_col_1", "col_1"],
879
+ ),
880
+ DeltaType.UPSERT,
881
+ None,
882
+ ),
883
+ (
884
+ pa.Table.from_arrays(
885
+ [
886
+ pa.array([18, 19, 20, 21]),
887
+ pa.array(["i", "j", "k", "l"]),
888
+ ],
889
+ names=["pk_col_1", "col_1"],
890
+ ),
891
+ DeltaType.UPSERT,
892
+ None,
893
+ ),
894
+ (
895
+ pa.Table.from_arrays(
896
+ [pa.array([None, 11]), pa.array(["a", "b"])],
897
+ names=["pk_col_1", "col_1"],
898
+ ),
899
+ DeltaType.DELETE,
900
+ DeleteParameters.of(["pk_col_1", "col_1"]),
901
+ ),
902
+ ],
903
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
904
+ [
905
+ pa.array([i for i in range(12, 22)]),
906
+ pa.array(["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]),
907
+ ],
908
+ names=["pk_col_1", "col_1"],
909
+ ),
910
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
911
+ [
912
+ pa.array([i for i in range(12, 22)]),
913
+ pa.array(["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]),
914
+ ],
915
+ names=["pk_col_1", "col_1"],
916
+ ),
917
+ expected_terminal_exception=None,
918
+ expected_terminal_exception_message=None,
919
+ do_create_placement_group=False,
920
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
921
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
922
+ read_kwargs_provider=None,
923
+ drop_duplicates=False,
924
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
925
+ assert_compaction_audit=None,
926
+ num_rounds=2,
927
+ ),
851
928
  }
852
929
 
853
930
  MULTIPLE_ROUNDS_TEST_CASES = with_compactor_version_func_test_param(
@@ -84,6 +84,314 @@ REBASE_TEST_CASES = {
84
84
  skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
85
85
  assert_compaction_audit=None,
86
86
  ),
87
+ "2-rebase-with-null-pk": RebaseCompactionTestCaseParams(
88
+ primary_keys={"pk_col_1"},
89
+ sort_keys=[
90
+ SortKey.of(key_name="sk_col_1"),
91
+ SortKey.of(key_name="sk_col_2"),
92
+ ],
93
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
94
+ partition_values=["1"],
95
+ input_deltas=pa.Table.from_arrays(
96
+ [
97
+ pa.array([1, 2, None, 2, None, 1]),
98
+ pa.array([1, 2, 3, 4, 5, 6]),
99
+ pa.array(["foo"] * 6),
100
+ pa.array([5, 6, 7, 8, 9, 10]),
101
+ ],
102
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
103
+ ),
104
+ input_deltas_delta_type=DeltaType.UPSERT,
105
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
106
+ [
107
+ pa.array([None, 1, 2]),
108
+ pa.array([5, 6, 4]),
109
+ pa.array(["foo"] * 3),
110
+ pa.array([9, 10, 8]),
111
+ ],
112
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
113
+ ),
114
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
115
+ [
116
+ pa.array([None, 1, 2]),
117
+ pa.array([5, 6, 4]),
118
+ pa.array(["foo"] * 3),
119
+ pa.array([7, 10, 8]),
120
+ ],
121
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
122
+ ),
123
+ expected_terminal_exception=None,
124
+ expected_terminal_exception_message=None,
125
+ do_create_placement_group=False,
126
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
127
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
128
+ read_kwargs_provider=None,
129
+ drop_duplicates=True,
130
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
131
+ assert_compaction_audit=None,
132
+ ),
133
+ "3-rebase-with-null-two-pk": RebaseCompactionTestCaseParams(
134
+ primary_keys={"pk_col_1", "pk_col_2"},
135
+ sort_keys=[
136
+ SortKey.of(key_name="sk_col_1"),
137
+ ],
138
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
139
+ partition_values=["1"],
140
+ input_deltas=pa.Table.from_arrays(
141
+ [
142
+ pa.array([1, 2, None, 2, None, 1, 5]),
143
+ pa.array([1, None, 3, None, None, 1, 5]),
144
+ pa.array(["foo"] * 7),
145
+ pa.array([5, 6, 7, 8, 9, 10, 11]),
146
+ ],
147
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
148
+ ),
149
+ input_deltas_delta_type=DeltaType.UPSERT,
150
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
151
+ [
152
+ pa.array([1, 2, None, 5, None]),
153
+ pa.array([1, None, 3, 5, None]),
154
+ pa.array(["foo"] * 5),
155
+ pa.array([10, 8, 7, 11, 9]),
156
+ ],
157
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
158
+ ),
159
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
160
+ [
161
+ pa.array([1, 2, None, 5, None]),
162
+ pa.array([1, None, 3, 5, None]),
163
+ pa.array(["foo"] * 5),
164
+ pa.array([10, 8, 7, 11, 9]),
165
+ ],
166
+ names=["pk_col_1", "pk_col_2", "sk_col_1", "col_1"],
167
+ ),
168
+ expected_terminal_exception=None,
169
+ expected_terminal_exception_message=None,
170
+ do_create_placement_group=False,
171
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
172
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
173
+ read_kwargs_provider=None,
174
+ drop_duplicates=True,
175
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
176
+ assert_compaction_audit=None,
177
+ ),
178
+ "4-rebase-with-null-multiple-pk-different-types": RebaseCompactionTestCaseParams(
179
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
180
+ sort_keys=[],
181
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
182
+ partition_values=["1"],
183
+ input_deltas=pa.Table.from_arrays(
184
+ [
185
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
186
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
187
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
188
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
189
+ ],
190
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
191
+ ),
192
+ input_deltas_delta_type=DeltaType.UPSERT,
193
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
194
+ [
195
+ pa.array([1, 2, None, 5, None, None]),
196
+ pa.array([1, None, 3, 5, None, None]),
197
+ pa.array(["a", "b", "c", "g", "e", None]),
198
+ pa.array([10, 8, 7, 11, 12, 14]),
199
+ ],
200
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
201
+ ),
202
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
203
+ [
204
+ pa.array([1, 2, None, 5, None, None]),
205
+ pa.array([1, None, 3, 5, None, None]),
206
+ pa.array(["a", "b", "c", "g", "e", None]),
207
+ pa.array([10, 8, 7, 11, 12, 14]),
208
+ ],
209
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
210
+ ),
211
+ expected_terminal_exception=None,
212
+ expected_terminal_exception_message=None,
213
+ do_create_placement_group=False,
214
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
215
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
216
+ read_kwargs_provider=None,
217
+ drop_duplicates=True,
218
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
219
+ assert_compaction_audit=None,
220
+ ),
221
+ "5-rebase-with-null-multiple-pk-one-hash-bucket": RebaseCompactionTestCaseParams(
222
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
223
+ sort_keys=[],
224
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
225
+ partition_values=["1"],
226
+ input_deltas=pa.Table.from_arrays(
227
+ [
228
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
229
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
230
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
231
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
232
+ ],
233
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
234
+ ),
235
+ input_deltas_delta_type=DeltaType.UPSERT,
236
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
237
+ [
238
+ pa.array([1, 2, None, 5, None, None]),
239
+ pa.array([1, None, 3, 5, None, None]),
240
+ pa.array(["a", "b", "c", "g", "e", None]),
241
+ pa.array([10, 8, 7, 11, 12, 14]),
242
+ ],
243
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
244
+ ),
245
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
246
+ [
247
+ pa.array([1, 2, None, 5, None, None]),
248
+ pa.array([1, None, 3, 5, None, None]),
249
+ pa.array(["a", "b", "c", "g", "e", None]),
250
+ pa.array([10, 8, 7, 11, 12, 14]),
251
+ ],
252
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
253
+ ),
254
+ expected_terminal_exception=None,
255
+ expected_terminal_exception_message=None,
256
+ do_create_placement_group=False,
257
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
258
+ hash_bucket_count=1,
259
+ read_kwargs_provider=None,
260
+ drop_duplicates=True,
261
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
262
+ assert_compaction_audit=None,
263
+ ),
264
+ "6-rebase-with-null-multiple-pk-drop-duplicates-false": RebaseCompactionTestCaseParams(
265
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
266
+ sort_keys=[],
267
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
268
+ partition_values=["1"],
269
+ input_deltas=pa.Table.from_arrays(
270
+ [
271
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
272
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
273
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
274
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
275
+ ],
276
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
277
+ ),
278
+ input_deltas_delta_type=DeltaType.UPSERT,
279
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
280
+ [
281
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
282
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
283
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
284
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
285
+ ],
286
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
287
+ ),
288
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
289
+ [
290
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
291
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
292
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
293
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
294
+ ],
295
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
296
+ ),
297
+ expected_terminal_exception=None,
298
+ expected_terminal_exception_message=None,
299
+ do_create_placement_group=False,
300
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
301
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
302
+ read_kwargs_provider=None,
303
+ drop_duplicates=False,
304
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
305
+ assert_compaction_audit=None,
306
+ ),
307
+ "7-rebase-drop-duplicates-false": RebaseCompactionTestCaseParams(
308
+ primary_keys={"pk_col_1"},
309
+ sort_keys=[
310
+ SortKey.of(key_name="sk_col_1"),
311
+ ],
312
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
313
+ partition_values=["1"],
314
+ input_deltas=pa.Table.from_arrays(
315
+ [
316
+ pa.array([1, 2, 2, 3, 3, 1]),
317
+ pa.array([1, 2, 3, 4, 5, 6]),
318
+ pa.array(["a", "b", "c", "b", "e", "a"]),
319
+ pa.array([5, 6, 7, 8, 9, 10]),
320
+ ],
321
+ names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
322
+ ),
323
+ input_deltas_delta_type=DeltaType.UPSERT,
324
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
325
+ [
326
+ pa.array([1, 2, 2, 3, 3, 1]),
327
+ pa.array([1, 2, 3, 4, 5, 6]),
328
+ pa.array(["a", "b", "c", "b", "e", "a"]),
329
+ pa.array([5, 6, 7, 8, 9, 10]),
330
+ ],
331
+ names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
332
+ ),
333
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
334
+ [
335
+ pa.array([1, 2, 2, 3, 3, 1]),
336
+ pa.array([1, 2, 3, 4, 5, 6]),
337
+ pa.array(["a", "b", "c", "b", "e", "a"]),
338
+ pa.array([5, 6, 7, 8, 9, 10]),
339
+ ],
340
+ names=["pk_col_1", "sk_col_1", "col_1", "col_2"],
341
+ ),
342
+ expected_terminal_exception=None,
343
+ expected_terminal_exception_message=None,
344
+ do_create_placement_group=False,
345
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
346
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
347
+ read_kwargs_provider=None,
348
+ drop_duplicates=False,
349
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
350
+ assert_compaction_audit=None,
351
+ ),
352
+ "8-rebase-with-with-null-pk-duplicates-false-hash-bucket-1": RebaseCompactionTestCaseParams(
353
+ primary_keys={"pk_col_1", "pk_col_2", "pk_col_3"},
354
+ sort_keys=[],
355
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
356
+ partition_values=["1"],
357
+ input_deltas=pa.Table.from_arrays(
358
+ [
359
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
360
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
361
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
362
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
363
+ ],
364
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
365
+ ),
366
+ input_deltas_delta_type=DeltaType.UPSERT,
367
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
368
+ [
369
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
370
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
371
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
372
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
373
+ ],
374
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
375
+ ),
376
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
377
+ [
378
+ pa.array([1, 2, None, 2, None, 1, 5, None, None, None]),
379
+ pa.array([1, None, 3, None, None, 1, 5, None, None, None]),
380
+ pa.array(["a", "b", "c", "b", "e", "a", "g", "e", None, None]),
381
+ pa.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
382
+ ],
383
+ names=["pk_col_1", "pk_col_2", "pk_col_3", "col_1"],
384
+ ),
385
+ expected_terminal_exception=None,
386
+ expected_terminal_exception_message=None,
387
+ do_create_placement_group=False,
388
+ records_per_compacted_file=1,
389
+ hash_bucket_count=1,
390
+ read_kwargs_provider=None,
391
+ drop_duplicates=False,
392
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
393
+ assert_compaction_audit=None,
394
+ ),
87
395
  }
88
396
 
89
397
  REBASE_TEST_CASES = with_compactor_version_func_test_param(REBASE_TEST_CASES)
@@ -798,6 +798,67 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
798
798
  skip_enabled_compact_partition_drivers=None,
799
799
  assert_compaction_audit=None,
800
800
  ),
801
+ "14-rebase-then-incremental-with-null-pk": RebaseThenIncrementalCompactionTestCaseParams(
802
+ primary_keys={"pk_col_1"},
803
+ sort_keys=[
804
+ SortKey.of(key_name="sk_col_1"),
805
+ SortKey.of(key_name="sk_col_2"),
806
+ ],
807
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
808
+ partition_values=["1"],
809
+ input_deltas=pa.Table.from_arrays(
810
+ [
811
+ pa.array([str(i) for i in range(9)] + [None]),
812
+ pa.array([i for i in range(0, 10)]),
813
+ pa.array(["foo"] * 10),
814
+ pa.array([i / 10 for i in range(10, 20)]),
815
+ ],
816
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
817
+ ),
818
+ input_deltas_delta_type=DeltaType.UPSERT,
819
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
820
+ [
821
+ pa.array([str(i) for i in range(9)] + [None]),
822
+ pa.array([i for i in range(0, 10)]),
823
+ pa.array(["foo"] * 10),
824
+ pa.array([i / 10 for i in range(10, 20)]),
825
+ ],
826
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
827
+ ),
828
+ incremental_deltas=[
829
+ (
830
+ pa.Table.from_arrays(
831
+ [
832
+ pa.array([str(i) for i in range(9)] + [None]),
833
+ pa.array([i for i in range(20, 30)]),
834
+ pa.array(["foo"] * 10),
835
+ pa.array([i / 10 for i in range(40, 50)]),
836
+ ],
837
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
838
+ ),
839
+ DeltaType.UPSERT,
840
+ None,
841
+ )
842
+ ],
843
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
844
+ [
845
+ pa.array([str(i) for i in range(9)] + [None]),
846
+ pa.array([i for i in range(20, 30)]),
847
+ pa.array(["foo"] * 10),
848
+ pa.array([i / 10 for i in range(40, 50)]),
849
+ ],
850
+ names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
851
+ ),
852
+ expected_terminal_exception=None,
853
+ expected_terminal_exception_message=None,
854
+ do_create_placement_group=False,
855
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
856
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
857
+ read_kwargs_provider=None,
858
+ drop_duplicates=True,
859
+ skip_enabled_compact_partition_drivers=None,
860
+ assert_compaction_audit=assert_compaction_audit,
861
+ ),
801
862
  }
802
863
 
803
864
  REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
@@ -1983,6 +2044,104 @@ REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
1983
2044
  skip_enabled_compact_partition_drivers=None,
1984
2045
  assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
1985
2046
  ),
2047
+ "31-rebase-then-incremental-delete-delta-on-incremental-null-pk-delete-null": RebaseThenIncrementalCompactionTestCaseParams(
2048
+ primary_keys={"pk_col_1"},
2049
+ sort_keys=ZERO_VALUED_SORT_KEY,
2050
+ partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
2051
+ partition_values=["1"],
2052
+ input_deltas=pa.Table.from_arrays(
2053
+ [
2054
+ pa.array([i for i in range(11)] + [None]),
2055
+ pa.array([str(i) for i in range(0, 12)]),
2056
+ ],
2057
+ names=["pk_col_1", "col_1"],
2058
+ ),
2059
+ input_deltas_delta_type=DeltaType.UPSERT,
2060
+ rebase_expected_compact_partition_result=pa.Table.from_arrays(
2061
+ [
2062
+ pa.array([i for i in range(11)] + [None]),
2063
+ pa.array([str(i) for i in range(0, 12)]),
2064
+ ],
2065
+ names=["pk_col_1", "col_1"],
2066
+ ),
2067
+ incremental_deltas=[
2068
+ (
2069
+ pa.Table.from_arrays(
2070
+ [
2071
+ pa.array([10, 11, None, 13]),
2072
+ pa.array(["a", "b", "c", "d"]),
2073
+ ],
2074
+ names=["pk_col_1", "col_1"],
2075
+ ),
2076
+ DeltaType.UPSERT,
2077
+ None,
2078
+ ),
2079
+ (
2080
+ pa.Table.from_arrays(
2081
+ [pa.array([10, 11]), pa.array(["a", "b"])],
2082
+ names=["pk_col_1", "col_1"],
2083
+ ),
2084
+ DeltaType.DELETE,
2085
+ DeleteParameters.of(["pk_col_1", "col_1"]),
2086
+ ),
2087
+ (
2088
+ pa.Table.from_arrays(
2089
+ [pa.array([None])], # Support deleting null PK records
2090
+ names=["pk_col_1"],
2091
+ ),
2092
+ DeltaType.DELETE,
2093
+ DeleteParameters.of(["pk_col_1"]),
2094
+ ),
2095
+ (
2096
+ pa.Table.from_arrays(
2097
+ [pa.array(["c"])],
2098
+ names=["col_1"],
2099
+ ),
2100
+ DeltaType.DELETE,
2101
+ DeleteParameters.of(["col_1"]),
2102
+ ),
2103
+ (
2104
+ pa.Table.from_arrays(
2105
+ [pa.array(["c"])],
2106
+ names=["col_1"],
2107
+ ),
2108
+ DeltaType.DELETE,
2109
+ DeleteParameters.of(["col_1"]),
2110
+ ),
2111
+ (
2112
+ pa.Table.from_arrays(
2113
+ [pa.array([10, 11]), pa.array(["a", "b"])],
2114
+ names=["pk_col_1", "col_1"],
2115
+ ),
2116
+ DeltaType.DELETE,
2117
+ DeleteParameters.of(["pk_col_1", "col_1"]),
2118
+ ),
2119
+ (
2120
+ pa.Table.from_arrays(
2121
+ [pa.array(["c"])],
2122
+ names=["col_1"],
2123
+ ),
2124
+ DeltaType.DELETE,
2125
+ DeleteParameters.of(["col_1"]),
2126
+ ),
2127
+ ],
2128
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
2129
+ [
2130
+ pa.array([i for i in range(10)] + [13]),
2131
+ pa.array([str(i) for i in range(0, 10)] + ["d"]),
2132
+ ],
2133
+ names=["pk_col_1", "col_1"],
2134
+ ),
2135
+ expected_terminal_exception=None,
2136
+ expected_terminal_exception_message=None,
2137
+ do_create_placement_group=False,
2138
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
2139
+ hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
2140
+ read_kwargs_provider=None,
2141
+ drop_duplicates=True,
2142
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
2143
+ assert_compaction_audit=assert_compaction_audit,
2144
+ ),
1986
2145
  }
1987
2146
 
1988
2147
  REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
@@ -249,6 +249,7 @@ def test_compact_partition_rebase_same_source_and_destination(
249
249
  pgm = PlacementGroupManager(
250
250
  1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
251
251
  ).pgs[0]
252
+ last_stream_position_to_compact = source_partition.stream_position
252
253
  compact_partition_params = CompactPartitionParams.of(
253
254
  {
254
255
  "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
@@ -258,17 +259,19 @@ def test_compact_partition_rebase_same_source_and_destination(
258
259
  "deltacat_storage_kwargs": ds_mock_kwargs,
259
260
  "destination_partition_locator": rebased_partition.locator,
260
261
  "hash_bucket_count": hash_bucket_count_param,
261
- "last_stream_position_to_compact": source_partition.stream_position,
262
+ "last_stream_position_to_compact": last_stream_position_to_compact,
262
263
  "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
263
264
  "object_store": RayPlasmaObjectStore(),
264
265
  "pg_config": pgm,
265
266
  "primary_keys": primary_keys,
266
267
  "read_kwargs_provider": read_kwargs_provider_param,
267
268
  "rebase_source_partition_locator": source_partition.locator,
269
+ "rebase_source_partition_high_watermark": rebased_partition.stream_position,
268
270
  "records_per_compacted_file": records_per_compacted_file_param,
269
271
  "s3_client_kwargs": {},
270
272
  "source_partition_locator": rebased_partition.locator,
271
273
  "sort_keys": sort_keys if sort_keys else None,
274
+ "drop_duplicates": drop_duplicates_param,
272
275
  }
273
276
  )
274
277
 
@@ -300,14 +303,20 @@ def test_compact_partition_rebase_same_source_and_destination(
300
303
  compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
301
304
  s3_resource, rcf_file_s3_uri
302
305
  )
306
+ assert (
307
+ compacted_delta_locator.stream_position == last_stream_position_to_compact
308
+ ), "Compacted delta locator must be equal to last stream position"
303
309
  tables = ds.download_delta(
304
310
  compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
305
311
  )
306
312
  actual_rebase_compacted_table = pa.concat_tables(tables)
307
313
  # if no primary key is specified then sort by sort_key for consistent assertion
308
- sorting_cols: List[Any] = (
309
- [(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
310
- )
314
+ sorting_cols: List[Any] = []
315
+ if primary_keys:
316
+ sorting_cols.extend([(val, "ascending") for val in primary_keys])
317
+ if sort_keys:
318
+ sorting_cols.extend(sort_keys)
319
+
311
320
  rebase_expected_compact_partition_result = (
312
321
  rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
313
322
  )
@@ -193,8 +193,10 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
193
193
  ds_mock_kwargs: Optional[Dict[str, Any]],
194
194
  ) -> Tuple[Stream, Stream, Optional[Stream]]:
195
195
  import deltacat.tests.local_deltacat_storage as ds
196
- from deltacat.storage import Partition, Stream
196
+ from deltacat.storage import Delta
197
+ from deltacat.utils.common import current_time_ms
197
198
 
199
+ last_stream_position = current_time_ms()
198
200
  source_namespace, source_table_name, source_table_version = create_src_table(
199
201
  primary_keys, sort_keys, partition_keys, ds_mock_kwargs
200
202
  )
@@ -208,10 +210,12 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
208
210
  staged_partition: Partition = ds.stage_partition(
209
211
  source_table_stream, partition_values, **ds_mock_kwargs
210
212
  )
213
+ staged_delta: Delta = ds.stage_delta(
214
+ input_deltas, staged_partition, input_delta_type, **ds_mock_kwargs
215
+ )
216
+ staged_delta.locator.stream_position = last_stream_position
211
217
  ds.commit_delta(
212
- ds.stage_delta(
213
- input_deltas, staged_partition, input_delta_type, **ds_mock_kwargs
214
- ),
218
+ staged_delta,
215
219
  **ds_mock_kwargs,
216
220
  )
217
221
  ds.commit_partition(staged_partition, **ds_mock_kwargs)
@@ -244,8 +248,12 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
244
248
  staged_partition: Partition = ds.stage_partition(
245
249
  rebasing_table_stream, partition_values, **ds_mock_kwargs
246
250
  )
251
+ staged_delta: Delta = ds.stage_delta(
252
+ input_deltas, staged_partition, **ds_mock_kwargs
253
+ )
254
+ staged_delta.locator.stream_position = last_stream_position
247
255
  ds.commit_delta(
248
- ds.stage_delta(input_deltas, staged_partition, **ds_mock_kwargs),
256
+ staged_delta,
249
257
  **ds_mock_kwargs,
250
258
  )
251
259
  ds.commit_partition(staged_partition, **ds_mock_kwargs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.17
3
+ Version: 1.1.18
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,10 +1,10 @@
1
- deltacat/__init__.py,sha256=x9MIXX9uM_Gf7hvnQuf0dQwXgyXztDLmGSbM1F27e10,1778
1
+ deltacat/__init__.py,sha256=-PrYkT-sQtWxMQFFAp4vobJJ8-dq-3EA5LjPiH5sFFQ,1778
2
2
  deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
3
- deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
3
+ deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
4
4
  deltacat/logs.py,sha256=_UAc_6GiQR3mxccys32Cp2CZOKOVZ9L-AkNUAlzepns,9091
5
5
  deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  deltacat/aws/clients.py,sha256=4eQvpkV1PzFfxog7EriuglOGGwNFHR5hbGYpjsNNPxk,6949
7
- deltacat/aws/constants.py,sha256=1HnDXrSokW-G3YA3qKEiv7fZVntDs1uSk6a7On-VG5k,1223
7
+ deltacat/aws/constants.py,sha256=hcYAUot4ahq9GXCMClQiuYCtiDs5XaOebdUoKg4V84k,1222
8
8
  deltacat/aws/s3u.py,sha256=IdT0XqDXVOkPdo5Em5u3qAkV1UXFpXaE1rTkUDKv4f4,28578
9
9
  deltacat/aws/redshift/__init__.py,sha256=7SvjG-dqox8zZUhFicTsUvpG5vXYDl_QQ3ohlHOgTKc,342
10
10
  deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -76,7 +76,7 @@ deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQ
76
76
  deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
77
77
  deltacat/compute/compactor_v2/utils/io.py,sha256=5zwJEW_UHv9ttQ2exJ23ZnExwBQXn1KgN7FDx1MGYv0,5262
78
78
  deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
79
- deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=ghyIifjXtqXgi8lN3lfnVQ2vi8uk_ny0FE7hsQlLjRQ,11538
79
+ deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=QOMwWxGhZ7VWa3oE6InM4thR5pbjmT7ttNXvx_IiKjo,11676
80
80
  deltacat/compute/compactor_v2/utils/task_options.py,sha256=VXvoVVUq5re8NiOoyrfz34qSRiOTB0IkxHJlMqKsBmk,14066
81
81
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
82
82
  deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -145,18 +145,18 @@ deltacat/tests/aws/test_s3u.py,sha256=FsYCH8K8DsDRPOtTp-w1Nu3ATqt4p1mqDo6aVJV-Sb
145
145
  deltacat/tests/catalog/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  deltacat/tests/catalog/test_default_catalog_impl.py,sha256=2l5uwmtLlUJ9yH1LDggtj81fa-pHqbE0-VBt6G4Hyc0,3180
147
147
  deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
148
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=vyqwuKYOb4FtRwC5r1SJf7kcZNYXoiGb-BUrBgr5_Xw,34852
149
- deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=NfcB1aTq6HeYMFrtooIIoifzLp5U0xFTN6F7Lpk8cYQ,3143
150
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=Kl5A7hoJ0pVOE-rZna_4XcuIjhuzQbJudvnfNYKHgGo,75436
148
+ deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
149
+ deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
150
+ deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
151
151
  deltacat/tests/compute/compact_partition_test_cases.py,sha256=R9eiKvxCLqcoHjAx3iOogdnXZEO9TvLbRf0wA7bcJN4,26170
152
152
  deltacat/tests/compute/test_compact_partition_incremental.py,sha256=Z0hyQGhMZjCaOn1Vk4qUbgDiS7HDhtdNeFQyG1PJhqA,14559
153
153
  deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xhKCurTA29Y78_1eksUVJ0W35zNNZYm40rMpMM9ynvM,11853
154
154
  deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
155
- deltacat/tests/compute/test_compact_partition_rebase.py,sha256=p97zJmEoC2t6R12luSkCKjjBl50l4UGzh-IHdiQdpCs,11445
155
+ deltacat/tests/compute/test_compact_partition_rebase.py,sha256=O_IwZ1Xeaff98V1XYOyVD8PoS_EpVXSQcHWz4In8bK4,11889
156
156
  deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=CHHfNFEJW8S1We7NE1Gg6EaoKEWnaOMRxWrLyirrahc,14643
157
157
  deltacat/tests/compute/test_util_common.py,sha256=oTkTuo6wscVN8hmoQASIKP_DJN-M0um_ySCOcXv9AJA,11699
158
158
  deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
159
- deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=k9lq_3r_kNMzruTSn4JE7yjdBBUT3Lh-l8khSYdYpYs,12945
159
+ deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=Q3HJj1fjoe2JwRUOW8KEjbTqPIIoP2o_T3ZGH6SJnCM,13244
160
160
  deltacat/tests/compute/compactor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
161
  deltacat/tests/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
162
  deltacat/tests/compute/compactor/steps/test_repartition.py,sha256=0uRguPEKeLSYs746Jv8io-HZMWdyXNcOMBu8GO2mA0M,9305
@@ -220,8 +220,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
220
220
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
221
221
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
222
222
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
223
- deltacat-1.1.17.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
224
- deltacat-1.1.17.dist-info/METADATA,sha256=SxYHCGGCCaUP0Yej0eT0BMV3K8lQdOmodQj6VSnALcU,1734
225
- deltacat-1.1.17.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
226
- deltacat-1.1.17.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
227
- deltacat-1.1.17.dist-info/RECORD,,
223
+ deltacat-1.1.18.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
224
+ deltacat-1.1.18.dist-info/METADATA,sha256=aMHqD2bxsU5-IHUUjn47PFrRP01jh1wlrDtOHeHGGaA,1734
225
+ deltacat-1.1.18.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
226
+ deltacat-1.1.18.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
227
+ deltacat-1.1.18.dist-info/RECORD,,