deltacat 1.1.34__py3-none-any.whl → 1.1.36__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.34"
+__version__ = "1.1.36"
 
 
 __all__ = [
@@ -69,14 +69,17 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
     assert (
         params.hash_bucket_count is not None and params.hash_bucket_count >= 1
     ), "hash_bucket_count is a required arg for compactor v2"
+    assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
     if params.num_rounds > 1:
         assert (
             not params.drop_duplicates
         ), "num_rounds > 1, drop_duplicates must be False but is True"
 
-    with memray.Tracker(
-        "compaction_partition.bin"
-    ) if params.enable_profiler else nullcontext():
+    with (
+        memray.Tracker("compaction_partition.bin")
+        if params.enable_profiler
+        else nullcontext()
+    ):
         execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
             params,
             **kwargs,
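The rewritten `with` block above is just a reshaping of an existing idiom: `memray.Tracker` is entered when profiling is enabled, and `contextlib.nullcontext()` stands in for it otherwise, so the body runs either way. A minimal sketch of that conditional-context-manager pattern, assuming only that `memray.Tracker` accepts an output file path (the helper name and the `work` callable below are illustrative, not deltacat APIs):

```python
from contextlib import nullcontext

try:
    import memray  # optional profiler; the sketch degrades gracefully without it
except ImportError:
    memray = None


def run_with_optional_profiling(work, enable_profiler: bool, output_path: str = "profile.bin"):
    """Run `work()` under a memray Tracker only when profiling is requested."""
    tracker = (
        memray.Tracker(output_path)
        if enable_profiler and memray is not None
        else nullcontext()
    )
    with tracker:
        return work()


result = run_with_optional_profiling(lambda: sum(range(1000)), enable_profiler=False)
```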
@@ -1,4 +1,4 @@
-from deltacat.utils.common import env_bool, env_integer
+from deltacat.utils.common import env_bool, env_integer, env_string
 
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
@@ -92,3 +92,18 @@ DEFAULT_NUM_ROUNDS = 1
 SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
     "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
 )
+
+# This env variable specifies whether to check bucketing spec
+# compliance of the existing compacted table.
+# PRINT_LOG: Enable logging if any partition is found
+# to be non-compliant with the bucketing spec.
+# ASSERT: Fail the job with ValidationError if the
+# current compacted partition is found to be non-compliant
+# with bucketing spec. Note, logging is implicitly enabled
+# in this case.
+BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
+    "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
+)
+
+BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
+BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
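Because `BUCKETING_SPEC_COMPLIANCE_PROFILE` is a module-level constant resolved via `env_string` when the constants module is imported, the environment variable has to be set before deltacat is imported; the tests later in this diff instead monkeypatch the imported constant directly. A small sketch of enabling the profile, using only names shown in this diff:

```python
import os

# Set the profile before importing deltacat; the constant is read at import time.
os.environ["BUCKETING_SPEC_COMPLIANCE_PROFILE"] = "PRINT_LOG"  # or "ASSERT"

from deltacat.compute.compactor_v2.constants import (
    BUCKETING_SPEC_COMPLIANCE_PROFILE,
    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
)

assert BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_PRINT_LOG
```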
@@ -48,6 +48,7 @@ class MergeInput(Dict):
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
         memory_logs_enabled: Optional[bool] = None,
         disable_copy_by_reference: Optional[bool] = None,
+        hash_bucket_count: Optional[int] = None,
     ) -> MergeInput:
 
         result = MergeInput()
@@ -71,6 +72,7 @@ class MergeInput(Dict):
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
         result["memory_logs_enabled"] = memory_logs_enabled
         result["disable_copy_by_reference"] = disable_copy_by_reference
+        result["hash_bucket_count"] = hash_bucket_count
         return result
 
     @property
@@ -154,3 +156,7 @@ class MergeInput(Dict):
     @property
     def disable_copy_by_reference(self) -> bool:
         return self["disable_copy_by_reference"]
+
+    @property
+    def hash_bucket_count(self) -> int:
+        return self["hash_bucket_count"]
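MergeInput stores its arguments in the dict it subclasses and reads them back through properties, which is why threading `hash_bucket_count` through requires touching the `of()` factory, the assignment block, and a new property. A stripped-down sketch of the same dict-backed model pattern (the `ExampleInput` class and its fields are illustrative, not part of deltacat):

```python
from typing import Optional


class ExampleInput(dict):
    """Illustrative dict-backed parameter model in the style of MergeInput."""

    @staticmethod
    def of(
        drop_duplicates: bool = True,
        hash_bucket_count: Optional[int] = None,
    ) -> "ExampleInput":
        result = ExampleInput()
        result["drop_duplicates"] = drop_duplicates
        result["hash_bucket_count"] = hash_bucket_count
        return result

    @property
    def drop_duplicates(self) -> bool:
        return self["drop_duplicates"]

    @property
    def hash_bucket_count(self) -> Optional[int]:
        return self["hash_bucket_count"]


params = ExampleInput.of(hash_bucket_count=4)
assert params.hash_bucket_count == 4
```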
@@ -438,6 +438,7 @@ def _merge(
                 delete_file_envelopes=delete_file_envelopes,
                 memory_logs_enabled=params.memory_logs_enabled,
                 disable_copy_by_reference=params.disable_copy_by_reference,
+                hash_bucket_count=params.hash_bucket_count,
             )
         }
 
@@ -32,6 +32,7 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
+    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
@@ -47,6 +48,9 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
+    BUCKETING_SPEC_COMPLIANCE_PROFILE,
+    BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -58,6 +62,10 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+_EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
+_INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
+
+
 def _append_delta_type_column(table: pa.Table, value: np.bool_):
     return table.append_column(
         sc._DELTA_TYPE_COLUMN_FIELD,
@@ -108,6 +116,8 @@ def _merge_tables(
     table: pa.Table,
     primary_keys: List[str],
     can_drop_duplicates: bool,
+    hb_index: int,
+    num_buckets: int,
     compacted_table: Optional[pa.Table] = None,
 ) -> pa.Table:
     """
@@ -126,6 +136,20 @@ def _merge_tables(
 
     all_tables.append(table)
 
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            table=all_tables[incremental_idx],
+            num_buckets=num_buckets,
+            primary_keys=primary_keys,
+            hb_index=hb_index,
+            log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
+        )
+
     if not primary_keys or not can_drop_duplicates:
         logger.info(
             f"Not dropping duplicates for primary keys={primary_keys} "
@@ -188,9 +212,47 @@ def _merge_tables(
     return final_table
 
 
+def _validate_bucketing_spec_compliance(
+    table: pa.Table,
+    num_buckets: int,
+    hb_index: int,
+    primary_keys: List[str],
+    rcf: RoundCompletionInfo = None,
+    log_prefix=None,
+) -> None:
+    if rcf is not None:
+        message_prefix = f"{log_prefix}{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}.{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}.{rcf.compacted_delta_locator.partition_values}"
+    else:
+        message_prefix = f"{log_prefix}"
+    pki_table = generate_pk_hash_column(
+        [table], primary_keys=primary_keys, requires_hash=True
+    )[0]
+    is_not_compliant: bool = False
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
+        if hash_bucket != hb_index:
+            is_not_compliant = True
+            logger.info(
+                f"{message_prefix} has non-compliant bucketing spec at index: {index} "
+                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
+            )
+            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
+                raise AssertionError(
+                    f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
+                    f" to be {hb_index} but found {hash_bucket}"
+                )
+            # No further checks necessary
+            break
+    if not is_not_compliant:
+        logger.debug(
+            f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
+        )
+
+
 def _download_compacted_table(
     hb_index: int,
     rcf: RoundCompletionInfo,
+    primary_keys: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[dict] = None,
@@ -214,7 +276,28 @@ def _download_compacted_table(
 
         tables.append(table)
 
-    return pa.concat_tables(tables)
+    compacted_table = pa.concat_tables(tables)
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    logger.debug(
+        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
+        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
+    )
+
+    # Bucketing spec compliance isn't required without primary keys
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            compacted_table,
+            rcf.hash_bucket_count,
+            hb_index,
+            primary_keys,
+            rcf=rcf,
+            log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
+        )
+    return compacted_table
 
 
 def _copy_all_manifest_files_from_old_hash_buckets(
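Both call sites above gate the check on having primary keys and an enabled compliance profile, then re-hash each record's primary key and map the digest back to a bucket via `pk_digest_to_hash_bucket_index`; any record that maps to a bucket other than the one its file claims is reported as hash bucket drift. A rough, hypothetical illustration of that digest-to-bucket mapping (the real hashing and `pk_digest_to_hash_bucket_index` implementation in deltacat may differ in detail):

```python
import hashlib


def digest_to_bucket_index(pk_digest_hex: str, num_buckets: int) -> int:
    # Hypothetical: interpret the hex digest of the primary key as an integer
    # and fold it into one of `num_buckets` buckets.
    return int(pk_digest_hex, 16) % num_buckets


pk_value = "pk-00042"
digest = hashlib.sha1(pk_value.encode("utf-8")).hexdigest()
bucket = digest_to_bucket_index(digest, num_buckets=4)

# A record assigned to hash bucket `bucket` at write time must hash back to the
# same bucket later; any mismatch is the "hash bucket drift" the validator reports.
assert 0 <= bucket < 4
```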
@@ -417,12 +500,12 @@ def _compact_tables(
         _group_sequence_by_delta_type(reordered_all_dfes)
     ):
         if delta_type is DeltaType.UPSERT:
-            (
-                table,
-                incremental_len,
-                deduped_records,
-                merge_time,
-            ) = _apply_upserts(input, delta_type_sequence, hb_idx, table)
+            (table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
+                input=input,
+                dfe_list=delta_type_sequence,
+                hb_idx=hb_idx,
+                prev_table=table,
+            )
             logger.info(
                 f" [Merge task index {input.merge_task_index}] Merged"
                 f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
@@ -481,6 +564,8 @@ def _apply_upserts(
         primary_keys=input.primary_keys,
         can_drop_duplicates=input.drop_duplicates,
         compacted_table=prev_table,
+        hb_index=hb_idx,
+        num_buckets=input.hash_bucket_count,
     )
     deduped_records = hb_table_record_count - len(table)
     return table, incremental_len, deduped_records, merge_time
@@ -543,6 +628,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         compacted_table = _download_compacted_table(
             hb_index=merge_file_group.hb_index,
             rcf=input.round_completion_info,
+            primary_keys=input.primary_keys,
             read_kwargs_provider=input.read_kwargs_provider,
             deltacat_storage=input.deltacat_storage,
             deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -133,4 +133,5 @@ def generate_local_merge_input(
         delete_strategy=delete_strategy,
         delete_file_envelopes=delete_file_envelopes,
         disable_copy_by_reference=params.disable_copy_by_reference,
+        hash_bucket_count=params.hash_bucket_count,
     )
@@ -4,9 +4,11 @@ import os
 import pyarrow as pa
 import pytest
 import boto3
+import json
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
+from deltacat.exceptions import ValidationError
 from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
@@ -88,6 +90,17 @@ def disable_sha1(monkeypatch):
     )
 
 
+@pytest.fixture(scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
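Because the merge step imports `BUCKETING_SPEC_COMPLIANCE_PROFILE` into its own namespace, the fixture patches the copy on `deltacat.compute.compactor_v2.steps.merge` rather than the constants module or the environment, and pytest reverts the patch after each test. A hedged sketch of how a test opts in by requesting the fixture by name (the test name below is hypothetical):

```python
def test_merge_sees_assert_profile(enable_bucketing_spec_validation):
    # The attribute is patched on the module that uses it, so the merge step
    # observes "ASSERT" for the duration of this test only.
    from deltacat.compute.compactor_v2.steps import merge

    assert merge.BUCKETING_SPEC_COMPLIANCE_PROFILE == "ASSERT"
```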
@@ -689,3 +702,307 @@ class TestCompactionSession:
             incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
         )
         assert incremental_rcf.compacted_pyarrow_write_result.records == 4
+
+    def test_compact_partition_when_bucket_spec_validation_fails(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+        enable_bucketing_spec_validation,
+    ):
+        """
+        A test case which asserts the bucketing spec validation throws an assertion error
+        when the validation has failed.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        backfill_rcf = get_rcf(s3_resource, rcf_url)
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        # Move the records to different hash buckets to simulate a validation failure.
+        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
+        s3_resource.Bucket(bucket).put_object(
+            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
+        )
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        with pytest.raises(ValidationError) as excinfo:
+            compact_partition(
+                CompactPartitionParams.of(
+                    {
+                        "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                        "compacted_file_content_type": ContentType.PARQUET,
+                        "dd_max_parallelism_ratio": 1.0,
+                        "deltacat_storage": ds,
+                        "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                        "destination_partition_locator": new_destination_partition.locator,
+                        "drop_duplicates": True,
+                        "hash_bucket_count": 4,
+                        "last_stream_position_to_compact": new_source_delta.stream_position,
+                        "list_deltas_kwargs": {
+                            **local_deltacat_storage_kwargs,
+                            **{"equivalent_table_types": []},
+                        },
+                        "primary_keys": ["pk"],
+                        "rebase_source_partition_locator": None,
+                        "rebase_source_partition_high_watermark": None,
+                        "records_per_compacted_file": 4000,
+                        "s3_client_kwargs": {},
+                        "source_partition_locator": new_source_delta.partition_locator,
+                    }
+                )
+            )
+
+        assert (
+            "Hash bucket drift detected at index: 0. Expected hash bucket index to be 1 but found 0"
+            in str(excinfo.value)
+        )
+
+    def test_compact_partition_when_bucket_spec_validation_fails_but_env_variable_disabled(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+    ):
+        """
+        A test case which asserts even if bucketing spec validation fails, compaction doesn't
+        throw an error if the feature is not enabled.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        backfill_rcf = get_rcf(s3_resource, rcf_url)
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        # Move the records to different hash buckets to simulate a validation failure.
+        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
+        s3_resource.Bucket(bucket).put_object(
+            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
+        )
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        new_rcf = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": new_destination_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": new_source_delta.partition_locator,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, new_rcf)
+        assert incremental_rcf.hash_bucket_count == 4
+        assert len(incremental_rcf.hb_index_to_entry_range) == 2
+
+    def test_compact_partition_when_bucket_spec_validation_succeeds(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+        enable_bucketing_spec_validation,
+    ):
+        """
+        A test case which asserts the bucketing spec validation does not throw
+        and error when the validation succeeds.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        rcf = get_rcf(s3_resource, rcf_url)
+        assert rcf.hash_bucket_count == 4
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        new_uri = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": new_destination_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": new_source_delta.partition_locator,
+                }
+            )
+        )
+
+        rcf = get_rcf(s3_resource, new_uri)
+        assert rcf.hash_bucket_count == 4
@@ -119,6 +119,21 @@ def offer_local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
@@ -114,6 +114,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
@@ -114,6 +114,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
@@ -118,6 +118,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.34
+Version: 1.1.36
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=g15eDSUMw0XN1VLww6GpBKnKFtB4WGCcMnJe4SlGOqg,1778
+deltacat/__init__.py,sha256=9vJMHGceWew6atD_3VqKurlBJ3crD5mwAQIgSB1yjNY,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -50,8 +50,8 @@ deltacat/compute/compactor/utils/round_completion_file.py,sha256=fFevhUuveCvrU3g
 deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZRG2NqBlCw1X3_FBc,2397
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
+deltacat/compute/compactor_v2/compaction_session.py,sha256=RbO_du0qX7nlyXO-ZSksX8RqWuRwfdvWddpTJjLDVNk,8185
+deltacat/compute/compactor_v2/constants.py,sha256=F5Phrh-2JgnWvtjHXacxOG5Z2ivKcHnboerI12rc1zk,3632
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -63,19 +63,19 @@ deltacat/compute/compactor_v2/model/evaluate_compaction_result.py,sha256=XAaEEAd
 deltacat/compute/compactor_v2/model/hash_bucket_input.py,sha256=iJy8kLi1dIpFIyfoAjkaAtZvg8Np1z7BsUNGAcWfFm4,3042
 deltacat/compute/compactor_v2/model/hash_bucket_result.py,sha256=EsY9BPPywhmxlcLKn3kGWzAX4s4BTR2vYyPUB-wAEOc,309
 deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViVO1SVljCj6f0B3MfB3hqtGm2S0s,7410
-deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
+deltacat/compute/compactor_v2/model/merge_input.py,sha256=D-6WuHK4X7m9-P6Hskz6RRemeWrNf6IPdhc14O3KDAg,5860
 deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
 deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
+deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=nz2N3YZVE9bNwOqRXoQYkArJhyUJRis2s9BweZ3tad8,30989
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=LliCkWxWZ5Yh7UxVGSDJ1aRViw5hUZhEzlWxoXftbxA,22909
+deltacat/compute/compactor_v2/steps/merge.py,sha256=4rKQ__SeWO_QLZl2btcFrYHCMOn-8R3kja74UrWOMgg,26225
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=t2j9H9IdFRH9EfpL-9g5XvZs9WK9HybqBGA7fDi82EM,8310
 deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
 deltacat/compute/compactor_v2/utils/io.py,sha256=Xjs7_D-0xKSetvllIe4o96aM1elfdjt1Ii7YfsHPvZs,6108
-deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
+deltacat/compute/compactor_v2/utils/merge.py,sha256=fAzEYwQYH2ia8MLdEFdZFivWHpi6qZu8AyyEK0H0vwE,5363
 deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
@@ -137,11 +137,11 @@ deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kW
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
 deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
-deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
-deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
+deltacat/tests/compute/test_compact_partition_incremental.py,sha256=8hUqnzeGIhAENcBxLL0R_yfjAaNTmRds6OWxQOmVqD8,15416
+deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=6d3F9E_4eO2Okh97v8NWFbEptPkzKoO0Qq8O6yAXrIs,13377
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
-deltacat/tests/compute/test_compact_partition_rebase.py,sha256=DNcpmnBo5QoZ23BiIhJCC3zaDK0xClZLUb2-ZEEp5s4,13108
-deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=Rxen3QGIaxVPa8lcO7NDMRxQ0aBjrOKn46LK5ZsfQTo,15073
+deltacat/tests/compute/test_compact_partition_rebase.py,sha256=vOF8wgTpdaWJKo47mK9aii3NKtwVwWgujoQyS8C3YyA,13535
+deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=BimvU-iHiF78WlE4xbfk4dzHm0a-frwpE7H7Kh4XkbE,15500
 deltacat/tests/compute/test_util_common.py,sha256=0mEHo38bgH64y0XZ_zgUL_aZgQMgJOSTlOYvIJxG_MM,11825
 deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
 deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=Q3HJj1fjoe2JwRUOW8KEjbTqPIIoP2o_T3ZGH6SJnCM,13244
@@ -152,7 +152,7 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=F1DFaranHekHB7HSNH-0_hV5ovdR5HfF9JqTVDw6Vh8,42575
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoiDuBUhgCmc3DYKCXL1g4QWtmROhZ0RJCQgePMY9as,9959
@@ -212,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.34.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-1.1.34.dist-info/METADATA,sha256=tndXv3uFRcc8PbYdaLMj9sZp9_uW7g94WGZBaNKFcUw,1733
-deltacat-1.1.34.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-deltacat-1.1.34.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-1.1.34.dist-info/RECORD,,
+deltacat-1.1.36.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.36.dist-info/METADATA,sha256=wIZbEGHnJWq_TBKi0u463p4-PgG9R_0MApw7IIwmnRc,1733
+deltacat-1.1.36.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.36.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.36.dist-info/RECORD,,