deltacat 1.1.34__py3-none-any.whl → 1.1.35__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/constants.py +16 -1
- deltacat/compute/compactor_v2/steps/merge.py +47 -1
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +317 -0
- deltacat/tests/compute/test_compact_partition_incremental.py +15 -0
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +15 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +15 -0
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +15 -0
- {deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/METADATA +1 -1
- {deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/RECORD +13 -13
- {deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/LICENSE +0 -0
- {deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/WHEEL +0 -0
- {deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/top_level.txt +0 -0
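Net effect of the release: the v2 compactor gains an opt-in bucketing spec compliance check. constants.py adds a BUCKETING_SPEC_COMPLIANCE_PROFILE knob with PRINT_LOG and ASSERT profiles, merge.py validates each downloaded compacted hash bucket against its expected bucket index, and the compaction test suites enable the ASSERT profile by default while test_compaction_session.py adds three targeted tests covering the failure, disabled, and success paths.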
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -1,4 +1,4 @@
-from deltacat.utils.common import env_bool, env_integer
+from deltacat.utils.common import env_bool, env_integer, env_string
 
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
@@ -92,3 +92,18 @@ DEFAULT_NUM_ROUNDS = 1
 SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
     "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
 )
+
+# This env variable specifies whether to check bucketing spec
+# compliance of the existing compacted table.
+# PRINT_LOG: Enable logging if any partition is found
+# to be non-compliant with the bucketing spec.
+# ASSERT: Fail the job with ValidationError if the
+# current compacted partition is found to be non-compliant
+# with bucketing spec. Note, logging is implicitly enabled
+# in this case.
+BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
+    "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
+)
+
+BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
+BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
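Because env_string is called at module scope, the compliance profile is captured once when constants.py is first imported. A minimal sketch of opting in from a driver script (the final assertion is illustrative):

import os

# Set before any deltacat.compute.compactor_v2 module is imported,
# since the profile is read once at import time via env_string().
os.environ["BUCKETING_SPEC_COMPLIANCE_PROFILE"] = "PRINT_LOG"  # or "ASSERT"

from deltacat.compute.compactor_v2 import constants

assert constants.BUCKETING_SPEC_COMPLIANCE_PROFILE == "PRINT_LOG"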
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -32,6 +32,7 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
+    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
@@ -47,6 +48,9 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
+    BUCKETING_SPEC_COMPLIANCE_PROFILE,
+    BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -188,9 +192,34 @@ def _merge_tables(
     return final_table
 
 
+def _validate_bucketing_spec_compliance(
+    table: pa.Table, rcf: RoundCompletionInfo, hb_index: int, primary_keys: List[str]
+) -> None:
+    pki_table = generate_pk_hash_column(
+        [table], primary_keys=primary_keys, requires_hash=True
+    )[0]
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket = pk_digest_to_hash_bucket_index(hash_value, rcf.hash_bucket_count)
+        if hash_bucket != hb_index:
+            logger.info(
+                f"{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}"
+                f".{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}"
+                f".{rcf.compacted_delta_locator.partition_values} has non-compliant bucketing spec. "
+                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
+            )
+            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
+                raise AssertionError(
+                    "Hash bucket drift detected. Expected hash bucket index"
+                    f" to be {hb_index} but found {hash_bucket}"
+                )
+            # No further checks necessary
+            break
+
+
 def _download_compacted_table(
     hb_index: int,
     rcf: RoundCompletionInfo,
+    primary_keys: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[dict] = None,
@@ -214,7 +243,23 @@ def _download_compacted_table(
 
         tables.append(table)
 
-    return pa.concat_tables(tables)
+    compacted_table = pa.concat_tables(tables)
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    logger.debug(
+        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
+        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
+    )
+
+    # Bucketing spec compliance isn't required without primary keys
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            compacted_table, rcf, hb_index, primary_keys
+        )
+    return compacted_table
 
 
 def _copy_all_manifest_files_from_old_hash_buckets(
@@ -543,6 +588,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
     compacted_table = _download_compacted_table(
         hb_index=merge_file_group.hb_index,
         rcf=input.round_completion_info,
+        primary_keys=input.primary_keys,
        read_kwargs_provider=input.read_kwargs_provider,
         deltacat_storage=input.deltacat_storage,
         deltacat_storage_kwargs=input.deltacat_storage_kwargs,
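For intuition about what the validation catches: a record's primary key digest deterministically maps to a hash bucket index, so every record stored under bucket i in the compacted table must map back to i under the round completion file's hash_bucket_count. The toy sketch below illustrates the drift idea only; the digest-to-bucket scheme here is an assumption for illustration, not deltacat's pk_digest_to_hash_bucket_index:

import hashlib

def toy_bucket_index(pk: str, hash_bucket_count: int) -> int:
    # Toy stand-in: digest the key, then reduce it modulo the bucket count.
    digest = hashlib.sha1(pk.encode("utf-8")).digest()
    return int.from_bytes(digest, "big") % hash_bucket_count

pk = "customer-42"
# If files written under a 4-bucket spec are later read as if they followed
# an 8-bucket spec (or the bucket labels are shuffled), records stop mapping
# back to the bucket that holds them; that mismatch is the "drift" the merge
# step now detects.
print(toy_bucket_index(pk, 4), toy_bucket_index(pk, 8))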
deltacat/tests/compute/compactor_v2/test_compaction_session.py
CHANGED
@@ -4,9 +4,11 @@ import os
 import pyarrow as pa
 import pytest
 import boto3
+import json
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
+from deltacat.exceptions import ValidationError
 from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
@@ -88,6 +90,17 @@ def disable_sha1(monkeypatch):
     )
 
 
+@pytest.fixture(scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
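Note on the fixture: merge.py binds BUCKETING_SPEC_COMPLIANCE_PROFILE by name at import time, so changing the environment variable inside a test would have no effect on an already-imported module. Rebinding the attribute on the imported module is what the merge step actually observes; a standalone illustration of the difference:

import os

import deltacat.compute.compactor_v2.steps.merge as merge_step

# Too late: merge.py already captured the constant when it was imported.
os.environ["BUCKETING_SPEC_COMPLIANCE_PROFILE"] = "ASSERT"

# Effective: rebind the name the merge step reads. The fixture above does
# this via monkeypatch.setattr, which also restores the value after the test.
merge_step.BUCKETING_SPEC_COMPLIANCE_PROFILE = "ASSERT"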
@@ -689,3 +702,307 @@ class TestCompactionSession:
             incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
         )
         assert incremental_rcf.compacted_pyarrow_write_result.records == 4
+
+    def test_compact_partition_when_bucket_spec_validation_fails(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+        enable_bucketing_spec_validation,
+    ):
+        """
+        A test case which asserts the bucketing spec validation throws an assertion error
+        when the validation has failed.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        backfill_rcf = get_rcf(s3_resource, rcf_url)
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        # Move the records to different hash buckets to simulate a validation failure.
+        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
+        s3_resource.Bucket(bucket).put_object(
+            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
+        )
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        with pytest.raises(ValidationError) as excinfo:
+            compact_partition(
+                CompactPartitionParams.of(
+                    {
+                        "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                        "compacted_file_content_type": ContentType.PARQUET,
+                        "dd_max_parallelism_ratio": 1.0,
+                        "deltacat_storage": ds,
+                        "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                        "destination_partition_locator": new_destination_partition.locator,
+                        "drop_duplicates": True,
+                        "hash_bucket_count": 4,
+                        "last_stream_position_to_compact": new_source_delta.stream_position,
+                        "list_deltas_kwargs": {
+                            **local_deltacat_storage_kwargs,
+                            **{"equivalent_table_types": []},
+                        },
+                        "primary_keys": ["pk"],
+                        "rebase_source_partition_locator": None,
+                        "rebase_source_partition_high_watermark": None,
+                        "records_per_compacted_file": 4000,
+                        "s3_client_kwargs": {},
+                        "source_partition_locator": new_source_delta.partition_locator,
+                    }
+                )
+            )
+
+        assert (
+            "Hash bucket drift detected. Expected hash bucket index to be 1 but found 0"
+            in str(excinfo.value)
+        )
+
+    def test_compact_partition_when_bucket_spec_validation_fails_but_env_variable_disabled(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+    ):
+        """
+        A test case which asserts even if bucketing spec validation fails, compaction doesn't
+        throw an error if the feature is not enabled.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        backfill_rcf = get_rcf(s3_resource, rcf_url)
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        # Move the records to different hash buckets to simulate a validation failure.
+        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
+        s3_resource.Bucket(bucket).put_object(
+            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
+        )
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        new_rcf = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": new_destination_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": new_source_delta.partition_locator,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, new_rcf)
+        assert incremental_rcf.hash_bucket_count == 4
+        assert len(incremental_rcf.hb_index_to_entry_range) == 2
+
+    def test_compact_partition_when_bucket_spec_validation_succeeds(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+        enable_bucketing_spec_validation,
+    ):
+        """
+        A test case which asserts the bucketing spec validation does not throw
+        and error when the validation succeeds.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        rcf = get_rcf(s3_resource, rcf_url)
+        assert rcf.hash_bucket_count == 4
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        new_uri = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": new_destination_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": new_source_delta.partition_locator,
+                }
+            )
+        )
+
+        rcf = get_rcf(s3_resource, new_uri)
+        assert rcf.hash_bucket_count == 4
deltacat/tests/compute/test_compact_partition_incremental.py
CHANGED
@@ -119,6 +119,21 @@ def offer_local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
deltacat/tests/compute/test_compact_partition_multiple_rounds.py
CHANGED
@@ -114,6 +114,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
deltacat/tests/compute/test_compact_partition_rebase.py
CHANGED
@@ -114,6 +114,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py
CHANGED
@@ -118,6 +118,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
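The same autouse fixture is repeated in all four modules because an autouse fixture defined in a test module applies only within that module. A single shared definition would conventionally live in a conftest.py; a hypothetical refactor (not what this release ships) would look like:

# deltacat/tests/compute/conftest.py (hypothetical location)
import pytest

@pytest.fixture(autouse=True)
def enable_bucketing_spec_validation(monkeypatch):
    """Fail any compaction test that exhibits hash bucket drift."""
    import deltacat.compute.compactor_v2.steps.merge as merge_step

    monkeypatch.setattr(merge_step, "BUCKETING_SPEC_COMPLIANCE_PROFILE", "ASSERT")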
{deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=br2aQSDj5eFS_j0mwGUSEQF386HRAXjiYg421vB9pME,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=
+deltacat/compute/compactor_v2/constants.py,sha256=F5Phrh-2JgnWvtjHXacxOG5Z2ivKcHnboerI12rc1zk,3632
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,7 +69,7 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=
+deltacat/compute/compactor_v2/steps/merge.py,sha256=T2G2AaVsezYzo6oJtpuXH-bYv8nt-yFHA5ZbDIGodQg,24971
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=t2j9H9IdFRH9EfpL-9g5XvZs9WK9HybqBGA7fDi82EM,8310
 deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
@@ -137,11 +137,11 @@ deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kW
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
 deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
-deltacat/tests/compute/test_compact_partition_incremental.py,sha256=
-deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=
+deltacat/tests/compute/test_compact_partition_incremental.py,sha256=8hUqnzeGIhAENcBxLL0R_yfjAaNTmRds6OWxQOmVqD8,15416
+deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=6d3F9E_4eO2Okh97v8NWFbEptPkzKoO0Qq8O6yAXrIs,13377
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
-deltacat/tests/compute/test_compact_partition_rebase.py,sha256=
-deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=
+deltacat/tests/compute/test_compact_partition_rebase.py,sha256=vOF8wgTpdaWJKo47mK9aii3NKtwVwWgujoQyS8C3YyA,13535
+deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=BimvU-iHiF78WlE4xbfk4dzHm0a-frwpE7H7Kh4XkbE,15500
 deltacat/tests/compute/test_util_common.py,sha256=0mEHo38bgH64y0XZ_zgUL_aZgQMgJOSTlOYvIJxG_MM,11825
 deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
 deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=Q3HJj1fjoe2JwRUOW8KEjbTqPIIoP2o_T3ZGH6SJnCM,13244
@@ -152,7 +152,7 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=zEXOIilybDpKuQt1ZRxGg4x_kUacBOcHE8KWcOmL01s,42563
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoiDuBUhgCmc3DYKCXL1g4QWtmROhZ0RJCQgePMY9as,9959
@@ -212,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.35.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.35.dist-info/METADATA,sha256=b8Z4aVdNYjBoy0_uh0m4yoU_8h2w8v7I2AZOwacv5Es,1733
+deltacat-1.1.35.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.35.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.35.dist-info/RECORD,,
{deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/LICENSE
File without changes
{deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/WHEEL
File without changes
{deltacat-1.1.34.dist-info → deltacat-1.1.35.dist-info}/top_level.txt
File without changes
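For readers unfamiliar with RECORD: each entry is path,sha256=<digest>,<size>, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with trailing padding stripped (per the wheel spec). A quick way to check an unpacked file against its RECORD entry:

import base64
import hashlib

def record_digest(path: str) -> str:
    # Reproduce the sha256=<...> token used in dist-info/RECORD entries.
    with open(path, "rb") as f:
        raw = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# e.g. record_digest("deltacat/constants.py") on an unpacked 1.1.35 wheel
# should print "TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE".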