deltacat 1.1.33__py3-none-any.whl → 1.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/constants.py +16 -1
- deltacat/compute/compactor_v2/steps/merge.py +47 -1
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -0
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/resource_estimation/delta.py +19 -1
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +317 -0
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +253 -0
- deltacat/tests/compute/test_compact_partition_incremental.py +15 -0
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +15 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +15 -0
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +15 -0
- {deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/METADATA +1 -1
- {deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/RECORD +17 -16
- {deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/LICENSE +0 -0
- {deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/WHEEL +0 -0
- {deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -1,4 +1,4 @@
-from deltacat.utils.common import env_bool, env_integer
+from deltacat.utils.common import env_bool, env_integer, env_string
 
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
@@ -92,3 +92,18 @@ DEFAULT_NUM_ROUNDS = 1
 SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
     "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
 )
+
+# This env variable specifies whether to check bucketing spec
+# compliance of the existing compacted table.
+# PRINT_LOG: Enable logging if any partition is found
+# to be non-compliant with the bucketing spec.
+# ASSERT: Fail the job with ValidationError if the
+# current compacted partition is found to be non-compliant
+# with bucketing spec. Note, logging is implicitly enabled
+# in this case.
+BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
+    "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
+)
+
+BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
+BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
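For reference, the new profile is read from the environment when the constants module is imported (via env_string above), so a minimal way to opt in might look like the sketch below. How the variable is propagated to Ray workers is intentionally left out and is an assumption of this example.

import os

# Set the profile before deltacat.compute.compactor_v2.constants is first
# imported, since env_string() reads the environment at module import time.
os.environ["BUCKETING_SPEC_COMPLIANCE_PROFILE"] = "ASSERT"  # or "PRINT_LOG"

from deltacat.compute.compactor_v2 import constants

print(constants.BUCKETING_SPEC_COMPLIANCE_PROFILE)  # expected: "ASSERT"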
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -32,6 +32,7 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
+    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
@@ -47,6 +48,9 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
+    BUCKETING_SPEC_COMPLIANCE_PROFILE,
+    BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -188,9 +192,34 @@ def _merge_tables(
     return final_table
 
 
+def _validate_bucketing_spec_compliance(
+    table: pa.Table, rcf: RoundCompletionInfo, hb_index: int, primary_keys: List[str]
+) -> None:
+    pki_table = generate_pk_hash_column(
+        [table], primary_keys=primary_keys, requires_hash=True
+    )[0]
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket = pk_digest_to_hash_bucket_index(hash_value, rcf.hash_bucket_count)
+        if hash_bucket != hb_index:
+            logger.info(
+                f"{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}"
+                f".{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}"
+                f".{rcf.compacted_delta_locator.partition_values} has non-compliant bucketing spec. "
+                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
+            )
+            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
+                raise AssertionError(
+                    "Hash bucket drift detected. Expected hash bucket index"
+                    f" to be {hb_index} but found {hash_bucket}"
+                )
+            # No further checks necessary
+            break
+
+
 def _download_compacted_table(
     hb_index: int,
     rcf: RoundCompletionInfo,
+    primary_keys: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[dict] = None,
@@ -214,7 +243,23 @@ def _download_compacted_table(
 
         tables.append(table)
 
-
+    compacted_table = pa.concat_tables(tables)
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    logger.debug(
+        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
+        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
+    )
+
+    # Bucketing spec compliance isn't required without primary keys
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            compacted_table, rcf, hb_index, primary_keys
+        )
+    return compacted_table
 
 
 def _copy_all_manifest_files_from_old_hash_buckets(
@@ -543,6 +588,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
     compacted_table = _download_compacted_table(
         hb_index=merge_file_group.hb_index,
         rcf=input.round_completion_info,
+        primary_keys=input.primary_keys,
         read_kwargs_provider=input.read_kwargs_provider,
         deltacat_storage=input.deltacat_storage,
         deltacat_storage_kwargs=input.deltacat_storage_kwargs,
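The validation added above boils down to: re-hash each primary key of the previously compacted table, map the digest to a hash bucket index, and flag the first row whose bucket differs from the bucket the file was stored under. The toy sketch below illustrates only that idea; the sha1-mod-bucket-count scheme here is an assumption and not deltacat's actual generate_pk_hash_column / pk_digest_to_hash_bucket_index implementation.

import hashlib
from typing import Iterable, Optional, Tuple

def toy_bucket(pk: str, hash_bucket_count: int) -> int:
    # Assumed scheme for illustration only: sha1(pk) mod bucket count.
    digest = hashlib.sha1(pk.encode("utf-8")).digest()
    return int.from_bytes(digest, "big") % hash_bucket_count

def find_drift(
    pks: Iterable[str], expected_hb_index: int, hash_bucket_count: int
) -> Optional[Tuple[str, int]]:
    # Return the first primary key whose computed bucket disagrees with the
    # bucket index the compacted file claims to belong to, if any.
    for pk in pks:
        actual = toy_bucket(pk, hash_bucket_count)
        if actual != expected_hb_index:
            return pk, actual
    return None

print(find_drift(["a", "b", "c"], expected_hb_index=1, hash_bucket_count=4))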
deltacat/compute/compactor_v2/utils/content_type_params.py
CHANGED
@@ -5,6 +5,7 @@ from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     MAX_PARQUET_METADATA_SIZE,
 )
+from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
@@ -75,11 +76,21 @@ def _download_parquet_metadata_for_manifest_entry(
     entry_index: int,
     deltacat_storage: unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> Dict[str, Any]:
+    logger.info(
+        f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
+    )
+    if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
+        logger.info(
+            "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
+        )
+        deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
     pq_file = deltacat_storage.download_delta_manifest_entry(
         delta,
         entry_index=entry_index,
         table_type=TableType.PYARROW_PARQUET,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
         **deltacat_storage_kwargs,
     )
 
@@ -97,11 +108,15 @@ def append_content_type_params(
     max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> bool:
     """
     This operation appends content type params into the delta entry. Note
     that this operation can be time consuming, hence we cache it in a Ray actor.
     """
+    logger.info(
+        f"Appending the content type params for Delta with locator {delta.locator}..."
+    )
 
     if not delta.meta:
         logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -159,6 +174,7 @@ append_content_type_params(
 
     def input_provider(index, item) -> Dict:
         return {
+            "file_reader_kwargs_provider": file_reader_kwargs_provider,
            "deltacat_storage_kwargs": deltacat_storage_kwargs,
            "deltacat_storage": deltacat_storage,
            "delta": delta,
@@ -168,6 +184,7 @@ append_content_type_params(
     logger.info(
         f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
     )
+
     pq_files_promise = invoke_parallel(
         entry_indices_to_download,
         ray_task=_download_parquet_metadata_for_manifest_entry,
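The pop of "file_reader_kwargs_provider" above guards against Python's "got multiple values for keyword argument" TypeError, which is raised when the same key is passed both as an explicit keyword and inside the **deltacat_storage_kwargs expansion. A standalone illustration of that failure mode and the fix (the function and values here are placeholders, not deltacat APIs):

def download(entry_index, file_reader_kwargs_provider=None, **kwargs):
    # Stand-in for a storage call that accepts the provider as a keyword.
    return entry_index, file_reader_kwargs_provider

storage_kwargs = {"file_reader_kwargs_provider": "from_storage_kwargs"}

try:
    download(0, file_reader_kwargs_provider="explicit", **storage_kwargs)
except TypeError as err:
    print(err)  # ...got multiple values for keyword argument...

storage_kwargs.pop("file_reader_kwargs_provider")
print(download(0, file_reader_kwargs_provider="explicit", **storage_kwargs))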
deltacat/compute/compactor_v2/utils/io.py
CHANGED
@@ -101,7 +101,6 @@ def create_uniform_input_deltas(
     delta_manifest_entries_count = 0
     estimated_da_bytes = 0
     input_da_list = []
-
     for delta in input_deltas:
         if (
             compact_partition_params.enable_input_split
@@ -118,6 +117,7 @@
                 deltacat_storage_kwargs=deltacat_storage_kwargs,
                 task_max_parallelism=compact_partition_params.task_max_parallelism,
                 max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
+                file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
             )
 
         manifest_entries = delta.manifest.entries
deltacat/compute/resource_estimation/delta.py
CHANGED
@@ -93,11 +93,29 @@ def _estimate_resources_required_to_process_delta_using_type_params(
             on_disk_size_bytes=delta.meta.content_length,
         ),
     )
-
+    file_reader_kwargs_provider = kwargs.get(
+        "file_reader_kwargs_provider"
+    ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
+
+    """
+    NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
+    1. Nested within deltacat_storage_kwargs during resource estimation
+    2. As a top-level attribute of CompactPartitionsParams during compaction
+
+    This creates an inconsistent parameter path between resource estimation and compaction flows.
+    As a long-term solution, this should be unified to use a single consistent path (either always
+    nested in deltacat_storage_kwargs or always as a top-level parameter).
+
+    For now, this implementation handles the resource estimation case by:
+    1. First checking for file_reader_kwargs_provider as a direct kwarg
+    2. Falling back to deltacat_storage_kwargs if not found
+    This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
+    """
     appended = append_content_type_params(
         delta=delta,
         deltacat_storage=deltacat_storage,
         deltacat_storage_kwargs=deltacat_storage_kwargs,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
     )
 
     if not appended:
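The NOTE above describes a lookup-with-fallback: prefer a top-level file_reader_kwargs_provider kwarg and fall back to the copy nested in deltacat_storage_kwargs. A minimal sketch of that resolution order (function and argument names are illustrative, not part of the deltacat API):

from typing import Any, Dict, Optional

def resolve_provider(
    kwargs: Dict[str, Any], deltacat_storage_kwargs: Dict[str, Any]
) -> Optional[Any]:
    # Direct kwarg wins; otherwise fall back to the nested copy, else None.
    return kwargs.get("file_reader_kwargs_provider") or deltacat_storage_kwargs.get(
        "file_reader_kwargs_provider"
    )

print(resolve_provider({}, {"file_reader_kwargs_provider": "nested"}))   # -> nested
print(resolve_provider({"file_reader_kwargs_provider": "direct"}, {}))   # -> direct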
deltacat/tests/compute/compactor_v2/test_compaction_session.py
CHANGED
@@ -4,9 +4,11 @@ import os
 import pyarrow as pa
 import pytest
 import boto3
+import json
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
+from deltacat.exceptions import ValidationError
 from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
@@ -88,6 +90,17 @@ def disable_sha1(monkeypatch):
     )
 
 
+@pytest.fixture(scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -689,3 +702,307 @@ class TestCompactionSession:
             incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
         )
         assert incremental_rcf.compacted_pyarrow_write_result.records == 4
+
+    def test_compact_partition_when_bucket_spec_validation_fails(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+        enable_bucketing_spec_validation,
+    ):
+        """
+        A test case which asserts the bucketing spec validation throws an assertion error
+        when the validation has failed.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        backfill_rcf = get_rcf(s3_resource, rcf_url)
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        # Move the records to different hash buckets to simulate a validation failure.
+        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
+        s3_resource.Bucket(bucket).put_object(
+            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
+        )
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        with pytest.raises(ValidationError) as excinfo:
+            compact_partition(
+                CompactPartitionParams.of(
+                    {
+                        "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                        "compacted_file_content_type": ContentType.PARQUET,
+                        "dd_max_parallelism_ratio": 1.0,
+                        "deltacat_storage": ds,
+                        "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                        "destination_partition_locator": new_destination_partition.locator,
+                        "drop_duplicates": True,
+                        "hash_bucket_count": 4,
+                        "last_stream_position_to_compact": new_source_delta.stream_position,
+                        "list_deltas_kwargs": {
+                            **local_deltacat_storage_kwargs,
+                            **{"equivalent_table_types": []},
+                        },
+                        "primary_keys": ["pk"],
+                        "rebase_source_partition_locator": None,
+                        "rebase_source_partition_high_watermark": None,
+                        "records_per_compacted_file": 4000,
+                        "s3_client_kwargs": {},
+                        "source_partition_locator": new_source_delta.partition_locator,
+                    }
+                )
+            )
+
+        assert (
+            "Hash bucket drift detected. Expected hash bucket index to be 1 but found 0"
+            in str(excinfo.value)
+        )
+
+    def test_compact_partition_when_bucket_spec_validation_fails_but_env_variable_disabled(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+    ):
+        """
+        A test case which asserts even if bucketing spec validation fails, compaction doesn't
+        throw an error if the feature is not enabled.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        backfill_rcf = get_rcf(s3_resource, rcf_url)
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        # Move the records to different hash buckets to simulate a validation failure.
+        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
+        s3_resource.Bucket(bucket).put_object(
+            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
+        )
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        new_rcf = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": new_destination_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": new_source_delta.partition_locator,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, new_rcf)
+        assert incremental_rcf.hash_bucket_count == 4
+        assert len(incremental_rcf.hb_index_to_entry_range) == 2
+
+    def test_compact_partition_when_bucket_spec_validation_succeeds(
+        self,
+        s3_resource,
+        local_deltacat_storage_kwargs,
+        enable_bucketing_spec_validation,
+    ):
+        """
+        A test case which asserts the bucketing spec validation does not throw
+        and error when the validation succeeds.
+        """
+
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 1,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        rcf = get_rcf(s3_resource, rcf_url)
+        assert rcf.hash_bucket_count == 4
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_destination_partition = ds.get_partition(
+            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
+        )
+
+        new_uri = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": new_destination_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 4,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": new_source_delta.partition_locator,
+                }
+            )
+        )
+
+        rcf = get_rcf(s3_resource, new_uri)
+        assert rcf.hash_bucket_count == 4
deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py
ADDED
@@ -0,0 +1,253 @@
+import ray
+from typing import Dict, Any
+from deltacat.types.media import ContentType
+import pyarrow as pa
+
+import pytest
+import deltacat.tests.local_deltacat_storage as ds
+import os
+from deltacat.tests.test_utils.pyarrow import (
+    stage_partition_from_file_paths,
+    commit_delta_to_staged_partition,
+)
+from deltacat.utils.pyarrow import (
+    ReadKwargsProviderPyArrowCsvPureUtf8,
+    ReadKwargsProviderPyArrowSchemaOverride,
+)
+
+DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+    "db_file_path",
+    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+)
+
+
+class TestContentTypeParams:
+    TEST_NAMESPACE = "test_content_type_params"
+    TEST_ENTRY_INDEX = 0
+    DEDUPE_BASE_COMPACTED_TABLE_STRING_PK = "deltacat/tests/compute/compactor_v2/steps/data/dedupe_base_compacted_table_string_pk.csv"
+    DEDUPE_NO_DUPLICATION_STRING_PK = "deltacat/tests/compute/compactor_v2/steps/data/dedupe_table_no_duplication_string_pk.csv"
+
+    @pytest.fixture(scope="module", autouse=True)
+    def setup_ray_cluster(self):
+        ray.init(local_mode=True, ignore_reinit_error=True)
+        yield
+        ray.shutdown()
+
+    @pytest.fixture(scope="function")
+    def local_deltacat_storage_kwargs(self, request: pytest.FixtureRequest):
+        # see deltacat/tests/local_deltacat_storage/README.md for documentation
+        kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+            DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+        }
+        yield kwargs_for_local_deltacat_storage
+        if os.path.exists(DATABASE_FILE_PATH_VALUE):
+            os.remove(DATABASE_FILE_PATH_VALUE)
+
+    def test__download_parquet_metadata_for_manifest_entry_sanity(
+        self, local_deltacat_storage_kwargs
+    ):
+        from deltacat.compute.compactor_v2.utils.content_type_params import (
+            _download_parquet_metadata_for_manifest_entry,
+        )
+        from deltacat.types.partial_download import PartialParquetParameters
+
+        partition = stage_partition_from_file_paths(
+            self.TEST_NAMESPACE,
+            [self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK],
+            **local_deltacat_storage_kwargs,
+        )
+        test_delta = commit_delta_to_staged_partition(
+            partition,
+            [self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK],
+            **local_deltacat_storage_kwargs,
+        )
+        test_entry_index = 0
+        obj_ref = _download_parquet_metadata_for_manifest_entry.remote(
+            test_delta, test_entry_index, ds, local_deltacat_storage_kwargs
+        )
+        parquet_metadata = ray.get(obj_ref)
+        partial_parquet_params = parquet_metadata["partial_parquet_params"]
+
+        # validate
+        assert isinstance(parquet_metadata, dict)
+        assert "entry_index" in parquet_metadata
+        assert "partial_parquet_params" in parquet_metadata
+        assert parquet_metadata["entry_index"] == test_entry_index
+        assert isinstance(partial_parquet_params, PartialParquetParameters)
+
+        assert partial_parquet_params.row_groups_to_download == [0]
+        assert partial_parquet_params.num_row_groups == 1
+        assert partial_parquet_params.num_rows == 8
+        assert isinstance(partial_parquet_params.in_memory_size_bytes, float)
+        assert partial_parquet_params.in_memory_size_bytes > 0
+
+        pq_metadata = partial_parquet_params.pq_metadata
+        assert pq_metadata.num_columns == 2
+        assert pq_metadata.num_rows == 8
+        assert pq_metadata.num_row_groups == 1
+        assert pq_metadata.format_version == "2.6"
+
+        assert (
+            test_delta.manifest.entries[self.TEST_ENTRY_INDEX].meta.content_type
+            == ContentType.PARQUET.value
+        )
+
+    @pytest.mark.parametrize(
+        "read_kwargs_provider,expected_values",
+        [
+            (
+                ReadKwargsProviderPyArrowCsvPureUtf8(),
+                {
+                    "num_rows": 6,
+                    "num_columns": 2,
+                    "num_row_groups": 1,
+                    "format_version": "2.6",
+                    "column_types": [pa.string(), pa.string()],
+                },
+            ),
+            (
+                ReadKwargsProviderPyArrowSchemaOverride(
+                    schema=pa.schema(
+                        [
+                            ("id", pa.string()),
+                            ("value", pa.int64()),
+                        ]
+                    )
+                ),
+                {
+                    "num_rows": 6,
+                    "num_columns": 2,
+                    "num_row_groups": 1,
+                    "format_version": "2.6",
+                    "column_types": [pa.string(), pa.int64()],
+                },
+            ),
+            (
+                ReadKwargsProviderPyArrowSchemaOverride(
+                    schema=None,
+                    pq_coerce_int96_timestamp_unit="ms",
+                    parquet_reader_type="daft",
+                ),
+                {
+                    "num_rows": 6,
+                    "num_columns": 2,
+                    "num_row_groups": 1,
+                    "format_version": "2.6",
+                    "column_types": None,  # Will use default type inference
+                },
+            ),
+        ],
+    )
+    def test__download_parquet_metadata_for_manifest_entry_with_read_kwargs_provider(
+        self, read_kwargs_provider, expected_values, local_deltacat_storage_kwargs
+    ):
+        from deltacat.compute.compactor_v2.utils.content_type_params import (
+            _download_parquet_metadata_for_manifest_entry,
+        )
+
+        partition = stage_partition_from_file_paths(
+            self.TEST_NAMESPACE,
+            [self.DEDUPE_NO_DUPLICATION_STRING_PK],
+            **local_deltacat_storage_kwargs,
+        )
+        test_delta = commit_delta_to_staged_partition(
+            partition,
+            [self.DEDUPE_NO_DUPLICATION_STRING_PK],
+            **local_deltacat_storage_kwargs,
+        )
+        test_entry_index = 0
+        read_kwargs_provider = ReadKwargsProviderPyArrowCsvPureUtf8
+        obj_ref = _download_parquet_metadata_for_manifest_entry.remote(
+            test_delta,
+            test_entry_index,
+            ds,
+            local_deltacat_storage_kwargs,
+            read_kwargs_provider,
+        )
+        parquet_metadata = ray.get(obj_ref)
+        partial_parquet_params = parquet_metadata["partial_parquet_params"]
+
+        # validate
+        assert isinstance(parquet_metadata, dict)
+        assert "entry_index" in parquet_metadata
+        assert "partial_parquet_params" in parquet_metadata
+        assert parquet_metadata["entry_index"] == self.TEST_ENTRY_INDEX
+
+        assert partial_parquet_params.row_groups_to_download == [0]
+        assert (
+            partial_parquet_params.num_row_groups == expected_values["num_row_groups"]
+        )
+        assert partial_parquet_params.num_rows == expected_values["num_rows"]
+        assert isinstance(partial_parquet_params.in_memory_size_bytes, float)
+        assert partial_parquet_params.in_memory_size_bytes > 0
+
+        pq_metadata = partial_parquet_params.pq_metadata
+        assert pq_metadata.num_columns == expected_values["num_columns"]
+        assert pq_metadata.num_rows == expected_values["num_rows"]
+        assert pq_metadata.num_row_groups == expected_values["num_row_groups"]
+        assert pq_metadata.format_version == expected_values["format_version"]
+
+        assert (
+            test_delta.manifest.entries[self.TEST_ENTRY_INDEX].meta.content_type
+            == ContentType.PARQUET.value
+        )
+
+    def test_download_parquet_metadata_for_manifest_entry_file_reader_kwargs_present_top_level_and_deltacat_storage_kwarg(
+        self, local_deltacat_storage_kwargs, caplog
+    ):
+        from deltacat.compute.compactor_v2.utils.content_type_params import (
+            _download_parquet_metadata_for_manifest_entry,
+        )
+        from deltacat.types.partial_download import PartialParquetParameters
+
+        test_file_reader_kwargs_provider = ReadKwargsProviderPyArrowCsvPureUtf8()
+
+        local_deltacat_storage_kwargs[
+            "file_reader_kwargs_provider"
+        ] = ReadKwargsProviderPyArrowCsvPureUtf8()
+
+        partition = stage_partition_from_file_paths(
+            self.TEST_NAMESPACE,
+            [self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK],
+            **local_deltacat_storage_kwargs,
+        )
+        test_delta = commit_delta_to_staged_partition(
+            partition,
+            [self.DEDUPE_BASE_COMPACTED_TABLE_STRING_PK],
+            **local_deltacat_storage_kwargs,
+        )
+
+        test_entry_index = 0
+        obj_ref = _download_parquet_metadata_for_manifest_entry.remote(
+            test_delta,
+            test_entry_index,
+            ds,
+            local_deltacat_storage_kwargs,
+            test_file_reader_kwargs_provider,
+        )
+        parquet_metadata = ray.get(obj_ref)
+        partial_parquet_params = parquet_metadata["partial_parquet_params"]
+
+        # validate
+        assert isinstance(parquet_metadata, dict)
+        assert "entry_index" in parquet_metadata
+        assert "partial_parquet_params" in parquet_metadata
+        assert parquet_metadata["entry_index"] == test_entry_index
+        assert isinstance(partial_parquet_params, PartialParquetParameters)
+
+        assert partial_parquet_params.row_groups_to_download == [0]
+        assert partial_parquet_params.num_row_groups == 1
+        assert partial_parquet_params.num_rows == 8
+        assert isinstance(partial_parquet_params.in_memory_size_bytes, float)
+        assert partial_parquet_params.in_memory_size_bytes > 0
+
+        pq_metadata = partial_parquet_params.pq_metadata
+        assert pq_metadata.num_columns == 2
+        assert pq_metadata.num_rows == 8
+        assert pq_metadata.num_row_groups == 1
+        assert pq_metadata.format_version == "2.6"
+
+        assert (
+            test_delta.manifest.entries[self.TEST_ENTRY_INDEX].meta.content_type
+            == ContentType.PARQUET.value
+        )
deltacat/tests/compute/test_compact_partition_incremental.py
CHANGED
@@ -119,6 +119,21 @@ def offer_local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
         os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
deltacat/tests/compute/test_compact_partition_multiple_rounds.py
CHANGED
@@ -114,6 +114,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
         os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
deltacat/tests/compute/test_compact_partition_rebase.py
CHANGED
@@ -114,6 +114,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
         os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py
CHANGED
@@ -118,6 +118,21 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
         os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(autouse=True, scope="function")
+def enable_bucketing_spec_validation(monkeypatch):
+    """
+    Enable the bucketing spec validation for all tests.
+    This will help catch hash bucket drift in testing.
+    """
+    import deltacat.compute.compactor_v2.steps.merge
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.steps.merge,
+        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
+        "ASSERT",
+    )
+
+
 @pytest.mark.parametrize(
     [
         "test_name",
{deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=br2aQSDj5eFS_j0mwGUSEQF386HRAXjiYg421vB9pME,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=
+deltacat/compute/compactor_v2/constants.py,sha256=F5Phrh-2JgnWvtjHXacxOG5Z2ivKcHnboerI12rc1zk,3632
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,12 +69,12 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=
+deltacat/compute/compactor_v2/steps/merge.py,sha256=T2G2AaVsezYzo6oJtpuXH-bYv8nt-yFHA5ZbDIGodQg,24971
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=
+deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=t2j9H9IdFRH9EfpL-9g5XvZs9WK9HybqBGA7fDi82EM,8310
 deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
-deltacat/compute/compactor_v2/utils/io.py,sha256=
+deltacat/compute/compactor_v2/utils/io.py,sha256=Xjs7_D-0xKSetvllIe4o96aM1elfdjt1Ii7YfsHPvZs,6108
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
 deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
@@ -85,7 +85,7 @@ deltacat/compute/merge_on_read/model/merge_on_read_params.py,sha256=Q51znagh8PtL
 deltacat/compute/merge_on_read/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/merge_on_read/utils/delta.py,sha256=e4BtOHa5XPpUnR4r0HqBKjXckBsTI8qBwdUWwpJfkWQ,1367
 deltacat/compute/resource_estimation/__init__.py,sha256=4bfBXcq-VAt9JCmjvj3yAmn0lEHVGdGsUCCoMGxjEqA,799
-deltacat/compute/resource_estimation/delta.py,sha256=
+deltacat/compute/resource_estimation/delta.py,sha256=dN64jbUQ8OI1BTz4fYGbulJLWjKjdT-XvwDJNLM__Oo,10583
 deltacat/compute/resource_estimation/manifest.py,sha256=gSqOyIda-pYq3vRsKFq3IiZvwhV3mMqrWPtsmUH9dD8,13035
 deltacat/compute/resource_estimation/model.py,sha256=psyagFXdpLGt8DfDqy7c8DWiuXCacr0Swe5f0M7DdO4,5465
 deltacat/compute/resource_estimation/parquet.py,sha256=5_apma4EKbKcm-nfV73-qN2nfnCeyhFW23ZHX3jz0Kw,3158
@@ -137,11 +137,11 @@ deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kW
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
 deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
-deltacat/tests/compute/test_compact_partition_incremental.py,sha256=
-deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=
+deltacat/tests/compute/test_compact_partition_incremental.py,sha256=8hUqnzeGIhAENcBxLL0R_yfjAaNTmRds6OWxQOmVqD8,15416
+deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=6d3F9E_4eO2Okh97v8NWFbEptPkzKoO0Qq8O6yAXrIs,13377
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
-deltacat/tests/compute/test_compact_partition_rebase.py,sha256=
-deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=
+deltacat/tests/compute/test_compact_partition_rebase.py,sha256=vOF8wgTpdaWJKo47mK9aii3NKtwVwWgujoQyS8C3YyA,13535
+deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=BimvU-iHiF78WlE4xbfk4dzHm0a-frwpE7H7Kh4XkbE,15500
 deltacat/tests/compute/test_util_common.py,sha256=0mEHo38bgH64y0XZ_zgUL_aZgQMgJOSTlOYvIJxG_MM,11825
 deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
 deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=Q3HJj1fjoe2JwRUOW8KEjbTqPIIoP2o_T3ZGH6SJnCM,13244
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=zEXOIilybDpKuQt1ZRxGg4x_kUacBOcHE8KWcOmL01s,42563
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoiDuBUhgCmc3DYKCXL1g4QWtmROhZ0RJCQgePMY9as,9959
 deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=YDQKUKv3Vv8S1fe0YQmjHTrwnWSliqKHIWGu0fEdKnI,11478
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -211,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.35.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.35.dist-info/METADATA,sha256=b8Z4aVdNYjBoy0_uh0m4yoU_8h2w8v7I2AZOwacv5Es,1733
+deltacat-1.1.35.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.35.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.35.dist-info/RECORD,,
{deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/LICENSE: File without changes
{deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/WHEEL: File without changes
{deltacat-1.1.33.dist-info → deltacat-1.1.35.dist-info}/top_level.txt: File without changes