deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl
This diff reflects the changes between publicly released versions of this package, as published to a supported public registry, and is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +65 -38
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +26 -16
- deltacat/compute/compactor_v2/constants.py +5 -11
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/dataset.py +5 -17
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +56 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +124 -29
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +11 -8
- deltacat/utils/ray_utils/dataset.py +7 -7
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/deltacat/tests/compute/compact_partition_rebase_test_cases.py
@@ -0,0 +1,88 @@
+import pyarrow as pa
+from deltacat.tests.compute.test_util_common import (
+    PartitionKey,
+    PartitionKeyType,
+)
+from deltacat.tests.compute.test_util_constant import (
+    DEFAULT_MAX_RECORDS_PER_FILE,
+    DEFAULT_HASH_BUCKET_COUNT,
+)
+from dataclasses import dataclass
+
+
+from deltacat.storage import (
+    DeltaType,
+)
+
+from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+
+from deltacat.storage.model.sort_key import SortKey
+
+from deltacat.tests.compute.compact_partition_test_cases import (
+    BaseCompactorTestCase,
+    with_compactor_version_func_test_param,
+)
+
+
+@dataclass(frozen=True)
+class RebaseCompactionTestCaseParams(BaseCompactorTestCase):
+    """
+    A pytest parameterized test case for the `compact_partition` function with rebase compaction.
+
+    Args:
+        * (inherited from CompactorTestCase): see CompactorTestCase docstring for details
+        rebase_expected_compact_partition_result: pa.Table - expected table after rebase compaction runs. An output that is asserted on in Rebase unit tests
+    """
+
+    rebase_expected_compact_partition_result: pa.Table
+
+
+REBASE_TEST_CASES = {
+    "1-rebase-sanity": RebaseCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+    ),
+}
+
+REBASE_TEST_CASES = with_compactor_version_func_test_param(REBASE_TEST_CASES)
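
The new file above only declares the rebase test matrix; the actual driver lives in the new `deltacat/tests/compute/test_compact_partition_rebase.py` (+289 lines per the file list). As a rough sketch of how such a matrix is typically consumed by pytest — the test body below is an illustrative assumption, not the package's real driver:

```python
import pytest
from deltacat.tests.compute.compact_partition_rebase_test_cases import (
    REBASE_TEST_CASES,
)

# Hypothetical consumer: parametrize over the declared cases and assert on
# the expected rebased table each case carries.
@pytest.mark.parametrize("test_name, test_case", list(REBASE_TEST_CASES.items()))
def test_rebase_case_shape(test_name, test_case):
    # Every rebase case must declare the table it expects after compaction.
    assert test_case.rebase_expected_compact_partition_result is not None
```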
--- a/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py
+++ b/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py
@@ -37,6 +37,7 @@ from deltacat.tests.compute.compact_partition_test_cases import (
     EMPTY_UTSV_PATH,
 )
 from deltacat.storage import DeleteParameters
+from deltacat.exceptions import ValidationError
 
 
 @dataclass(frozen=True)
@@ -1538,8 +1539,8 @@ REBASE_THEN_INCREMENTAL_DELETE_DELTA_TYPE_TEST_CASES = {
                 ]
             ),
         ),
-        expected_terminal_exception=
-        expected_terminal_exception_message="
+        expected_terminal_exception=ValidationError,
+        expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
         do_create_placement_group=True,
         records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
         hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
--- a/deltacat/tests/compute/compact_partition_test_cases.py
+++ b/deltacat/tests/compute/compact_partition_test_cases.py
@@ -28,6 +28,8 @@ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
 
 from deltacat.storage.model.sort_key import SortKey
 
+from deltacat.exceptions import ValidationError
+
 ZERO_VALUED_SORT_KEY, ZERO_VALUED_PARTITION_VALUES_PARAM = [], []
 ZERO_VALUED_PARTITION_KEYS_PARAM = None
 ZERO_VALUED_PRIMARY_KEY = {}
@@ -570,8 +572,8 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
             ],
            names=["pk_col_1", "sk_col_1"],
         ),
-        expected_terminal_exception=
-        expected_terminal_exception_message="
+        expected_terminal_exception=ValidationError,
+        expected_terminal_exception_message="One of the assertions in DeltaCAT has failed",
         do_create_placement_group=False,
         records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
         hash_bucket_count=None,
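
Both hunks above swap previously truncated expected-exception fields for the new `ValidationError` surfaced by the expanded `deltacat/exceptions.py` (+342 lines in this release). A minimal sketch of the assertion pattern these test cases drive, assuming `ValidationError` can be raised with a plain message string (only its import path and message text are confirmed by this diff):

```python
import pytest
from deltacat.exceptions import ValidationError

def _failing_compaction():
    # Stand-in for a compaction run that trips a DeltaCAT assertion.
    raise ValidationError("One of the assertions in DeltaCAT has failed")

def test_validation_error_surfaces():
    with pytest.raises(ValidationError, match="One of the assertions in DeltaCAT"):
        _failing_compaction()
```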
--- /dev/null
+++ b/deltacat/tests/compute/compactor/utils/test_round_completion_file.py
@@ -0,0 +1,209 @@
+import pytest
+import os
+from moto import mock_s3
+import boto3
+from boto3.resources.base import ServiceResource
+from deltacat.compute.compactor.utils.round_completion_file import (
+    read_round_completion_file,
+    write_round_completion_file,
+)
+from deltacat.tests.compute.test_util_common import get_test_partition_locator
+from deltacat.compute.compactor import RoundCompletionInfo
+
+RCF_BUCKET_NAME = "rcf-bucket"
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
+
+@pytest.fixture(autouse=True, scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
+
+
+@pytest.fixture(autouse=True, scope="function")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=RCF_BUCKET_NAME,
+    )
+    yield
+    s3_resource.Bucket(RCF_BUCKET_NAME).objects.all().delete()
+
+
+class TestReadWriteRoundCompletionFile:
+    def test_read_when_rcf_written_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, None, expected_rcf
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert (
+            rcf_url == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
+        )
+        assert rcf == expected_rcf
+
+    def test_read_when_rcf_written_with_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert (
+            rcf_url
+            == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
+        )
+        assert rcf == expected_rcf
+
+    def test_read_without_destination_when_rcf_written_with_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
+        )
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf is None
+
+    def test_read_without_destination_when_rcf_written_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf == expected_rcf
+
+    def test_read_when_rcf_written_both_with_and_without_destination(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+        destination_locator = get_test_partition_locator("destination")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        expected_rcf_2 = RoundCompletionInfo.of(
+            high_watermark=1223,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=1233,
+        )
+
+        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
+
+        write_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf_2
+        )
+
+        rcf = read_round_completion_file(
+            RCF_BUCKET_NAME, source_locator, destination_locator
+        )
+
+        assert rcf == expected_rcf_2
+
+    def test_write_when_custom_url_is_passed(self):
+        """
+        This test case tests the backward compatibility by successfully
+        reading the previously written rcf.
+        """
+
+        source_locator = get_test_partition_locator("source")
+
+        expected_rcf = RoundCompletionInfo.of(
+            high_watermark=122,
+            compacted_delta_locator={},
+            compacted_pyarrow_write_result={},
+            sort_keys_bit_width=12,
+        )
+
+        completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"
+        rcf_url = write_round_completion_file(
+            RCF_BUCKET_NAME,
+            source_locator,
+            None,
+            expected_rcf,
+            completion_file_s3_url=completion_file_s3_url,
+        )
+
+        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
+
+        assert rcf_url == completion_file_s3_url
+        assert rcf is None
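
The assertions in the new test file above pin down the round completion file (RCF) key scheme that `round_completion_file.py` now implements (+39/-9 per the file list): without a destination locator the RCF lands at a flat legacy key, and with one it is nested under the source digest. A sketch of just the pathing rule, assuming each locator contributes the 40-character hex digest seen in the assertions (how the digest is derived is not shown in this diff):

```python
from typing import Optional

def rcf_s3_url(
    bucket: str, source_digest: str, destination_digest: Optional[str] = None
) -> str:
    # Legacy/flat scheme: keyed by the source partition digest only.
    if destination_digest is None:
        return f"s3://{bucket}/{source_digest}.json"
    # New scheme: nested under the source digest, keyed by the destination digest.
    return f"s3://{bucket}/{source_digest}/{destination_digest}.json"

# Matches the URL asserted in the tests above.
assert rcf_s3_url("rcf-bucket", "f9829af39770d904dbb811bd8f4e886dd307f507") == (
    "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
)
```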
--- a/deltacat/tests/compute/compactor_v2/test_compaction_session.py
+++ b/deltacat/tests/compute/compactor_v2/test_compaction_session.py
@@ -1,88 +1,255 @@
-import
-import sqlite3
+from typing import Dict, Any
 import ray
 import os
-
+import pytest
+import boto3
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
-from deltacat.compute.compactor_v2.compaction_session import
+from deltacat.compute.compactor_v2.compaction_session import (
+    compact_partition,
+)
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
-from deltacat.utils
-from deltacat.tests.
+from deltacat.tests.test_utils.utils import read_s3_contents
+from deltacat.tests.compute.test_util_constant import (
+    TEST_S3_RCF_BUCKET_NAME,
+)
+from deltacat.tests.compute.test_util_common import get_rcf
+from deltacat.tests.test_utils.pyarrow import (
+    stage_partition_from_file_paths,
+    commit_delta_to_staged_partition,
+    commit_delta_to_partition,
+)
+from moto import mock_s3
+
+DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+    "db_file_path",
+    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+)
+
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_ray_cluster():
+    ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
 
+@pytest.fixture(scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
 
-
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=TEST_S3_RCF_BUCKET_NAME,
+    )
+    yield
+
+
+@pytest.fixture(scope="function")
+def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
+    kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+    }
+    yield kwargs_for_local_deltacat_storage
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
     """
 
-    DB_FILE_PATH = f"{current_time_ms()}.db"
     NAMESPACE = "compact_partition_v2_namespace"
+    BACKFILL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/backfill_source_date_pk.csv"
+    )
+    INCREMENTAL_FILE_PATH = (
+        "deltacat/tests/compute/compactor_v2/data/incremental_source_date_pk.csv"
+    )
+
+    def test_compact_partition_when_no_input_deltas_to_compact(
+        self, local_deltacat_storage_kwargs
+    ):
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["test"], **local_deltacat_storage_kwargs
+        )
+        source_partition = ds.commit_partition(
+            staged_source, **local_deltacat_storage_kwargs
+        )
 
-
-
-
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
 
-
-
-
-
+        # action
+        rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 2,
+                    "last_stream_position_to_compact": source_partition.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": None,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_partition.locator,
+                }
+            )
+        )
 
-
+        # verify that no RCF is written
+        assert rcf_url is None
 
-
-
-
-
-
+    def test_compact_partition_when_rcf_was_written_by_past_commit(
+        self, s3_resource, local_deltacat_storage_kwargs
+    ):
+        """
+        Backward compatibility test for when a RCF was written by a previous commit.
+        """
 
-    @patch("deltacat.compute.compactor_v2.compaction_session.rcf")
-    @patch("deltacat.compute.compactor_v2.compaction_session.s3_utils")
-    def test_compact_partition_when_no_input_deltas_to_compact(self, s3_utils, rcf_url):
         # setup
-        rcf_url.read_round_completion_file.return_value = None
         staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
         )
-
-
+
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
         )
 
         staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
         )
         dest_partition = ds.commit_partition(
-            staged_dest, **
+            staged_dest, **local_deltacat_storage_kwargs
         )
 
         # action
         rcf_url = compact_partition(
             CompactPartitionParams.of(
                 {
-                    "compaction_artifact_s3_bucket":
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                     "compacted_file_content_type": ContentType.PARQUET,
                     "dd_max_parallelism_ratio": 1.0,
                     "deltacat_storage": ds,
-                    "deltacat_storage_kwargs":
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                     "destination_partition_locator": dest_partition.locator,
                     "drop_duplicates": True,
                     "hash_bucket_count": 1,
-                    "last_stream_position_to_compact":
+                    "last_stream_position_to_compact": source_delta.stream_position,
                     "list_deltas_kwargs": {
-                        **
+                        **local_deltacat_storage_kwargs,
                         **{"equivalent_table_types": []},
                     },
                     "primary_keys": [],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": None,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                }
+            )
+        )
+
+        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
+        assert bucket == TEST_S3_RCF_BUCKET_NAME
+
+        # Now delete the RCF at new location and copy it to old location
+        # Copy the RCF from rcf_url to another location
+        s3_resource.Object(TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}.json").copy_from(
+            CopySource=f"{TEST_S3_RCF_BUCKET_NAME}/{backfill_key1}/{backfill_key2}"
+        )
+
+        s3_resource.Object(
+            TEST_S3_RCF_BUCKET_NAME, f"{backfill_key1}/{backfill_key2}"
+        ).delete()
+
+        # Now run an incremental compaction and verify if the previous RCF was read properly.
+
+        new_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            [self.INCREMENTAL_FILE_PATH],
+            **local_deltacat_storage_kwargs,
+        )
+
+        new_rcf_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": new_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
                     "rebase_source_partition_locator": None,
                     "rebase_source_partition_high_watermark": None,
                     "records_per_compacted_file": 4000,
                     "s3_client_kwargs": {},
-                    "source_partition_locator":
+                    "source_partition_locator": new_source_delta.partition_locator,
                 }
             )
         )
 
-
-
+        new_bucket, incremental_key1, incremental_key2 = new_rcf_url.strip(
+            "s3://"
+        ).split("/")
+
+        assert new_bucket == TEST_S3_RCF_BUCKET_NAME
+        assert backfill_key1 == incremental_key1
+        assert backfill_key2 != incremental_key2
+
+        rcf = get_rcf(s3_resource, new_rcf_url)
+
+        _, compaction_audit_key = rcf.compaction_audit_url.strip("s3://").split("/", 1)
+        compaction_audit = CompactionSessionAuditInfo(
+            **read_s3_contents(
+                s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
+            )
+        )
+
+        # as it should be running incremental
+        assert compaction_audit.uniform_deltas_created == 1
+        assert compaction_audit.input_records == 6
|