deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +65 -38
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +26 -16
- deltacat/compute/compactor_v2/constants.py +5 -11
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/dataset.py +5 -17
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +56 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +124 -29
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +11 -8
- deltacat/utils/ray_utils/dataset.py +7 -7
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,289 @@
|
|
1
|
+
import ray
|
2
|
+
import os
|
3
|
+
from moto import mock_s3
|
4
|
+
import pytest
|
5
|
+
import boto3
|
6
|
+
from boto3.resources.base import ServiceResource
|
7
|
+
import pyarrow as pa
|
8
|
+
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
|
9
|
+
from pytest_benchmark.fixture import BenchmarkFixture
|
10
|
+
|
11
|
+
from deltacat.tests.compute.test_util_constant import (
|
12
|
+
TEST_S3_RCF_BUCKET_NAME,
|
13
|
+
DEFAULT_NUM_WORKERS,
|
14
|
+
DEFAULT_WORKER_INSTANCE_CPUS,
|
15
|
+
)
|
16
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
17
|
+
from deltacat.tests.compute.test_util_common import (
|
18
|
+
get_compacted_delta_locator_from_rcf,
|
19
|
+
)
|
20
|
+
from deltacat.tests.compute.test_util_create_table_deltas_repo import (
|
21
|
+
create_src_w_deltas_destination_rebase_w_deltas_strategy,
|
22
|
+
)
|
23
|
+
from deltacat.tests.compute.compact_partition_rebase_test_cases import (
|
24
|
+
REBASE_TEST_CASES,
|
25
|
+
)
|
26
|
+
from typing import Any, Callable, Dict, List, Optional, Set
|
27
|
+
from deltacat.types.media import StorageType
|
28
|
+
from deltacat.storage import (
|
29
|
+
DeltaLocator,
|
30
|
+
Partition,
|
31
|
+
)
|
32
|
+
from deltacat.types.media import ContentType
|
33
|
+
from deltacat.compute.compactor.model.compact_partition_params import (
|
34
|
+
CompactPartitionParams,
|
35
|
+
)
|
36
|
+
from deltacat.utils.placement import (
|
37
|
+
PlacementGroupManager,
|
38
|
+
)
|
39
|
+
|
40
|
+
# Keyword name / value pair that the storage-kwargs fixture in this module
# hands to the local deltacat storage implementation.
DATABASE_FILE_PATH_KEY = "db_file_path"
DATABASE_FILE_PATH_VALUE = "deltacat/tests/local_deltacat_storage/db_test.sqlite"
|
44
|
+
|
45
|
+
|
46
|
+
"""
|
47
|
+
MODULE scoped fixtures
|
48
|
+
"""
|
49
|
+
|
50
|
+
|
51
|
+
@pytest.fixture(autouse=True, scope="module")
def setup_ray_cluster():
    """Initialize Ray in local mode for this module and shut it down afterwards."""
    ray_init_options = {
        "local_mode": True,
        # Tolerate Ray having already been initialized by the time this runs.
        "ignore_reinit_error": True,
    }
    ray.init(**ray_init_options)
    yield
    ray.shutdown()
|
56
|
+
|
57
|
+
|
58
|
+
@pytest.fixture(autouse=True, scope="module")
def mock_aws_credential():
    """Export dummy AWS credentials and a default region for this module.

    The values are placeholders so that the mocked S3 client never picks up
    real credentials from the developer's environment. The variables are
    left set after the module finishes (no teardown restores them).
    """
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    # The secret-key variable read by boto3/botocore is AWS_SECRET_ACCESS_KEY;
    # the previous name (AWS_SECRET_ACCESS_ID) is not a recognised variable,
    # so the dummy secret was never actually applied.
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
    yield
|
66
|
+
|
67
|
+
|
68
|
+
@pytest.fixture(autouse=True, scope="module")
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
    """Remove the local storage database file around this module's tests.

    Without a ``yield`` the removal only ran during fixture setup, i.e.
    *before* the tests, which contradicts the fixture's name. The setup-time
    purge is kept (so a stale file from an earlier run is still discarded)
    and the same purge now also runs as teardown once the module's tests
    have completed.
    """
    # Setup: drop any database file that is already present.
    if os.path.exists(DATABASE_FILE_PATH_VALUE):
        os.remove(DATABASE_FILE_PATH_VALUE)
    yield
    # Teardown: make sure the database file is deleted after all the
    # compactor package tests are completed.
    if os.path.exists(DATABASE_FILE_PATH_VALUE):
        os.remove(DATABASE_FILE_PATH_VALUE)
|
73
|
+
|
74
|
+
|
75
|
+
@pytest.fixture(scope="module")
def s3_resource(mock_aws_credential):
    """Yield a boto3 S3 resource created while the moto S3 mock is active.

    Requesting ``mock_aws_credential`` ensures that fixture has run before
    the resource is created.
    """
    s3_mock = mock_s3()
    with s3_mock:
        resource = boto3.resource("s3")
        yield resource
|
79
|
+
|
80
|
+
|
81
|
+
@pytest.fixture(autouse=True, scope="module")
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
    """Create the TEST_S3_RCF_BUCKET_NAME bucket once for the module."""
    bucket_settings = {
        "ACL": "authenticated-read",
        "Bucket": TEST_S3_RCF_BUCKET_NAME,
    }
    s3_resource.create_bucket(**bucket_settings)
    yield
|
88
|
+
|
89
|
+
|
90
|
+
"""
|
91
|
+
FUNCTION scoped fixtures
|
92
|
+
"""
|
93
|
+
|
94
|
+
|
95
|
+
@pytest.fixture(scope="function")
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
    """Yield the kwargs dict for the local storage, then delete the database file.

    See deltacat/tests/local_deltacat_storage/README.md for documentation.
    """
    storage_kwargs: Dict[str, Any] = {}
    storage_kwargs[DATABASE_FILE_PATH_KEY] = DATABASE_FILE_PATH_VALUE
    yield storage_kwargs
    # Teardown: remove the database file, if one exists, after each test.
    db_file_is_present = os.path.exists(DATABASE_FILE_PATH_VALUE)
    if db_file_is_present:
        os.remove(DATABASE_FILE_PATH_VALUE)
|
104
|
+
|
105
|
+
|
106
|
+
@pytest.mark.parametrize(
    [
        "test_name",
        "primary_keys",
        "sort_keys",
        "partition_keys_param",
        "partition_values_param",
        "input_deltas_param",
        "input_deltas_delta_type",
        "expected_terminal_compact_partition_result",
        "expected_terminal_exception",
        "expected_terminal_exception_message",
        "create_placement_group_param",
        "records_per_compacted_file_param",
        "hash_bucket_count_param",
        "read_kwargs_provider_param",
        "drop_duplicates_param",
        "skip_enabled_compact_partition_drivers",
        "rebase_expected_compact_partition_result",
        "compact_partition_func",
    ],
    [
        # Each case's fields are forwarded positionally, in their own order,
        # right after the case name; a case is expected to carry 17 fields so
        # that the tuple lines up with the 18 argument names above.
        (test_name, *case_fields)
        for test_name, case_fields in REBASE_TEST_CASES.items()
    ],
    ids=list(REBASE_TEST_CASES),
)
def test_compact_partition_rebase_same_source_and_destination(
    mocker,
    s3_resource: ServiceResource,
    local_deltacat_storage_kwargs: Dict[str, Any],
    test_name: str,
    primary_keys: Set[str],
    sort_keys: List[Optional[Any]],
    partition_keys_param: Optional[List[Any]],
    partition_values_param: List[Optional[str]],
    input_deltas_param: List[pa.Array],
    input_deltas_delta_type: str,
    expected_terminal_compact_partition_result: pa.Table,
    expected_terminal_exception: BaseException,
    expected_terminal_exception_message: Optional[str],
    create_placement_group_param: bool,
    records_per_compacted_file_param: int,
    hash_bucket_count_param: int,
    drop_duplicates_param: bool,
    read_kwargs_provider_param: Any,
    rebase_expected_compact_partition_result: pa.Table,
    skip_enabled_compact_partition_drivers: List[CompactorVersion],
    compact_partition_func: Callable,
    benchmark: BenchmarkFixture,
):
    """
    This test tests the scenario where source partition locator == destination partition locator,
    but rebase source partition locator is different.
    This scenario could occur when hash bucket count changes.
    """
    import deltacat.tests.local_deltacat_storage as ds

    storage_kwargs = local_deltacat_storage_kwargs

    # Create the tables/streams; the middle element of the returned triple is
    # not used by this test.
    created_streams = create_src_w_deltas_destination_rebase_w_deltas_strategy(
        primary_keys,
        sort_keys,
        partition_keys_param,
        input_deltas_param,
        input_deltas_delta_type,
        partition_values_param,
        storage_kwargs,
    )
    source_table_stream, _, rebased_table_stream = created_streams

    source_partition: Partition = ds.get_partition(
        source_table_stream.locator, partition_values_param, **storage_kwargs
    )
    rebased_partition: Partition = ds.get_partition(
        rebased_table_stream.locator, partition_values_param, **storage_kwargs
    )

    worker_count = DEFAULT_NUM_WORKERS
    cpus_per_worker = DEFAULT_WORKER_INSTANCE_CPUS
    total_cpus = worker_count * cpus_per_worker

    # NOTE(review): the parametrized value is unconditionally overridden here,
    # so the placement-group branch below never runs -- confirm this is
    # intentional before removing either the override or the branch.
    create_placement_group_param = False
    pgm = (
        PlacementGroupManager(
            1, total_cpus, cpus_per_worker, memory_per_bundle=4000000
        ).pgs[0]
        if create_placement_group_param
        else None
    )

    # The rebased partition is used as both source and destination, while the
    # source partition acts as the rebase source.
    compaction_settings = {
        "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
        "compacted_file_content_type": ContentType.PARQUET,
        "dd_max_parallelism_ratio": 1.0,
        "deltacat_storage": ds,
        "deltacat_storage_kwargs": storage_kwargs,
        "destination_partition_locator": rebased_partition.locator,
        "hash_bucket_count": hash_bucket_count_param,
        "last_stream_position_to_compact": source_partition.stream_position,
        "list_deltas_kwargs": {**storage_kwargs, "equivalent_table_types": []},
        "object_store": RayPlasmaObjectStore(),
        "pg_config": pgm,
        "primary_keys": primary_keys,
        "read_kwargs_provider": read_kwargs_provider_param,
        "rebase_source_partition_locator": source_partition.locator,
        "records_per_compacted_file": records_per_compacted_file_param,
        "s3_client_kwargs": {},
        "source_partition_locator": rebased_partition.locator,
        "sort_keys": sort_keys or None,
    }
    compact_partition_params = CompactPartitionParams.of(compaction_settings)

    from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
        ExecutionCompactionResult,
    )

    # Spy on the result constructor so its last positional argument can be
    # inspected after the run.
    execute_compaction_result_spy = mocker.spy(ExecutionCompactionResult, "__init__")

    # execute
    rcf_file_s3_uri = compact_partition_func(compact_partition_params)

    # Assert not in-place compacted
    last_init_argument = execute_compaction_result_spy.call_args.args[-1]
    assert (
        last_init_argument is False
    ), "Table version erroneously marked as in-place compacted!"

    compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
        s3_resource, rcf_file_s3_uri
    )
    downloaded_tables = ds.download_delta(
        compacted_delta_locator, storage_type=StorageType.LOCAL, **storage_kwargs
    )
    actual_table = pa.concat_tables(downloaded_tables)

    # if no primary key is specified then sort by sort_key for consistent assertion
    sorting_cols: List[Any]
    if primary_keys:
        sorting_cols = [(key, "ascending") for key in primary_keys]
    else:
        sorting_cols = sort_keys

    expected_table = rebase_expected_compact_partition_result.combine_chunks().sort_by(
        sorting_cols
    )
    actual_table = actual_table.combine_chunks().sort_by(sorting_cols)
    assert actual_table.equals(
        expected_table
    ), f"{actual_table} does not match {expected_table}"
|
@@ -24,6 +24,12 @@ from deltacat.compute.compactor import (
|
|
24
24
|
RoundCompletionInfo,
|
25
25
|
)
|
26
26
|
|
27
|
+
from deltacat.storage.model.partition import PartitionLocator
|
28
|
+
from deltacat.storage.model.stream import StreamLocator
|
29
|
+
from deltacat.storage.model.table_version import TableVersionLocator
|
30
|
+
from deltacat.storage.model.table import TableLocator
|
31
|
+
from deltacat.storage.model.namespace import NamespaceLocator
|
32
|
+
|
27
33
|
|
28
34
|
class PartitionKeyType(str, Enum):
|
29
35
|
INT = "int"
|
@@ -51,6 +57,18 @@ UTILS
|
|
51
57
|
"""
|
52
58
|
|
53
59
|
|
60
|
+
def get_test_partition_locator(partition_id):
    """Build a fixed test ``PartitionLocator`` that varies only by partition id.

    The locator chain is namespace "default" -> table "test_table" ->
    table version locator built with "1" -> stream locator built with
    "test_stream_id" and "local" (presumably the stream id and storage
    type -- confirm against ``StreamLocator.of``).

    Args:
        partition_id: Passed through as the locator's ``partition_id``.

    Returns:
        The ``PartitionLocator`` created with empty ``partition_values``.
    """
    namespace_locator = NamespaceLocator.of("default")
    table_locator = TableLocator.of(namespace_locator, "test_table")
    table_version_locator = TableVersionLocator.of(table_locator, "1")
    stream_locator = StreamLocator.of(
        table_version_locator, "test_stream_id", "local"
    )
    return PartitionLocator.of(
        stream_locator, partition_id=partition_id, partition_values=[]
    )
|
70
|
+
|
71
|
+
|
54
72
|
def _create_table(
|
55
73
|
namespace: str,
|
56
74
|
table_name: str,
|
@@ -140,7 +158,7 @@ def create_rebase_table(
|
|
140
158
|
def get_rcf(s3_resource, rcf_file_s3_uri: str) -> RoundCompletionInfo:
|
141
159
|
from deltacat.tests.test_utils.utils import read_s3_contents
|
142
160
|
|
143
|
-
_, rcf_object_key = rcf_file_s3_uri.
|
161
|
+
_, rcf_object_key = rcf_file_s3_uri.strip("s3://").split("/", 1)
|
144
162
|
rcf_file_output: Dict[str, Any] = read_s3_contents(
|
145
163
|
s3_resource, TEST_S3_RCF_BUCKET_NAME, rcf_object_key
|
146
164
|
)
|
@@ -151,9 +169,6 @@ def get_compacted_delta_locator_from_rcf(
|
|
151
169
|
s3_resource: ServiceResource, rcf_file_s3_uri: str
|
152
170
|
):
|
153
171
|
from deltacat.storage import DeltaLocator
|
154
|
-
from deltacat.compute.compactor import (
|
155
|
-
RoundCompletionInfo,
|
156
|
-
)
|
157
172
|
|
158
173
|
round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
|
159
174
|
|
@@ -251,6 +251,7 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
251
251
|
ds.commit_partition(staged_partition, **ds_mock_kwargs)
|
252
252
|
|
253
253
|
# get streams
|
254
|
+
# TODO: Add deltas to destination stream
|
254
255
|
destination_table_stream: Stream = ds.get_stream(
|
255
256
|
namespace=destination_table_namespace,
|
256
257
|
table_name=destination_table_name,
|
@@ -1,6 +1,9 @@
|
|
1
1
|
import unittest
|
2
2
|
import numpy as np
|
3
3
|
from unittest import mock
|
4
|
+
from deltacat.exceptions import (
|
5
|
+
PymemcachedPutObjectError,
|
6
|
+
)
|
4
7
|
|
5
8
|
|
6
9
|
class MockPyMemcacheClient:
|
@@ -86,7 +89,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
|
|
86
89
|
mock_retrying_client.return_value = mock_client.return_value
|
87
90
|
mock_client.return_value.set_many.return_value = ["abcd"]
|
88
91
|
|
89
|
-
with self.assertRaises(
|
92
|
+
with self.assertRaises(PymemcachedPutObjectError):
|
90
93
|
self.object_store.put_many(["a", "b"])
|
91
94
|
|
92
95
|
self.assertEqual(1, mock_client.return_value.set_many.call_count)
|
@@ -169,7 +172,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
|
|
169
172
|
mock_client.return_value.set.return_value = False
|
170
173
|
mock_retrying_client.return_value = mock_client.return_value
|
171
174
|
|
172
|
-
with self.assertRaises(
|
175
|
+
with self.assertRaises(PymemcachedPutObjectError):
|
173
176
|
self.object_store.put("test_ip")
|
174
177
|
|
175
178
|
self.assertEqual(1, mock_client.return_value.set.call_count)
|