deltacat 1.1.8__py3-none-any.whl → 1.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/s3u.py +46 -25
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor_v2/compaction_session.py +11 -5
- deltacat/compute/compactor_v2/constants.py +2 -11
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/interface.py +14 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +3 -1
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +41 -10
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/pyarrow.py +8 -5
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/METADATA +2 -2
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/RECORD +38 -34
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/test_compact_partition_rebase.py
ADDED
@@ -0,0 +1,289 @@
+import ray
+import os
+from moto import mock_s3
+import pytest
+import boto3
+from boto3.resources.base import ServiceResource
+import pyarrow as pa
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from pytest_benchmark.fixture import BenchmarkFixture
+
+from deltacat.tests.compute.test_util_constant import (
+    TEST_S3_RCF_BUCKET_NAME,
+    DEFAULT_NUM_WORKERS,
+    DEFAULT_WORKER_INSTANCE_CPUS,
+)
+from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.tests.compute.test_util_common import (
+    get_compacted_delta_locator_from_rcf,
+)
+from deltacat.tests.compute.test_util_create_table_deltas_repo import (
+    create_src_w_deltas_destination_rebase_w_deltas_strategy,
+)
+from deltacat.tests.compute.compact_partition_rebase_test_cases import (
+    REBASE_TEST_CASES,
+)
+from typing import Any, Callable, Dict, List, Optional, Set
+from deltacat.types.media import StorageType
+from deltacat.storage import (
+    DeltaLocator,
+    Partition,
+)
+from deltacat.types.media import ContentType
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
+from deltacat.utils.placement import (
+    PlacementGroupManager,
+)
+
+DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+    "db_file_path",
+    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+)
+
+
+"""
+MODULE scoped fixtures
+"""
+
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_ray_cluster():
+    ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(autouse=True, scope="module")
+def mock_aws_credential():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    yield
+
+
+@pytest.fixture(autouse=True, scope="module")
+def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
+    # make sure the database file is deleted after all the compactor package tests are completed
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+@pytest.fixture(scope="module")
+def s3_resource(mock_aws_credential):
+    with mock_s3():
+        yield boto3.resource("s3")
+
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+    s3_resource.create_bucket(
+        ACL="authenticated-read",
+        Bucket=TEST_S3_RCF_BUCKET_NAME,
+    )
+    yield
+
+
+"""
+FUNCTION scoped fixtures
+"""
+
+
+@pytest.fixture(scope="function")
+def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
+    # see deltacat/tests/local_deltacat_storage/README.md for documentation
+    kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+    }
+    yield kwargs_for_local_deltacat_storage
+    if os.path.exists(DATABASE_FILE_PATH_VALUE):
+        os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+@pytest.mark.parametrize(
+    [
+        "test_name",
+        "primary_keys",
+        "sort_keys",
+        "partition_keys_param",
+        "partition_values_param",
+        "input_deltas_param",
+        "input_deltas_delta_type",
+        "expected_terminal_compact_partition_result",
+        "expected_terminal_exception",
+        "expected_terminal_exception_message",
+        "create_placement_group_param",
+        "records_per_compacted_file_param",
+        "hash_bucket_count_param",
+        "read_kwargs_provider_param",
+        "drop_duplicates_param",
+        "skip_enabled_compact_partition_drivers",
+        "rebase_expected_compact_partition_result",
+        "compact_partition_func",
+    ],
+    [
+        (
+            test_name,
+            primary_keys,
+            sort_keys,
+            partition_keys_param,
+            partition_values_param,
+            input_deltas,
+            input_deltas_delta_type,
+            expected_terminal_compact_partition_result,
+            expected_terminal_exception,
+            expected_terminal_exception_message,
+            create_placement_group_param,
+            records_per_compacted_file_param,
+            hash_bucket_count_param,
+            drop_duplicates_param,
+            read_kwargs_provider,
+            skip_enabled_compact_partition_drivers,
+            rebase_expected_compact_partition_result,
+            compact_partition_func,
+        )
+        for test_name, (
+            primary_keys,
+            sort_keys,
+            partition_keys_param,
+            partition_values_param,
+            input_deltas,
+            input_deltas_delta_type,
+            expected_terminal_compact_partition_result,
+            expected_terminal_exception,
+            expected_terminal_exception_message,
+            create_placement_group_param,
+            records_per_compacted_file_param,
+            hash_bucket_count_param,
+            drop_duplicates_param,
+            read_kwargs_provider,
+            skip_enabled_compact_partition_drivers,
+            rebase_expected_compact_partition_result,
+            compact_partition_func,
+        ) in REBASE_TEST_CASES.items()
+    ],
+    ids=[test_name for test_name in REBASE_TEST_CASES],
+)
+def test_compact_partition_rebase_same_source_and_destination(
+    mocker,
+    s3_resource: ServiceResource,
+    local_deltacat_storage_kwargs: Dict[str, Any],
+    test_name: str,
+    primary_keys: Set[str],
+    sort_keys: List[Optional[Any]],
+    partition_keys_param: Optional[List[Any]],
+    partition_values_param: List[Optional[str]],
+    input_deltas_param: List[pa.Array],
+    input_deltas_delta_type: str,
+    expected_terminal_compact_partition_result: pa.Table,
+    expected_terminal_exception: BaseException,
+    expected_terminal_exception_message: Optional[str],
+    create_placement_group_param: bool,
+    records_per_compacted_file_param: int,
+    hash_bucket_count_param: int,
+    drop_duplicates_param: bool,
+    read_kwargs_provider_param: Any,
+    rebase_expected_compact_partition_result: pa.Table,
+    skip_enabled_compact_partition_drivers: List[CompactorVersion],
+    compact_partition_func: Callable,
+    benchmark: BenchmarkFixture,
+):
+    import deltacat.tests.local_deltacat_storage as ds
+
+    ds_mock_kwargs = local_deltacat_storage_kwargs
+    """
+    This test tests the scenario where source partition locator == destination partition locator,
+    but rebase source partition locator is different.
+    This scenario could occur when hash bucket count changes.
+    """
+    partition_keys = partition_keys_param
+    (
+        source_table_stream,
+        _,
+        rebased_table_stream,
+    ) = create_src_w_deltas_destination_rebase_w_deltas_strategy(
+        primary_keys,
+        sort_keys,
+        partition_keys,
+        input_deltas_param,
+        input_deltas_delta_type,
+        partition_values_param,
+        ds_mock_kwargs,
+    )
+    source_partition: Partition = ds.get_partition(
+        source_table_stream.locator,
+        partition_values_param,
+        **ds_mock_kwargs,
+    )
+    rebased_partition: Partition = ds.get_partition(
+        rebased_table_stream.locator,
+        partition_values_param,
+        **ds_mock_kwargs,
+    )
+    num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
+    total_cpus = num_workers * worker_instance_cpu
+    pgm = None
+    create_placement_group_param = False
+    if create_placement_group_param:
+        pgm = PlacementGroupManager(
+            1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
+        ).pgs[0]
+    compact_partition_params = CompactPartitionParams.of(
+        {
+            "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+            "compacted_file_content_type": ContentType.PARQUET,
+            "dd_max_parallelism_ratio": 1.0,
+            "deltacat_storage": ds,
+            "deltacat_storage_kwargs": ds_mock_kwargs,
+            "destination_partition_locator": rebased_partition.locator,
+            "hash_bucket_count": hash_bucket_count_param,
+            "last_stream_position_to_compact": source_partition.stream_position,
+            "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
+            "object_store": RayPlasmaObjectStore(),
+            "pg_config": pgm,
+            "primary_keys": primary_keys,
+            "read_kwargs_provider": read_kwargs_provider_param,
+            "rebase_source_partition_locator": source_partition.locator,
+            "records_per_compacted_file": records_per_compacted_file_param,
+            "s3_client_kwargs": {},
+            "source_partition_locator": rebased_partition.locator,
+            "sort_keys": sort_keys if sort_keys else None,
+        }
+    )
+
+    from deltacat.compute.compactor_v2.model.compaction_session import (
+        ExecutionCompactionResult,
+    )
+
+    execute_compaction_result_spy = mocker.spy(ExecutionCompactionResult, "__init__")
+
+    # execute
+    rcf_file_s3_uri = compact_partition_func(compact_partition_params)
+
+    # Assert not in-place compacted
+    assert (
+        execute_compaction_result_spy.call_args.args[-1] is False
+    ), "Table version erroneously marked as in-place compacted!"
+    compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
+        s3_resource, rcf_file_s3_uri
+    )
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
+    actual_rebase_compacted_table = pa.concat_tables(tables)
+    # if no primary key is specified then sort by sort_key for consistent assertion
+    sorting_cols: List[Any] = (
+        [(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
+    )
+    rebase_expected_compact_partition_result = (
+        rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
+    )
+    actual_rebase_compacted_table = (
+        actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
+    )
+    assert actual_rebase_compacted_table.equals(
+        rebase_expected_compact_partition_result
+    ), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"
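The in-place assertion in the new test above leans on pytest-mock's constructor spy: wrapping `__init__` records every call, including `self`, without changing behavior. A minimal standalone sketch of the same pattern (the `Result` class here is hypothetical, not a deltacat type):

```python
class Result:
    def __init__(self, value, is_in_place_compacted):
        self.value = value
        self.is_in_place_compacted = is_in_place_compacted


def test_spy_captures_ctor_args(mocker):
    # mocker.spy records call args while leaving __init__ functional.
    spy = mocker.spy(Result, "__init__")
    Result("compacted", False)
    # args[0] is self; the last positional argument is the in-place flag,
    # mirroring the `call_args.args[-1] is False` assertion above.
    assert spy.call_args.args[-1] is False
```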
deltacat/tests/compute/test_util_create_table_deltas_repo.py
CHANGED
@@ -251,6 +251,7 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
     ds.commit_partition(staged_partition, **ds_mock_kwargs)

     # get streams
+    # TODO: Add deltas to destination stream
     destination_table_stream: Stream = ds.get_stream(
         namespace=destination_table_namespace,
         table_name=destination_table_name,
deltacat/tests/io/test_memcached_object_store.py
CHANGED
@@ -1,6 +1,9 @@
 import unittest
 import numpy as np
 from unittest import mock
+from deltacat.exceptions import (
+    PymemcachedPutObjectError,
+)


 class MockPyMemcacheClient:
@@ -86,7 +89,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
         mock_retrying_client.return_value = mock_client.return_value
         mock_client.return_value.set_many.return_value = ["abcd"]

-        with self.assertRaises(
+        with self.assertRaises(PymemcachedPutObjectError):
             self.object_store.put_many(["a", "b"])

         self.assertEqual(1, mock_client.return_value.set_many.call_count)
@@ -169,7 +172,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
         mock_client.return_value.set.return_value = False
         mock_retrying_client.return_value = mock_client.return_value

-        with self.assertRaises(
+        with self.assertRaises(PymemcachedPutObjectError):
             self.object_store.put("test_ip")

         self.assertEqual(1, mock_client.return_value.set.call_count)
deltacat/tests/local_deltacat_storage/__init__.py
CHANGED
@@ -12,6 +12,7 @@ import io
 from deltacat.tests.test_utils.storage import create_empty_delta
 from deltacat.utils.common import current_time_ms

+
 from deltacat.storage import (
     Delta,
     DeltaLocator,
@@ -49,6 +50,10 @@ from deltacat.types.media import (
     DistributedDatasetType,
 )
 from deltacat.utils.common import ReadKwargsProvider
+from deltacat.tests.local_deltacat_storage.exceptions import (
+    InvalidNamespaceError,
+    LocalStorageValidationError,
+)

 SQLITE_CUR_ARG = "sqlite3_cur"
 SQLITE_CON_ARG = "sqlite3_con"
@@ -97,6 +102,19 @@ def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
     return f"cloudpickle://{manifest_entry_id}"


+def _merge_and_promote(
+    partition_deltas: List[Delta], previous_partition_deltas: List[Delta]
+):
+    previous_partition_deltas_spos_gt: List[Delta] = [
+        delta
+        for delta in previous_partition_deltas
+        if delta.stream_position > partition_deltas[0].stream_position
+    ]
+    # handle the case if the previous partition deltas have a greater stream position than the partition_delta
+    partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+    return partition_deltas
+
+
 def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
     res = cur.execute("SELECT * FROM namespaces")
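The `_merge_and_promote` helper added above prepends only those deltas from the previous partition revision that are strictly newer than the compacted head. A small worked example of that semantics, using a stand-in `Delta` namedtuple since only `stream_position` matters here:

```python
from collections import namedtuple

Delta = namedtuple("Delta", "stream_position")

# Deltas already on the compacted partition, newest first.
partition_deltas = [Delta(5)]
# Deltas on the previous (pre-compaction) partition revision.
previous_partition_deltas = [Delta(8), Delta(7), Delta(3)]

# Only previous deltas newer than the compacted head (positions 8 and 7)
# get promoted in front; position 3 is already covered by the compaction.
merged = [
    d
    for d in previous_partition_deltas
    if d.stream_position > partition_deltas[0].stream_position
] + partition_deltas

assert [d.stream_position for d in merged] == [8, 7, 5]
```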
@@ -820,19 +838,19 @@ def commit_partition(
         ).all_items()
         or []
     )
+
     partition_deltas: Optional[List[Delta]] = (
         list_partition_deltas(
             partition, ascending_order=False, *args, **kwargs
         ).all_items()
         or []
     )
-    previous_partition_deltas_spos_gt: List[Delta] = [
-        delta
-        for delta in previous_partition_deltas
-        if delta.stream_position > partition_deltas[0].stream_position
-    ]
-    # handle the case if the previous partition deltas have a greater stream position than the partition_delta
-    partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+
+    # if previous_partition is passed in, table is in-place compacted and we need to run merge-and-promote
+    if previous_partition:
+        partition_deltas = _merge_and_promote(
+            partition_deltas, previous_partition_deltas
+        )

     stream_position = (
         partition_deltas[0].stream_position
@@ -840,13 +858,14 @@ def commit_partition(
         else partition.stream_position
     )

-    partition.state = CommitState.COMMITTED
     partition.stream_position = stream_position
+    if partition_deltas:
+        partition.locator = partition_deltas[0].partition_locator
+
+    partition.state = CommitState.COMMITTED
     partition.previous_stream_position = (
         pv_partition.stream_position if pv_partition else None
     )
-    if partition_deltas:
-        partition.locator = partition_deltas[0].partition_locator
     params = (json.dumps(partition), partition.locator.canonical_string())
     cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
     con.commit()
@@ -1162,3 +1181,15 @@ def get_table_version_column_names(
     **kwargs,
 ) -> Optional[List[str]]:
     raise NotImplementedError("Fetching column names is not supported")
+
+
+def can_categorize(e: BaseException, **kwargs) -> bool:
+    if isinstance(e, InvalidNamespaceError):
+        return True
+    else:
+        return False
+
+
+def raise_categorized_error(e: BaseException, **kwargs):
+    if isinstance(e, InvalidNamespaceError):
+        raise LocalStorageValidationError("Namespace provided is invalid!")
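These two module-level hooks appear to be the contract through which the error categorizer delegates storage-specific exceptions to the storage implementation. A usage sketch, assuming the hooks are invoked roughly in this ask-then-reraise order:

```python
import deltacat.tests.local_deltacat_storage as ds
from deltacat.tests.local_deltacat_storage.exceptions import (
    InvalidNamespaceError,
    LocalStorageValidationError,
)

try:
    raise InvalidNamespaceError("no such namespace")
except BaseException as e:
    # First ask the storage module whether it recognizes the error,
    # then let it re-raise the error as its own categorized class.
    if ds.can_categorize(e):
        try:
            ds.raise_categorized_error(e)
        except LocalStorageValidationError as categorized:
            print(f"categorized as: {categorized!r}")
```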
deltacat/tests/test_exceptions.py
ADDED
@@ -0,0 +1,100 @@
+import unittest
+from deltacat.exceptions import categorize_errors
+import ray
+from deltacat.exceptions import (
+    DependencyPyarrowCapacityError,
+    NonRetryableDownloadTableError,
+    RetryableError,
+    NonRetryableError,
+    DeltaCatTransientError,
+    DependencyDaftTransientError,
+    UnclassifiedDeltaCatError,
+)
+from daft.exceptions import DaftTransientError
+from deltacat.tests.local_deltacat_storage.exceptions import (
+    InvalidNamespaceError,
+    LocalStorageValidationError,
+)
+from botocore.exceptions import NoCredentialsError
+from tenacity import retry, retry_if_exception_type, stop_after_attempt
+
+from pyarrow.lib import ArrowCapacityError
+import deltacat.tests.local_deltacat_storage as ds
+
+
+class MockUnknownException(Exception):
+    pass
+
+
+@categorize_errors
+def mock_raise_exception(exception_to_raise, deltacat_storage=ds):
+    raise exception_to_raise
+
+
+@retry(retry=retry_if_exception_type(NoCredentialsError), stop=stop_after_attempt(2))
+def mock_tenacity_wrapped_method(exception_to_raise):
+    mock_raise_exception(exception_to_raise)
+
+
+@ray.remote
+def mock_remote_task(exception_to_raise):
+    mock_raise_exception(exception_to_raise)
+
+
+class TestCategorizeErrors(unittest.TestCase):
+    def test_pyarrow_exception_categorizer(self):
+        self.assertRaises(
+            DependencyPyarrowCapacityError,
+            lambda: mock_raise_exception(ArrowCapacityError),
+        )
+
+    def test_storage_exception_categorizer(self):
+        self.assertRaises(
+            LocalStorageValidationError,
+            lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
+        )
+
+    def test_non_retryable_error(self):
+        self.assertRaises(
+            NonRetryableError,
+            lambda: mock_raise_exception(NonRetryableDownloadTableError),
+        )
+
+    def test_retryable_error(self):
+        self.assertRaises(RetryableError, lambda: mock_raise_exception(ConnectionError))
+
+    def test_ray_task_returns_wrapped_exception(self):
+        self.assertRaises(
+            DeltaCatTransientError,
+            lambda: ray.get(mock_remote_task.remote(ConnectionError)),
+        )
+
+    def test_daft_transient_error(self):
+        self.assertRaises(
+            DependencyDaftTransientError,
+            lambda: ray.get(mock_remote_task.remote(DaftTransientError)),
+        )
+
+    def test_tenacity_underlying_error_returned(self):
+        self.assertRaises(
+            DeltaCatTransientError,
+            lambda: mock_tenacity_wrapped_method(NoCredentialsError),
+        )
+
+    def test_unclassified_error_when_error_cannot_be_categorized(self):
+        self.assertRaises(
+            UnclassifiedDeltaCatError,
+            lambda: ray.get(mock_remote_task.remote(MockUnknownException)),
+        )
+
+    def test_deltacat_exception_contains_attributes(self):
+
+        try:
+            mock_raise_exception(ConnectionError)
+        except DeltaCatTransientError as e:
+            self.assertTrue(hasattr(e, "is_retryable"))
+            self.assertTrue(hasattr(e, "error_name"))
+            assert e.error_name == "DeltaCatTransientError"
+            return
+
+        self.assertFalse(True)
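The `categorize_errors` decorator exercised by this new test maps low-level dependency failures onto DeltaCat's retryable/non-retryable hierarchy. A rough sketch of the general decorator pattern follows; this is illustrative only, not deltacat's actual implementation, and the stand-in error class merely mimics the `is_retryable` and `error_name` attributes asserted above:

```python
import functools


class DeltaCatTransientError(Exception):
    # Stand-in attributes matching what the test above asserts on.
    is_retryable = True
    error_name = "DeltaCatTransientError"


def categorize_errors(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ConnectionError as e:
            # Map a transient low-level failure onto a retryable class,
            # preserving the original exception as the cause.
            raise DeltaCatTransientError(str(e)) from e

    return wrapper


@categorize_errors
def flaky_call():
    raise ConnectionError("socket reset")
```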
deltacat/tests/test_logs.py
CHANGED

deltacat/tests/utils/test_daft.py
CHANGED
@@ -1,7 +1,6 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
-
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
 import pyarrow as pa
deltacat/tests/utils/test_resources.py
CHANGED
@@ -1,8 +1,6 @@
 import unittest
 from unittest import mock
 import time
-from multiprocessing import Pool
-import platform


 class TestGetCurrentClusterUtilization(unittest.TestCase):
@@ -72,29 +70,3 @@ class TestProcessUtilizationOverTimeRange(unittest.TestCase):
         nu.schedule_callback(test_callback, 1)
         time.sleep(3)
         self.assertTrue(nu.test_field_set)
-
-
-class TestTimeoutDecorator(unittest.TestCase):
-    from deltacat.utils.resources import timeout
-
-    @staticmethod
-    @timeout(2)
-    def something_that_runs_xs(x, *args, **kwargs):
-        time.sleep(x)
-
-    def test_timeout(self):
-        if platform.system() != "Windows":
-            self.assertRaises(
-                TimeoutError, lambda: self.something_that_runs_xs(3, test=10)
-            )
-
-    def test_sanity_in_multiprocess(self):
-        if platform.system() != "Windows":
-            # An alarm works per process
-            # https://pubs.opengroup.org/onlinepubs/9699919799/functions/alarm.html
-            with Pool(3) as p:
-                p.map(self.something_that_runs_xs, [1, 1.1, 1.2])
-
-    def test_sanity(self):
-        if platform.system() != "Windows":
-            self.something_that_runs_xs(1, test=10)
deltacat/utils/daft.py
CHANGED
@@ -16,6 +16,7 @@ from deltacat.aws.constants import (
     BOTO_MAX_RETRIES,
     DAFT_MAX_S3_CONNECTIONS_PER_FILE,
     AWS_REGION,
+    DEFAULT_FILE_READ_TIMEOUT_MS,
 )
 from deltacat.utils.performance import timed_invocation

@@ -112,6 +113,7 @@ def daft_s3_file_to_table(
     coerce_int96_timestamp_unit = TimeUnit.from_str(
         kwargs.get("coerce_int96_timestamp_unit", "ms")
     )
+    file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)

     row_groups = None
     if (
@@ -132,6 +134,7 @@ def daft_s3_file_to_table(
         io_config=io_config,
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
         multithreaded_io=False,
+        file_timeout_ms=file_timeout_ms,
     )

     logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
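The change above threads a per-file read timeout (defaulting to the new `DEFAULT_FILE_READ_TIMEOUT_MS` constant) through to daft's parquet reader. A hedged usage sketch; the call shape below assumes the positional url plus content-type arguments and that `file_timeout_ms` reaches `daft_s3_file_to_table` via the read kwargs provider, and the bucket and timeout values are purely illustrative:

```python
from deltacat.utils.daft import daft_s3_file_to_table
from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
from deltacat.types.media import ContentType, ContentEncoding

# The provider injects file_timeout_ms into the read kwargs, which
# daft_s3_file_to_table now forwards to daft's parquet read call.
table = daft_s3_file_to_table(
    "s3://example-bucket/part-0.parquet",  # hypothetical object
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
    pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
        file_read_timeout_ms=300_000,  # give up on a single file after 5 minutes
    ),
)
```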
deltacat/utils/pyarrow.py
CHANGED
@@ -8,7 +8,7 @@ import logging
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional
 from pyarrow.parquet import ParquetFile
-from deltacat.exceptions import
+from deltacat.exceptions import ContentTypeValidationError

 import pyarrow as pa
 import numpy as np
@@ -245,6 +245,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         schema: Optional[pa.Schema] = None,
         pq_coerce_int96_timestamp_unit: Optional[str] = None,
         parquet_reader_type: Optional[str] = None,
+        file_read_timeout_ms: Optional[int] = None,
     ):
         """

@@ -258,6 +259,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         self.schema = schema
         self.pq_coerce_int96_timestamp_unit = pq_coerce_int96_timestamp_unit
         self.parquet_reader_type = parquet_reader_type
+        self.file_read_timeout_ms = file_read_timeout_ms

     def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         if content_type in DELIMITED_TEXT_CONTENT_TYPES:
@@ -282,6 +284,8 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         else:
             kwargs["reader_type"] = "daft"

+        kwargs["file_timeout_ms"] = self.file_read_timeout_ms
+
         return kwargs


@@ -476,10 +480,9 @@ def s3_file_to_parquet(
         content_type != ContentType.PARQUET.value
         or content_encoding != ContentEncoding.IDENTITY
     ):
-        raise
-        f"S3 file with content type: {content_type} and "
-
-        "into pyarrow.parquet.ParquetFile"
+        raise ContentTypeValidationError(
+            f"S3 file with content type: {content_type} and content encoding: {content_encoding} "
+            "cannot be read into pyarrow.parquet.ParquetFile"
+        )

     if s3_client_kwargs is None:
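A small sketch of how the widened provider surfaces the timeout; this calls the private `_get_kwargs` shown in the hunk above directly for illustration, and assumes the parquet branch behaves as the diff suggests (defaulting the reader type to daft when none is given):

```python
from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
from deltacat.types.media import ContentType

provider = ReadKwargsProviderPyArrowSchemaOverride(
    file_read_timeout_ms=60_000,  # illustrative one-minute cap
)
read_kwargs = provider._get_kwargs(ContentType.PARQUET.value, {})
# The provider now emits the per-file timeout alongside the reader type,
# which daft_s3_file_to_table pops off downstream.
assert read_kwargs["reader_type"] == "daft"
assert read_kwargs["file_timeout_ms"] == 60_000
```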
deltacat/utils/ray_utils/runtime.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
 import time
-from typing import Any, Callable, Dict, List
+from typing import Any, Callable, Dict, List, Optional

 import ray

@@ -120,7 +120,7 @@ def log_cluster_resources() -> None:
     logger.info(f"Cluster Nodes: {ray.nodes()}")


-def get_current_ray_task_id() -> str:
+def get_current_ray_task_id() -> Optional[str]:
     return ray.get_runtime_context().get_task_id()
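The widened return type reflects documented ray behavior: `get_task_id()` returns None when called from the driver rather than from inside a task or actor. A short illustration:

```python
import ray

ray.init(ignore_reinit_error=True)

# On the driver there is no current task, hence Optional[str].
assert ray.get_runtime_context().get_task_id() is None


@ray.remote
def whoami():
    # Inside a remote task the id is a hex string.
    return ray.get_runtime_context().get_task_id()


assert isinstance(ray.get(whoami.remote()), str)
```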