deltacat 1.1.8__py3-none-any.whl → 1.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +6 -0
  3. deltacat/aws/s3u.py +46 -25
  4. deltacat/compute/compactor/model/compact_partition_params.py +12 -1
  5. deltacat/compute/compactor/model/materialize_result.py +0 -4
  6. deltacat/compute/compactor_v2/compaction_session.py +11 -5
  7. deltacat/compute/compactor_v2/constants.py +2 -11
  8. deltacat/compute/compactor_v2/model/merge_input.py +6 -0
  9. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
  10. deltacat/compute/compactor_v2/steps/merge.py +12 -12
  11. deltacat/compute/compactor_v2/utils/merge.py +1 -0
  12. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  13. deltacat/compute/compactor_v2/utils/task_options.py +2 -12
  14. deltacat/exceptions.py +342 -7
  15. deltacat/io/memcached_object_store.py +7 -4
  16. deltacat/storage/interface.py +14 -0
  17. deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
  18. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
  19. deltacat/tests/compute/compact_partition_test_cases.py +4 -2
  20. deltacat/tests/compute/compactor_v2/test_compaction_session.py +3 -1
  21. deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
  22. deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
  23. deltacat/tests/io/test_memcached_object_store.py +5 -2
  24. deltacat/tests/local_deltacat_storage/__init__.py +41 -10
  25. deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
  26. deltacat/tests/test_exceptions.py +100 -0
  27. deltacat/tests/test_logs.py +1 -0
  28. deltacat/tests/utils/test_daft.py +0 -1
  29. deltacat/tests/utils/test_resources.py +0 -28
  30. deltacat/utils/daft.py +3 -0
  31. deltacat/utils/pyarrow.py +8 -5
  32. deltacat/utils/ray_utils/runtime.py +2 -2
  33. deltacat/utils/resources.py +0 -45
  34. {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/METADATA +2 -2
  35. {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/RECORD +38 -34
  36. {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/LICENSE +0 -0
  37. {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/WHEEL +0 -0
  38. {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/test_compact_partition_rebase.py ADDED
@@ -0,0 +1,289 @@
+ import ray
+ import os
+ from moto import mock_s3
+ import pytest
+ import boto3
+ from boto3.resources.base import ServiceResource
+ import pyarrow as pa
+ from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+ from pytest_benchmark.fixture import BenchmarkFixture
+
+ from deltacat.tests.compute.test_util_constant import (
+     TEST_S3_RCF_BUCKET_NAME,
+     DEFAULT_NUM_WORKERS,
+     DEFAULT_WORKER_INSTANCE_CPUS,
+ )
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+ from deltacat.tests.compute.test_util_common import (
+     get_compacted_delta_locator_from_rcf,
+ )
+ from deltacat.tests.compute.test_util_create_table_deltas_repo import (
+     create_src_w_deltas_destination_rebase_w_deltas_strategy,
+ )
+ from deltacat.tests.compute.compact_partition_rebase_test_cases import (
+     REBASE_TEST_CASES,
+ )
+ from typing import Any, Callable, Dict, List, Optional, Set
+ from deltacat.types.media import StorageType
+ from deltacat.storage import (
+     DeltaLocator,
+     Partition,
+ )
+ from deltacat.types.media import ContentType
+ from deltacat.compute.compactor.model.compact_partition_params import (
+     CompactPartitionParams,
+ )
+ from deltacat.utils.placement import (
+     PlacementGroupManager,
+ )
+
+ DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
+     "db_file_path",
+     "deltacat/tests/local_deltacat_storage/db_test.sqlite",
+ )
+
+
+ """
+ MODULE scoped fixtures
+ """
+
+
+ @pytest.fixture(autouse=True, scope="module")
+ def setup_ray_cluster():
+     ray.init(local_mode=True, ignore_reinit_error=True)
+     yield
+     ray.shutdown()
+
+
+ @pytest.fixture(autouse=True, scope="module")
+ def mock_aws_credential():
+     os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+     os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+     os.environ["AWS_SECURITY_TOKEN"] = "testing"
+     os.environ["AWS_SESSION_TOKEN"] = "testing"
+     os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+     yield
+
+
+ @pytest.fixture(autouse=True, scope="module")
+ def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
+     # make sure the database file is deleted after all the compactor package tests are completed
+     if os.path.exists(DATABASE_FILE_PATH_VALUE):
+         os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+ @pytest.fixture(scope="module")
+ def s3_resource(mock_aws_credential):
+     with mock_s3():
+         yield boto3.resource("s3")
+
+
+ @pytest.fixture(autouse=True, scope="module")
+ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
+     s3_resource.create_bucket(
+         ACL="authenticated-read",
+         Bucket=TEST_S3_RCF_BUCKET_NAME,
+     )
+     yield
+
+
+ """
+ FUNCTION scoped fixtures
+ """
+
+
+ @pytest.fixture(scope="function")
+ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
+     # see deltacat/tests/local_deltacat_storage/README.md for documentation
+     kwargs_for_local_deltacat_storage: Dict[str, Any] = {
+         DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
+     }
+     yield kwargs_for_local_deltacat_storage
+     if os.path.exists(DATABASE_FILE_PATH_VALUE):
+         os.remove(DATABASE_FILE_PATH_VALUE)
+
+
+ @pytest.mark.parametrize(
+     [
+         "test_name",
+         "primary_keys",
+         "sort_keys",
+         "partition_keys_param",
+         "partition_values_param",
+         "input_deltas_param",
+         "input_deltas_delta_type",
+         "expected_terminal_compact_partition_result",
+         "expected_terminal_exception",
+         "expected_terminal_exception_message",
+         "create_placement_group_param",
+         "records_per_compacted_file_param",
+         "hash_bucket_count_param",
+         "read_kwargs_provider_param",
+         "drop_duplicates_param",
+         "skip_enabled_compact_partition_drivers",
+         "rebase_expected_compact_partition_result",
+         "compact_partition_func",
+     ],
+     [
+         (
+             test_name,
+             primary_keys,
+             sort_keys,
+             partition_keys_param,
+             partition_values_param,
+             input_deltas,
+             input_deltas_delta_type,
+             expected_terminal_compact_partition_result,
+             expected_terminal_exception,
+             expected_terminal_exception_message,
+             create_placement_group_param,
+             records_per_compacted_file_param,
+             hash_bucket_count_param,
+             drop_duplicates_param,
+             read_kwargs_provider,
+             skip_enabled_compact_partition_drivers,
+             rebase_expected_compact_partition_result,
+             compact_partition_func,
+         )
+         for test_name, (
+             primary_keys,
+             sort_keys,
+             partition_keys_param,
+             partition_values_param,
+             input_deltas,
+             input_deltas_delta_type,
+             expected_terminal_compact_partition_result,
+             expected_terminal_exception,
+             expected_terminal_exception_message,
+             create_placement_group_param,
+             records_per_compacted_file_param,
+             hash_bucket_count_param,
+             drop_duplicates_param,
+             read_kwargs_provider,
+             skip_enabled_compact_partition_drivers,
+             rebase_expected_compact_partition_result,
+             compact_partition_func,
+         ) in REBASE_TEST_CASES.items()
+     ],
+     ids=[test_name for test_name in REBASE_TEST_CASES],
+ )
+ def test_compact_partition_rebase_same_source_and_destination(
+     mocker,
+     s3_resource: ServiceResource,
+     local_deltacat_storage_kwargs: Dict[str, Any],
+     test_name: str,
+     primary_keys: Set[str],
+     sort_keys: List[Optional[Any]],
+     partition_keys_param: Optional[List[Any]],
+     partition_values_param: List[Optional[str]],
+     input_deltas_param: List[pa.Array],
+     input_deltas_delta_type: str,
+     expected_terminal_compact_partition_result: pa.Table,
+     expected_terminal_exception: BaseException,
+     expected_terminal_exception_message: Optional[str],
+     create_placement_group_param: bool,
+     records_per_compacted_file_param: int,
+     hash_bucket_count_param: int,
+     drop_duplicates_param: bool,
+     read_kwargs_provider_param: Any,
+     rebase_expected_compact_partition_result: pa.Table,
+     skip_enabled_compact_partition_drivers: List[CompactorVersion],
+     compact_partition_func: Callable,
+     benchmark: BenchmarkFixture,
+ ):
+     import deltacat.tests.local_deltacat_storage as ds
+
+     ds_mock_kwargs = local_deltacat_storage_kwargs
+     """
+     This test tests the scenario where source partition locator == destination partition locator,
+     but rebase source partition locator is different.
+     This scenario could occur when hash bucket count changes.
+     """
+     partition_keys = partition_keys_param
+     (
+         source_table_stream,
+         _,
+         rebased_table_stream,
+     ) = create_src_w_deltas_destination_rebase_w_deltas_strategy(
+         primary_keys,
+         sort_keys,
+         partition_keys,
+         input_deltas_param,
+         input_deltas_delta_type,
+         partition_values_param,
+         ds_mock_kwargs,
+     )
+     source_partition: Partition = ds.get_partition(
+         source_table_stream.locator,
+         partition_values_param,
+         **ds_mock_kwargs,
+     )
+     rebased_partition: Partition = ds.get_partition(
+         rebased_table_stream.locator,
+         partition_values_param,
+         **ds_mock_kwargs,
+     )
+     num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
+     total_cpus = num_workers * worker_instance_cpu
+     pgm = None
+     create_placement_group_param = False
+     if create_placement_group_param:
+         pgm = PlacementGroupManager(
+             1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
+         ).pgs[0]
+     compact_partition_params = CompactPartitionParams.of(
+         {
+             "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+             "compacted_file_content_type": ContentType.PARQUET,
+             "dd_max_parallelism_ratio": 1.0,
+             "deltacat_storage": ds,
+             "deltacat_storage_kwargs": ds_mock_kwargs,
+             "destination_partition_locator": rebased_partition.locator,
+             "hash_bucket_count": hash_bucket_count_param,
+             "last_stream_position_to_compact": source_partition.stream_position,
+             "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
+             "object_store": RayPlasmaObjectStore(),
+             "pg_config": pgm,
+             "primary_keys": primary_keys,
+             "read_kwargs_provider": read_kwargs_provider_param,
+             "rebase_source_partition_locator": source_partition.locator,
+             "records_per_compacted_file": records_per_compacted_file_param,
+             "s3_client_kwargs": {},
+             "source_partition_locator": rebased_partition.locator,
+             "sort_keys": sort_keys if sort_keys else None,
+         }
+     )
+
+     from deltacat.compute.compactor_v2.model.compaction_session import (
+         ExecutionCompactionResult,
+     )
+
+     execute_compaction_result_spy = mocker.spy(ExecutionCompactionResult, "__init__")
+
+     # execute
+     rcf_file_s3_uri = compact_partition_func(compact_partition_params)
+
+     # Assert not in-place compacted
+     assert (
+         execute_compaction_result_spy.call_args.args[-1] is False
+     ), "Table version erroneously marked as in-place compacted!"
+     compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
+         s3_resource, rcf_file_s3_uri
+     )
+     tables = ds.download_delta(
+         compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+     )
+     actual_rebase_compacted_table = pa.concat_tables(tables)
+     # if no primary key is specified then sort by sort_key for consistent assertion
+     sorting_cols: List[Any] = (
+         [(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
+     )
+     rebase_expected_compact_partition_result = (
+         rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
+     )
+     actual_rebase_compacted_table = (
+         actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
+     )
+     assert actual_rebase_compacted_table.equals(
+         rebase_expected_compact_partition_result
+     ), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"
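Note: the assertion at the end of the new test compares tables only after normalizing them with combine_chunks().sort_by(...). A minimal standalone sketch of that pyarrow pattern, using hypothetical data that is not part of this diff:

import pyarrow as pa

# two logically identical tables whose chunk layout and row order differ
t1 = pa.table({"pk": [3, 1, 2], "val": ["c", "a", "b"]})
t2 = pa.concat_tables(
    [pa.table({"pk": [1], "val": ["a"]}), pa.table({"pk": [2, 3], "val": ["b", "c"]})]
)

sorting_cols = [("pk", "ascending")]
assert t1.combine_chunks().sort_by(sorting_cols).equals(
    t2.combine_chunks().sort_by(sorting_cols)
)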
deltacat/tests/compute/test_util_create_table_deltas_repo.py CHANGED
@@ -251,6 +251,7 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
      ds.commit_partition(staged_partition, **ds_mock_kwargs)

      # get streams
+     # TODO: Add deltas to destination stream
      destination_table_stream: Stream = ds.get_stream(
          namespace=destination_table_namespace,
          table_name=destination_table_name,
deltacat/tests/io/test_memcached_object_store.py CHANGED
@@ -1,6 +1,9 @@
  import unittest
  import numpy as np
  from unittest import mock
+ from deltacat.exceptions import (
+     PymemcachedPutObjectError,
+ )


  class MockPyMemcacheClient:
@@ -86,7 +89,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
          mock_retrying_client.return_value = mock_client.return_value
          mock_client.return_value.set_many.return_value = ["abcd"]

-         with self.assertRaises(RuntimeError):
+         with self.assertRaises(PymemcachedPutObjectError):
              self.object_store.put_many(["a", "b"])

          self.assertEqual(1, mock_client.return_value.set_many.call_count)
@@ -169,7 +172,7 @@ class TestMemcachedObjectStore(unittest.TestCase):
          mock_client.return_value.set.return_value = False
          mock_retrying_client.return_value = mock_client.return_value

-         with self.assertRaises(RuntimeError):
+         with self.assertRaises(PymemcachedPutObjectError):
              self.object_store.put("test_ip")

          self.assertEqual(1, mock_client.return_value.set.call_count)
deltacat/tests/local_deltacat_storage/__init__.py CHANGED
@@ -12,6 +12,7 @@ import io
  from deltacat.tests.test_utils.storage import create_empty_delta
  from deltacat.utils.common import current_time_ms

+
  from deltacat.storage import (
      Delta,
      DeltaLocator,
@@ -49,6 +50,10 @@ from deltacat.types.media import (
      DistributedDatasetType,
  )
  from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.tests.local_deltacat_storage.exceptions import (
+     InvalidNamespaceError,
+     LocalStorageValidationError,
+ )

  SQLITE_CUR_ARG = "sqlite3_cur"
  SQLITE_CON_ARG = "sqlite3_con"
@@ -97,6 +102,19 @@ def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
      return f"cloudpickle://{manifest_entry_id}"


+ def _merge_and_promote(
+     partition_deltas: List[Delta], previous_partition_deltas: List[Delta]
+ ):
+     previous_partition_deltas_spos_gt: List[Delta] = [
+         delta
+         for delta in previous_partition_deltas
+         if delta.stream_position > partition_deltas[0].stream_position
+     ]
+     # handle the case if the previous partition deltas have a greater stream position than the partition_delta
+     partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+     return partition_deltas
+
+
  def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
      cur, con = _get_sqlite3_cursor_con(kwargs)
      res = cur.execute("SELECT * FROM namespaces")
@@ -820,19 +838,19 @@ def commit_partition(
          ).all_items()
          or []
      )
+
      partition_deltas: Optional[List[Delta]] = (
          list_partition_deltas(
              partition, ascending_order=False, *args, **kwargs
          ).all_items()
          or []
      )
-     previous_partition_deltas_spos_gt: List[Delta] = [
-         delta
-         for delta in previous_partition_deltas
-         if delta.stream_position > partition_deltas[0].stream_position
-     ]
-     # handle the case if the previous partition deltas have a greater stream position than the partition_delta
-     partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+
+     # if previous_partition is passed in, table is in-place compacted and we need to run merge-and-promote
+     if previous_partition:
+         partition_deltas = _merge_and_promote(
+             partition_deltas, previous_partition_deltas
+         )

      stream_position = (
          partition_deltas[0].stream_position
@@ -840,13 +858,14 @@
          else partition.stream_position
      )

-     partition.state = CommitState.COMMITTED
      partition.stream_position = stream_position
+     if partition_deltas:
+         partition.locator = partition_deltas[0].partition_locator
+
+     partition.state = CommitState.COMMITTED
      partition.previous_stream_position = (
          pv_partition.stream_position if pv_partition else None
      )
-     if partition_deltas:
-         partition.locator = partition_deltas[0].partition_locator
      params = (json.dumps(partition), partition.locator.canonical_string())
      cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
      con.commit()
@@ -1162,3 +1181,15 @@ def get_table_version_column_names(
      **kwargs,
  ) -> Optional[List[str]]:
      raise NotImplementedError("Fetching column names is not supported")
+
+
+ def can_categorize(e: BaseException, **kwargs) -> bool:
+     if isinstance(e, InvalidNamespaceError):
+         return True
+     else:
+         return False
+
+
+ def raise_categorized_error(e: BaseException, **kwargs):
+     if isinstance(e, InvalidNamespaceError):
+         raise LocalStorageValidationError("Namespace provided is invalid!")
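Note: commit_partition now routes delta promotion through the new _merge_and_promote helper, and only when previous_partition is supplied (the in-place compaction path). A small sketch of the ordering the helper produces, using a hypothetical stand-in for Delta rather than the real storage model:

from dataclasses import dataclass


@dataclass
class FakeDelta:
    stream_position: int


def merge_and_promote(partition_deltas, previous_partition_deltas):
    # keep only previous deltas newer than the newest partition delta
    # (partition_deltas is listed in descending stream-position order)
    newer = [
        d
        for d in previous_partition_deltas
        if d.stream_position > partition_deltas[0].stream_position
    ]
    return newer + partition_deltas


deltas = [FakeDelta(5), FakeDelta(3)]      # newest first
previous = [FakeDelta(7), FakeDelta(4)]
assert [d.stream_position for d in merge_and_promote(deltas, previous)] == [7, 5, 3]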
deltacat/tests/local_deltacat_storage/exceptions.py ADDED
@@ -0,0 +1,10 @@
+ class InvalidNamespaceError(Exception):
+     error_name = "InvalidNamespaceError"
+
+
+ class LocalStorageValidationError(Exception):
+     error_name = "LocalStorageValidationError"
+
+
+ class LocalStorageError(Exception):
+     error_name = "LocalStorageError"
deltacat/tests/test_exceptions.py ADDED
@@ -0,0 +1,100 @@
+ import unittest
+ from deltacat.exceptions import categorize_errors
+ import ray
+ from deltacat.exceptions import (
+     DependencyPyarrowCapacityError,
+     NonRetryableDownloadTableError,
+     RetryableError,
+     NonRetryableError,
+     DeltaCatTransientError,
+     DependencyDaftTransientError,
+     UnclassifiedDeltaCatError,
+ )
+ from daft.exceptions import DaftTransientError
+ from deltacat.tests.local_deltacat_storage.exceptions import (
+     InvalidNamespaceError,
+     LocalStorageValidationError,
+ )
+ from botocore.exceptions import NoCredentialsError
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt
+
+ from pyarrow.lib import ArrowCapacityError
+ import deltacat.tests.local_deltacat_storage as ds
+
+
+ class MockUnknownException(Exception):
+     pass
+
+
+ @categorize_errors
+ def mock_raise_exception(exception_to_raise, deltacat_storage=ds):
+     raise exception_to_raise
+
+
+ @retry(retry=retry_if_exception_type(NoCredentialsError), stop=stop_after_attempt(2))
+ def mock_tenacity_wrapped_method(exception_to_raise):
+     mock_raise_exception(exception_to_raise)
+
+
+ @ray.remote
+ def mock_remote_task(exception_to_raise):
+     mock_raise_exception(exception_to_raise)
+
+
+ class TestCategorizeErrors(unittest.TestCase):
+     def test_pyarrow_exception_categorizer(self):
+         self.assertRaises(
+             DependencyPyarrowCapacityError,
+             lambda: mock_raise_exception(ArrowCapacityError),
+         )
+
+     def test_storage_exception_categorizer(self):
+         self.assertRaises(
+             LocalStorageValidationError,
+             lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
+         )
+
+     def test_non_retryable_error(self):
+         self.assertRaises(
+             NonRetryableError,
+             lambda: mock_raise_exception(NonRetryableDownloadTableError),
+         )
+
+     def test_retryable_error(self):
+         self.assertRaises(RetryableError, lambda: mock_raise_exception(ConnectionError))
+
+     def test_ray_task_returns_wrapped_exception(self):
+         self.assertRaises(
+             DeltaCatTransientError,
+             lambda: ray.get(mock_remote_task.remote(ConnectionError)),
+         )
+
+     def test_daft_transient_error(self):
+         self.assertRaises(
+             DependencyDaftTransientError,
+             lambda: ray.get(mock_remote_task.remote(DaftTransientError)),
+         )
+
+     def test_tenacity_underlying_error_returned(self):
+         self.assertRaises(
+             DeltaCatTransientError,
+             lambda: mock_tenacity_wrapped_method(NoCredentialsError),
+         )
+
+     def test_unclassified_error_when_error_cannot_be_categorized(self):
+         self.assertRaises(
+             UnclassifiedDeltaCatError,
+             lambda: ray.get(mock_remote_task.remote(MockUnknownException)),
+         )
+
+     def test_deltacat_exception_contains_attributes(self):
+
+         try:
+             mock_raise_exception(ConnectionError)
+         except DeltaCatTransientError as e:
+             self.assertTrue(hasattr(e, "is_retryable"))
+             self.assertTrue(hasattr(e, "error_name"))
+             assert e.error_name == "DeltaCatTransientError"
+             return
+
+         self.assertFalse(True)
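Note: the tests above exercise the new categorize_errors decorator together with the can_categorize / raise_categorized_error hooks added to the local storage module. The sketch below is a simplified illustration of that hook contract only, not the actual deltacat.exceptions implementation:

import functools


def categorize_errors_sketch(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except BaseException as e:
            storage = kwargs.get("deltacat_storage")
            # give the pluggable storage module a chance to translate its own errors
            if storage is not None and storage.can_categorize(e):
                storage.raise_categorized_error(e)
            raise  # re-raise unchanged if nothing could categorize it

    return wrapper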
deltacat/tests/test_logs.py CHANGED
@@ -38,6 +38,7 @@ class TestJsonFormatter(unittest.TestCase):
          self.assertEqual({"message": "test_message"}, result)

      def test_format_sanity(self):
+         ray.shutdown()
          formatter = JsonFormatter({"message": "msg"})

          record = LogRecord(
deltacat/tests/utils/test_daft.py CHANGED
@@ -1,7 +1,6 @@
  import unittest
  from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
-
  from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
  from deltacat.types.partial_download import PartialParquetParameters
  import pyarrow as pa
deltacat/tests/utils/test_resources.py CHANGED
@@ -1,8 +1,6 @@
  import unittest
  from unittest import mock
  import time
- from multiprocessing import Pool
- import platform


  class TestGetCurrentClusterUtilization(unittest.TestCase):
@@ -72,29 +70,3 @@ class TestProcessUtilizationOverTimeRange(unittest.TestCase):
          nu.schedule_callback(test_callback, 1)
          time.sleep(3)
          self.assertTrue(nu.test_field_set)
-
-
- class TestTimeoutDecorator(unittest.TestCase):
-     from deltacat.utils.resources import timeout
-
-     @staticmethod
-     @timeout(2)
-     def something_that_runs_xs(x, *args, **kwargs):
-         time.sleep(x)
-
-     def test_timeout(self):
-         if platform.system() != "Windows":
-             self.assertRaises(
-                 TimeoutError, lambda: self.something_that_runs_xs(3, test=10)
-             )
-
-     def test_sanity_in_multiprocess(self):
-         if platform.system() != "Windows":
-             # An alarm works per process
-             # https://pubs.opengroup.org/onlinepubs/9699919799/functions/alarm.html
-             with Pool(3) as p:
-                 p.map(self.something_that_runs_xs, [1, 1.1, 1.2])
-
-     def test_sanity(self):
-         if platform.system() != "Windows":
-             self.something_that_runs_xs(1, test=10)
deltacat/utils/daft.py CHANGED
@@ -16,6 +16,7 @@ from deltacat.aws.constants import (
      BOTO_MAX_RETRIES,
      DAFT_MAX_S3_CONNECTIONS_PER_FILE,
      AWS_REGION,
+     DEFAULT_FILE_READ_TIMEOUT_MS,
  )
  from deltacat.utils.performance import timed_invocation

@@ -112,6 +113,7 @@
      coerce_int96_timestamp_unit = TimeUnit.from_str(
          kwargs.get("coerce_int96_timestamp_unit", "ms")
      )
+     file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)

      row_groups = None
      if (
@@ -132,6 +134,7 @@
          io_config=io_config,
          coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
          multithreaded_io=False,
+         file_timeout_ms=file_timeout_ms,
      )

      logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
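Note: daft_s3_file_to_table now resolves a per-file read timeout from its kwargs and forwards it to the daft parquet reader. A minimal illustration of that kwargs plumbing; the default value below is hypothetical, the real one lives in deltacat.aws.constants:

DEFAULT_FILE_READ_TIMEOUT_MS = 300_000  # hypothetical default, for illustration only


def resolve_file_timeout(**kwargs):
    # mirrors the lookup added above: an explicit file_timeout_ms wins,
    # otherwise fall back to the package-level default
    return kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)


assert resolve_file_timeout() == 300_000
assert resolve_file_timeout(file_timeout_ms=30_000) == 30_000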
deltacat/utils/pyarrow.py CHANGED
@@ -8,7 +8,7 @@ import logging
  from functools import partial
  from typing import Any, Callable, Dict, Iterable, List, Optional
  from pyarrow.parquet import ParquetFile
- from deltacat.exceptions import ValidationError
+ from deltacat.exceptions import ContentTypeValidationError

  import pyarrow as pa
  import numpy as np
@@ -245,6 +245,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
          schema: Optional[pa.Schema] = None,
          pq_coerce_int96_timestamp_unit: Optional[str] = None,
          parquet_reader_type: Optional[str] = None,
+         file_read_timeout_ms: Optional[int] = None,
      ):
          """

@@ -258,6 +259,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
          self.schema = schema
          self.pq_coerce_int96_timestamp_unit = pq_coerce_int96_timestamp_unit
          self.parquet_reader_type = parquet_reader_type
+         self.file_read_timeout_ms = file_read_timeout_ms

      def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
          if content_type in DELIMITED_TEXT_CONTENT_TYPES:
@@ -282,6 +284,8 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
              else:
                  kwargs["reader_type"] = "daft"

+         kwargs["file_timeout_ms"] = self.file_read_timeout_ms
+
          return kwargs


@@ -476,10 +480,9 @@ def s3_file_to_parquet(
          content_type != ContentType.PARQUET.value
          or content_encoding != ContentEncoding.IDENTITY
      ):
-         raise ValidationError(
-             f"S3 file with content type: {content_type} and "
-             f"content encoding: {content_encoding} cannot be read"
-             "into pyarrow.parquet.ParquetFile"
+         raise ContentTypeValidationError(
+             f"S3 file with content type: {content_type} and content encoding: {content_encoding} "
+             "cannot be read into pyarrow.parquet.ParquetFile"
          )

      if s3_client_kwargs is None:
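Note: ReadKwargsProviderPyArrowSchemaOverride gains an optional file_read_timeout_ms that _get_kwargs forwards to the daft reader as file_timeout_ms. A hedged construction sketch with an assumed 30-second value:

from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride

provider = ReadKwargsProviderPyArrowSchemaOverride(
    parquet_reader_type="daft",
    file_read_timeout_ms=30_000,  # assumed value, in milliseconds
)
assert provider.file_read_timeout_ms == 30_000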
deltacat/utils/ray_utils/runtime.py CHANGED
@@ -1,6 +1,6 @@
  import logging
  import time
- from typing import Any, Callable, Dict, List
+ from typing import Any, Callable, Dict, List, Optional

  import ray

@@ -120,7 +120,7 @@ def log_cluster_resources() -> None:
      logger.info(f"Cluster Nodes: {ray.nodes()}")


- def get_current_ray_task_id() -> str:
+ def get_current_ray_task_id() -> Optional[str]:
      return ray.get_runtime_context().get_task_id()

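Note: get_current_ray_task_id is now typed Optional[str] because Ray's runtime context returns None for the task id when called outside a task (for example, on the driver). A short sketch of guarding against that, assuming a local Ray session:

import ray
from deltacat.utils.ray_utils.runtime import get_current_ray_task_id

ray.init(ignore_reinit_error=True)
task_id = get_current_ray_task_id()  # None when invoked from the driver process
label = task_id if task_id is not None else "driver"
print(label)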