deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl
This diff compares publicly available package versions released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +65 -38
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +26 -16
- deltacat/compute/compactor_v2/constants.py +5 -11
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/dataset.py +5 -17
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +56 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +124 -29
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +11 -8
- deltacat/utils/ray_utils/dataset.py +7 -7
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/tests/local_deltacat_storage/__init__.py
CHANGED
@@ -12,6 +12,7 @@ import io
 from deltacat.tests.test_utils.storage import create_empty_delta
 from deltacat.utils.common import current_time_ms
 
+
 from deltacat.storage import (
     Delta,
     DeltaLocator,
@@ -40,6 +41,12 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
     DeleteParameters,
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
+    TransformName,
+    IdentityTransformParameters,
 )
 from deltacat.types.media import (
     ContentType,
@@ -49,6 +56,10 @@ from deltacat.types.media import (
     DistributedDatasetType,
 )
 from deltacat.utils.common import ReadKwargsProvider
+from deltacat.tests.local_deltacat_storage.exceptions import (
+    InvalidNamespaceError,
+    LocalStorageValidationError,
+)
 
 SQLITE_CUR_ARG = "sqlite3_cur"
 SQLITE_CON_ARG = "sqlite3_con"
@@ -97,6 +108,19 @@ def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
     return f"cloudpickle://{manifest_entry_id}"
 
 
+def _merge_and_promote(
+    partition_deltas: List[Delta], previous_partition_deltas: List[Delta]
+):
+    previous_partition_deltas_spos_gt: List[Delta] = [
+        delta
+        for delta in previous_partition_deltas
+        if delta.stream_position > partition_deltas[0].stream_position
+    ]
+    # handle the case if the previous partition deltas have a greater stream position than the partition_delta
+    partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+    return partition_deltas
+
+
 def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
     res = cur.execute("SELECT * FROM namespaces")
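Note: the _merge_and_promote helper added above is consumed later by commit_partition (see the hunk at old line 820). The following is a quick standalone illustration of its ordering rule, using hypothetical SimpleNamespace stand-ins rather than deltacat's Delta model; only the stream_position attribute visible in the diff is assumed.

# Hypothetical Delta stand-ins; deltas are listed newest-first (ascending_order=False).
from types import SimpleNamespace

new_partition_deltas = [SimpleNamespace(stream_position=10)]
previous_partition_deltas = [
    SimpleNamespace(stream_position=12),  # newer than the new partition's head delta
    SimpleNamespace(stream_position=8),  # older, so it is not promoted
]

# Same rule as _merge_and_promote: promote only previous-partition deltas whose
# stream position exceeds the head of the new partition's deltas.
promoted = [
    d
    for d in previous_partition_deltas
    if d.stream_position > new_partition_deltas[0].stream_position
]
merged = promoted + new_partition_deltas
assert [d.stream_position for d in merged] == [12, 10]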
@@ -176,12 +200,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
 def list_deltas(
     namespace: str,
     table_name: str,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     first_stream_position: Optional[int] = None,
     last_stream_position: Optional[int] = None,
     ascending_order: Optional[bool] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> ListResult[Delta]:
@@ -189,6 +214,13 @@ def list_deltas(
     if stream is None:
         return ListResult.of([], None, None)
 
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
 
     all_deltas = list_partition_deltas(
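Note: the partition_values / partition_filter resolution added to list_deltas above is repeated in get_delta, get_latest_delta, and download_delta below: a caller may supply either argument but not both, and a filter is reduced to its partition values. A minimal standalone sketch of that rule follows; FakePartitionFilter is a hypothetical stand-in for deltacat's PartitionFilter, and only the partition_values attribute shown in the diff is assumed.

from typing import Any, List, Optional


class FakePartitionFilter:
    # stand-in for deltacat.storage.PartitionFilter; only the attribute used in the diff
    def __init__(self, partition_values: Optional[List[Any]] = None):
        self.partition_values = partition_values


def resolve_partition_values(
    partition_values: Optional[List[Any]] = None,
    partition_filter: Optional[FakePartitionFilter] = None,
) -> Optional[List[Any]]:
    # mutual exclusion, mirroring the validation added in the hunks above
    if partition_values is not None and partition_filter is not None:
        raise ValueError(
            "Only one of partition_values or partition_filter must be provided"
        )
    if partition_filter is not None:
        return partition_filter.partition_values
    return partition_values


assert resolve_partition_values(
    partition_filter=FakePartitionFilter(["2024-01-01"])
) == ["2024-01-01"]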
@@ -279,15 +311,25 @@ def get_delta(
     namespace: str,
     table_name: str,
     stream_position: int,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
     stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
+
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
     delta_locator = DeltaLocator.of(partition.locator, stream_position)
 
@@ -310,22 +352,24 @@ def get_delta(
 def get_latest_delta(
     namespace: str,
     table_name: str,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
 
     deltas = list_deltas(
-        namespace,
-        table_name,
-        partition_values,
-        table_version,
-        None,
-        None,
-        False,
-        include_manifest,
+        namespace=namespace,
+        table_name=table_name,
+        partition_values=partition_values,
+        table_version=table_version,
+        first_stream_position=None,
+        last_stream_position=None,
+        ascending_order=False,
+        include_manifest=include_manifest,
+        partition_filter=partition_filter,
         *args,
         **kwargs,
     ).all_items()
@@ -345,13 +389,24 @@ def download_delta(
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     result = []
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
 
+    partition_values: PartitionValues = None
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     for entry_index in range(len(manifest.entries)):
+        if (
+            partition_values is not None
+            and partition_values != manifest.entries[entry_index].meta.partition_values
+        ):
+            continue
+
         result.append(
             download_delta_manifest_entry(
                 delta_like=delta_like,
@@ -506,11 +561,29 @@ def create_table_version(
     table_description: Optional[str] = None,
     table_properties: Optional[Dict[str, str]] = None,
     supported_content_types: Optional[List[ContentType]] = None,
+    partition_spec: Optional[StreamPartitionSpec] = None,
     *args,
     **kwargs,
 ) -> Stream:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
+    if partition_keys is not None and partition_spec is not None:
+        raise ValueError(
+            "Only one of partition_keys or partition_spec must be provided"
+        )
+    if partition_spec is not None:
+        assert (
+            partition_spec.ordered_transforms is not None
+        ), "Ordered transforms must be specified when partition_spec is specified"
+        partition_keys = []
+        for transform in partition_spec.ordered_transforms:
+            assert transform.name == TransformName.IDENTITY, (
+                "Local DeltaCAT storage does not support creating table versions "
+                "with non identity transform partition spec"
+            )
+            transform_params: IdentityTransformParameters = transform.parameters
+            partition_keys.append(transform_params.column_name)
+
     latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
     if (
         table_version is not None
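Note: as the hunk above shows, create_table_version accepts the new StreamPartitionSpec only when every transform is an identity transform, and it flattens the spec back into plain partition key names. An illustrative stand-in version of that flattening loop follows; the dataclasses and the IDENTITY constant below are hypothetical substitutes for deltacat's transform model, not its real classes.

from dataclasses import dataclass
from typing import List

IDENTITY = "identity"  # stands in for TransformName.IDENTITY


@dataclass
class FakeIdentityTransformParameters:
    column_name: str


@dataclass
class FakeTransform:
    name: str
    parameters: FakeIdentityTransformParameters


ordered_transforms: List[FakeTransform] = [
    FakeTransform(IDENTITY, FakeIdentityTransformParameters("region")),
    FakeTransform(IDENTITY, FakeIdentityTransformParameters("day")),
]

partition_keys = []
for transform in ordered_transforms:
    # non-identity transforms are rejected by the local storage implementation
    assert transform.name == IDENTITY
    partition_keys.append(transform.parameters.column_name)

assert partition_keys == ["region", "day"]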
@@ -758,7 +831,7 @@ def delete_stream(
 
 
 def stage_partition(
-    stream: Stream, partition_values: Optional[
+    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
 ) -> Partition:
     cur, con = _get_sqlite3_cursor_con(kwargs)
     partition_id = uuid.uuid4().__str__()
@@ -820,19 +893,19 @@ def commit_partition(
         ).all_items()
         or []
     )
+
     partition_deltas: Optional[List[Delta]] = (
         list_partition_deltas(
             partition, ascending_order=False, *args, **kwargs
         ).all_items()
         or []
     )
-
-
-
-
-
-
-    partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+
+    # if previous_partition is passed in, table is in-place compacted and we need to run merge-and-promote
+    if previous_partition:
+        partition_deltas = _merge_and_promote(
+            partition_deltas, previous_partition_deltas
+        )
 
     stream_position = (
         partition_deltas[0].stream_position
@@ -840,13 +913,14 @@ def commit_partition(
         else partition.stream_position
     )
 
-    partition.state = CommitState.COMMITTED
     partition.stream_position = stream_position
+    if partition_deltas:
+        partition.locator = partition_deltas[0].partition_locator
+
+    partition.state = CommitState.COMMITTED
     partition.previous_stream_position = (
         pv_partition.stream_position if pv_partition else None
     )
-    if partition_deltas:
-        partition.locator = partition_deltas[0].partition_locator
     params = (json.dumps(partition), partition.locator.canonical_string())
     cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
     con.commit()
@@ -858,7 +932,7 @@ def delete_partition(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> None:
@@ -875,7 +949,7 @@ def delete_partition(
 
 def get_partition(
     stream_locator: StreamLocator,
-    partition_values: Optional[
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Optional[Partition]:
@@ -916,12 +990,14 @@ def stage_delta(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     delete_parameters: Optional[DeleteParameters] = None,
+    partition_spec: Optional[DeltaPartitionSpec] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Delta:
     cur, con = _get_sqlite3_cursor_con(kwargs)
-
-    uri = _get_manifest_entry_uri(
+    manifest_id = uuid.uuid4().__str__()
+    uri = _get_manifest_entry_uri(manifest_id)
 
     if data is None:
         delta = create_empty_delta(
@@ -929,7 +1005,7 @@ def stage_delta(
             delta_type,
             author,
             properties=properties,
-            manifest_entry_id=
+            manifest_entry_id=manifest_id,
         )
         cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, None))
         params = (delta.locator.canonical_string(), "staged_delta", json.dumps(delta))
@@ -937,6 +1013,12 @@ def stage_delta(
         con.commit()
         return delta
 
+    if partition_spec:
+        assert partition_values is not None, (
+            "partition_values must be provided as local "
+            "storage does not support computing it from input data"
+        )
+
     serialized_data = None
     if content_type == ContentType.PARQUET:
         buffer = io.BytesIO()
@@ -961,18 +1043,19 @@ def stage_delta(
         content_type=content_type,
         content_encoding=ContentEncoding.IDENTITY,
         source_content_length=data.nbytes,
+        partition_values=partition_values,
     )
 
     manifest = Manifest.of(
         entries=ManifestEntryList.of(
             [
                 ManifestEntry.of(
-                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=
+                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=manifest_id
                 )
             ]
         ),
         author=author,
-        uuid=
+        uuid=manifest_id,
     )
 
     delta = Delta.of(
@@ -1162,3 +1245,15 @@ def get_table_version_column_names(
     **kwargs,
 ) -> Optional[List[str]]:
     raise NotImplementedError("Fetching column names is not supported")
+
+
+def can_categorize(e: BaseException, **kwargs) -> bool:
+    if isinstance(e, InvalidNamespaceError):
+        return True
+    else:
+        return False
+
+
+def raise_categorized_error(e: BaseException, **kwargs):
+    if isinstance(e, InvalidNamespaceError):
+        raise LocalStorageValidationError("Namespace provided is invalid!")
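Note: can_categorize and raise_categorized_error, added at the end of the module above, give this pluggable storage implementation a hook into deltacat's error categorization. A hedged sketch of how a caller might consult those hooks is shown below; the function name and calling convention are assumptions for illustration, not the actual deltacat.exceptions code.

def categorize_with_storage(e: BaseException, storage_module) -> None:
    # Ask the storage implementation whether it recognizes the error; if so, let it
    # raise its own categorized equivalent, otherwise surface the original error.
    if storage_module.can_categorize(e):
        storage_module.raise_categorized_error(e)
    raise e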
deltacat/tests/test_exceptions.py
ADDED
@@ -0,0 +1,100 @@
+import unittest
+from deltacat.exceptions import categorize_errors
+import ray
+from deltacat.exceptions import (
+    DependencyPyarrowCapacityError,
+    NonRetryableDownloadTableError,
+    RetryableError,
+    NonRetryableError,
+    DeltaCatTransientError,
+    DependencyDaftTransientError,
+    UnclassifiedDeltaCatError,
+)
+from daft.exceptions import DaftTransientError
+from deltacat.tests.local_deltacat_storage.exceptions import (
+    InvalidNamespaceError,
+    LocalStorageValidationError,
+)
+from botocore.exceptions import NoCredentialsError
+from tenacity import retry, retry_if_exception_type, stop_after_attempt
+
+from pyarrow.lib import ArrowCapacityError
+import deltacat.tests.local_deltacat_storage as ds
+
+
+class MockUnknownException(Exception):
+    pass
+
+
+@categorize_errors
+def mock_raise_exception(exception_to_raise, deltacat_storage=ds):
+    raise exception_to_raise
+
+
+@retry(retry=retry_if_exception_type(NoCredentialsError), stop=stop_after_attempt(2))
+def mock_tenacity_wrapped_method(exception_to_raise):
+    mock_raise_exception(exception_to_raise)
+
+
+@ray.remote
+def mock_remote_task(exception_to_raise):
+    mock_raise_exception(exception_to_raise)
+
+
+class TestCategorizeErrors(unittest.TestCase):
+    def test_pyarrow_exception_categorizer(self):
+        self.assertRaises(
+            DependencyPyarrowCapacityError,
+            lambda: mock_raise_exception(ArrowCapacityError),
+        )
+
+    def test_storage_exception_categorizer(self):
+        self.assertRaises(
+            LocalStorageValidationError,
+            lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
+        )
+
+    def test_non_retryable_error(self):
+        self.assertRaises(
+            NonRetryableError,
+            lambda: mock_raise_exception(NonRetryableDownloadTableError),
+        )
+
+    def test_retryable_error(self):
+        self.assertRaises(RetryableError, lambda: mock_raise_exception(ConnectionError))
+
+    def test_ray_task_returns_wrapped_exception(self):
+        self.assertRaises(
+            DeltaCatTransientError,
+            lambda: ray.get(mock_remote_task.remote(ConnectionError)),
+        )
+
+    def test_daft_transient_error(self):
+        self.assertRaises(
+            DependencyDaftTransientError,
+            lambda: ray.get(mock_remote_task.remote(DaftTransientError)),
+        )
+
+    def test_tenacity_underlying_error_returned(self):
+        self.assertRaises(
+            DeltaCatTransientError,
+            lambda: mock_tenacity_wrapped_method(NoCredentialsError),
+        )
+
+    def test_unclassified_error_when_error_cannot_be_categorized(self):
+        self.assertRaises(
+            UnclassifiedDeltaCatError,
+            lambda: ray.get(mock_remote_task.remote(MockUnknownException)),
+        )
+
+    def test_deltacat_exception_contains_attributes(self):
+
+        try:
+            mock_raise_exception(ConnectionError)
+        except DeltaCatTransientError as e:
+            self.assertTrue(hasattr(e, "is_retryable"))
+            self.assertTrue(hasattr(e, "error_name"))
+            assert e.error_name == "DeltaCatTransientError"
+            return
+
+        self.assertFalse(True)
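Note: the new test module above exercises the categorize_errors decorator from deltacat.exceptions (which grows by roughly 340 lines in this release). A minimal sketch of the decorator shape those tests imply follows; the wrapper class and the single ConnectionError mapping are illustrative assumptions, not the library's implementation.

import functools


class SketchTransientError(Exception):
    # categorized errors expose retryability metadata, per test_deltacat_exception_contains_attributes
    is_retryable = True
    error_name = "SketchTransientError"


def sketch_categorize_errors(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ConnectionError as e:
            # map a known transient failure onto a categorized wrapper, keeping the cause
            raise SketchTransientError(str(e)) from e

    return wrapper


@sketch_categorize_errors
def flaky_call():
    raise ConnectionError("simulated transient network failure")


try:
    flaky_call()
except SketchTransientError as err:
    assert err.is_retryable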
deltacat/tests/test_logs.py
CHANGED
deltacat/tests/test_utils/pyarrow.py
CHANGED
@@ -66,7 +66,10 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 
 def commit_delta_to_partition(
-    partition: Partition,
+    partition: Union[Partition, PartitionLocator],
+    file_paths: List[str],
+    *args,
+    **kwargs,
 ) -> Delta:
     tables = []
 
deltacat/tests/utils/ray_utils/test_dataset.py
ADDED
@@ -0,0 +1,66 @@
+from ray.data import from_items
+from typing import Any
+import pytest
+import fsspec
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+from deltacat.types.media import ContentType
+import ray
+
+
+class TestDatasetToFile:
+
+    BASE_PATH = "/tmp"
+    SUB_PATH = "abcd"
+
+    @pytest.fixture(autouse=True, scope="module")
+    def ensure_ray_down(self):
+        # ray.data fails when ray is instantiated in local mode
+        ray.shutdown()
+
+    @pytest.fixture(scope="module")
+    def mock_dataset(self):
+        return from_items([{"col1": i, "col2": i * 2} for i in range(1000)])
+
+    @pytest.fixture(scope="module")
+    def mock_filename_provider(self):
+        class MockFilenameProvider(FilenameProvider):
+            def get_filename_for_block(
+                self, block: Any, task_index: int, block_index: int
+            ) -> str:
+                return TestDatasetToFile.SUB_PATH
+
+        return MockFilenameProvider()
+
+    def test_parquet_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)
+
+    def test_csv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.CSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)
deltacat/tests/utils/test_daft.py
CHANGED
@@ -1,7 +1,6 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
-
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
 import pyarrow as pa
deltacat/tests/utils/test_resources.py
CHANGED
@@ -1,8 +1,6 @@
 import unittest
 from unittest import mock
 import time
-from multiprocessing import Pool
-import platform
 
 
 class TestGetCurrentClusterUtilization(unittest.TestCase):
@@ -72,29 +70,3 @@ class TestProcessUtilizationOverTimeRange(unittest.TestCase):
         nu.schedule_callback(test_callback, 1)
         time.sleep(3)
         self.assertTrue(nu.test_field_set)
-
-
-class TestTimeoutDecorator(unittest.TestCase):
-    from deltacat.utils.resources import timeout
-
-    @staticmethod
-    @timeout(2)
-    def something_that_runs_xs(x, *args, **kwargs):
-        time.sleep(x)
-
-    def test_timeout(self):
-        if platform.system() != "Windows":
-            self.assertRaises(
-                TimeoutError, lambda: self.something_that_runs_xs(3, test=10)
-            )
-
-    def test_sanity_in_multiprocess(self):
-        if platform.system() != "Windows":
-            # An alarm works per process
-            # https://pubs.opengroup.org/onlinepubs/9699919799/functions/alarm.html
-            with Pool(3) as p:
-                p.map(self.something_that_runs_xs, [1, 1.1, 1.2])
-
-    def test_sanity(self):
-        if platform.system() != "Windows":
-            self.something_that_runs_xs(1, test=10)
deltacat/utils/daft.py
CHANGED
@@ -16,6 +16,7 @@ from deltacat.aws.constants import (
     BOTO_MAX_RETRIES,
     DAFT_MAX_S3_CONNECTIONS_PER_FILE,
     AWS_REGION,
+    DEFAULT_FILE_READ_TIMEOUT_MS,
 )
 from deltacat.utils.performance import timed_invocation
 
@@ -112,6 +113,7 @@ def daft_s3_file_to_table(
     coerce_int96_timestamp_unit = TimeUnit.from_str(
         kwargs.get("coerce_int96_timestamp_unit", "ms")
    )
+    file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)
 
     row_groups = None
     if (
@@ -132,6 +134,7 @@ def daft_s3_file_to_table(
         io_config=io_config,
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
         multithreaded_io=False,
+        file_timeout_ms=file_timeout_ms,
     )
 
     logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
deltacat/utils/numpy.py
CHANGED
@@ -1,10 +1,10 @@
-from typing import List, Optional
+from typing import List, Optional, Callable, Union
 
 import numpy as np
 import pyarrow as pa
 from fsspec import AbstractFileSystem
-from ray.data.datasource import BlockWritePathProvider
 
+from ray.data.datasource import FilenameProvider
 from deltacat.types.media import ContentType
 from deltacat.utils import pandas as pd_utils
 from deltacat.utils import pyarrow as pa_utils
@@ -52,7 +52,7 @@ def ndarray_to_file(
     np_array: np.ndarray,
     path: str,
     file_system: AbstractFileSystem,
-    block_path_provider:
+    block_path_provider: Union[FilenameProvider, Callable],
     content_type: str = ContentType.PARQUET.value,
     **kwargs
 ) -> None:
deltacat/utils/pandas.py
CHANGED
@@ -2,12 +2,12 @@ import csv
 import io
 import logging
 import math
-from typing import Any, Callable, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import pandas as pd
 import pyarrow as pa
 from fsspec import AbstractFileSystem
-from ray.data.datasource import
+from ray.data.datasource import FilenameProvider
 
 from deltacat import logs
 from deltacat.types.media import (
@@ -262,7 +262,7 @@ def dataframe_to_file(
     dataframe: pd.DataFrame,
     base_path: str,
     file_system: AbstractFileSystem,
-    block_path_provider:
+    block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
     **kwargs,
 ) -> None:
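Note: the numpy and pandas utilities above (and, per the file list, deltacat/utils/ray_utils/dataset.py) retype block_path_provider as Union[Callable, FilenameProvider], following Ray's move from BlockWritePathProvider to FilenameProvider. Below is a small provider sketch modeled on the MockFilenameProvider in the new deltacat/tests/utils/ray_utils/test_dataset.py shown earlier; the class name here is illustrative only.

from typing import Any

from ray.data.datasource import FilenameProvider


class FixedNameProvider(FilenameProvider):
    # Names every written block with a single caller-chosen filename; the method
    # signature mirrors MockFilenameProvider from the test module shown above.
    def __init__(self, filename: str):
        self._filename = filename

    def get_filename_for_block(
        self, block: Any, task_index: int, block_index: int
    ) -> str:
        return self._filename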
|