deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +6 -0
  3. deltacat/aws/redshift/model/manifest.py +16 -0
  4. deltacat/aws/s3u.py +65 -38
  5. deltacat/compute/compactor/compaction_session.py +5 -1
  6. deltacat/compute/compactor/model/compact_partition_params.py +12 -1
  7. deltacat/compute/compactor/model/materialize_result.py +0 -4
  8. deltacat/compute/compactor/repartition_session.py +1 -0
  9. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  10. deltacat/compute/compactor_v2/compaction_session.py +26 -16
  11. deltacat/compute/compactor_v2/constants.py +5 -11
  12. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  13. deltacat/compute/compactor_v2/model/merge_input.py +6 -0
  14. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
  15. deltacat/compute/compactor_v2/steps/merge.py +12 -12
  16. deltacat/compute/compactor_v2/utils/merge.py +1 -0
  17. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  18. deltacat/compute/compactor_v2/utils/task_options.py +2 -12
  19. deltacat/exceptions.py +342 -7
  20. deltacat/io/dataset.py +5 -17
  21. deltacat/io/memcached_object_store.py +7 -4
  22. deltacat/storage/__init__.py +24 -0
  23. deltacat/storage/interface.py +56 -6
  24. deltacat/storage/model/delta.py +23 -3
  25. deltacat/storage/model/partition.py +6 -7
  26. deltacat/storage/model/partition_spec.py +71 -0
  27. deltacat/storage/model/stream.py +38 -1
  28. deltacat/storage/model/transform.py +127 -0
  29. deltacat/tests/aws/test_s3u.py +2 -0
  30. deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
  31. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
  32. deltacat/tests/compute/compact_partition_test_cases.py +4 -2
  33. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  34. deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
  35. deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
  36. deltacat/tests/compute/test_util_common.py +19 -4
  37. deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
  38. deltacat/tests/io/test_memcached_object_store.py +5 -2
  39. deltacat/tests/local_deltacat_storage/__init__.py +124 -29
  40. deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
  41. deltacat/tests/test_exceptions.py +100 -0
  42. deltacat/tests/test_logs.py +1 -0
  43. deltacat/tests/test_utils/pyarrow.py +4 -1
  44. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  45. deltacat/tests/utils/test_daft.py +0 -1
  46. deltacat/tests/utils/test_resources.py +0 -28
  47. deltacat/utils/daft.py +3 -0
  48. deltacat/utils/numpy.py +3 -3
  49. deltacat/utils/pandas.py +3 -3
  50. deltacat/utils/pyarrow.py +11 -8
  51. deltacat/utils/ray_utils/dataset.py +7 -7
  52. deltacat/utils/ray_utils/runtime.py +2 -2
  53. deltacat/utils/resources.py +0 -45
  54. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
  55. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
  56. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  57. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  58. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  59. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/tests/local_deltacat_storage/__init__.py CHANGED
@@ -12,6 +12,7 @@ import io
 from deltacat.tests.test_utils.storage import create_empty_delta
 from deltacat.utils.common import current_time_ms
 
+
 from deltacat.storage import (
     Delta,
     DeltaLocator,
@@ -40,6 +41,12 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
     DeleteParameters,
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
+    TransformName,
+    IdentityTransformParameters,
 )
 from deltacat.types.media import (
     ContentType,
@@ -49,6 +56,10 @@ from deltacat.types.media import (
     DistributedDatasetType,
 )
 from deltacat.utils.common import ReadKwargsProvider
+from deltacat.tests.local_deltacat_storage.exceptions import (
+    InvalidNamespaceError,
+    LocalStorageValidationError,
+)
 
 SQLITE_CUR_ARG = "sqlite3_cur"
 SQLITE_CON_ARG = "sqlite3_con"
@@ -97,6 +108,19 @@ def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
     return f"cloudpickle://{manifest_entry_id}"
 
 
+def _merge_and_promote(
+    partition_deltas: List[Delta], previous_partition_deltas: List[Delta]
+):
+    previous_partition_deltas_spos_gt: List[Delta] = [
+        delta
+        for delta in previous_partition_deltas
+        if delta.stream_position > partition_deltas[0].stream_position
+    ]
+    # handle the case if the previous partition deltas have a greater stream position than the partition_delta
+    partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+    return partition_deltas
+
+
 def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
     res = cur.execute("SELECT * FROM namespaces")
@@ -176,12 +200,13 @@ def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partit
 def list_deltas(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     first_stream_position: Optional[int] = None,
     last_stream_position: Optional[int] = None,
     ascending_order: Optional[bool] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> ListResult[Delta]:
@@ -189,6 +214,13 @@ def list_deltas(
     if stream is None:
         return ListResult.of([], None, None)
 
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
 
     all_deltas = list_partition_deltas(
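
Note: list_deltas, get_delta, get_latest_delta, and download_delta all gain the same partition_filter argument in this release, and each raises a ValueError if both partition_values and partition_filter are supplied. A hedged usage sketch follows; PartitionFilter.of(...) is assumed here, since the diff only shows that these functions read partition_filter.partition_values:

    import sqlite3
    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.storage import PartitionFilter

    # Local DeltaCAT storage is backed by sqlite3; the cursor and connection are
    # passed through kwargs (SQLITE_CUR_ARG / SQLITE_CON_ARG above).
    con = sqlite3.connect(":memory:")
    cur = con.cursor()

    # Assumed constructor: build a filter that targets a single partition.
    partition_filter = PartitionFilter.of(partition_values=["2024-01-01"])

    deltas = ds.list_deltas(
        namespace="my_namespace",            # hypothetical namespace
        table_name="my_table",               # hypothetical table
        partition_filter=partition_filter,   # passing partition_values too raises ValueError
        sqlite3_cur=cur,
        sqlite3_con=con,
    ).all_items()
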
@@ -279,15 +311,25 @@ def get_delta(
     namespace: str,
     table_name: str,
     stream_position: int,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
     stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
+
+    if partition_values is not None and partition_filter is not None:
+        raise ValueError(
+            "Only one of partition_values or partition_filter must be provided"
+        )
+
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
     delta_locator = DeltaLocator.of(partition.locator, stream_position)
 
@@ -310,22 +352,24 @@ def get_delta(
 def get_latest_delta(
     namespace: str,
     table_name: str,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     table_version: Optional[str] = None,
     include_manifest: bool = False,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Optional[Delta]:
 
     deltas = list_deltas(
-        namespace,
-        table_name,
-        partition_values,
-        table_version,
-        None,
-        None,
-        False,
-        include_manifest,
+        namespace=namespace,
+        table_name=table_name,
+        partition_values=partition_values,
+        table_version=table_version,
+        first_stream_position=None,
+        last_stream_position=None,
+        ascending_order=False,
+        include_manifest=include_manifest,
+        partition_filter=partition_filter,
         *args,
         **kwargs,
     ).all_items()
@@ -345,13 +389,24 @@ def download_delta(
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
     distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
+    partition_filter: Optional[PartitionFilter] = None,
     *args,
     **kwargs,
 ) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     result = []
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
 
+    partition_values: PartitionValues = None
+    if partition_filter is not None:
+        partition_values = partition_filter.partition_values
+
     for entry_index in range(len(manifest.entries)):
+        if (
+            partition_values is not None
+            and partition_values != manifest.entries[entry_index].meta.partition_values
+        ):
+            continue
+
         result.append(
             download_delta_manifest_entry(
                 delta_like=delta_like,
@@ -506,11 +561,29 @@ def create_table_version(
     table_description: Optional[str] = None,
     table_properties: Optional[Dict[str, str]] = None,
     supported_content_types: Optional[List[ContentType]] = None,
+    partition_spec: Optional[StreamPartitionSpec] = None,
     *args,
     **kwargs,
 ) -> Stream:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
+    if partition_keys is not None and partition_spec is not None:
+        raise ValueError(
+            "Only one of partition_keys or partition_spec must be provided"
+        )
+    if partition_spec is not None:
+        assert (
+            partition_spec.ordered_transforms is not None
+        ), "Ordered transforms must be specified when partition_spec is specified"
+        partition_keys = []
+        for transform in partition_spec.ordered_transforms:
+            assert transform.name == TransformName.IDENTITY, (
+                "Local DeltaCAT storage does not support creating table versions "
+                "with non identity transform partition spec"
+            )
+            transform_params: IdentityTransformParameters = transform.parameters
+            partition_keys.append(transform_params.column_name)
+
     latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
     if (
         table_version is not None
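
The partition_spec branch above only accepts identity transforms and flattens them back into plain partition keys. A minimal, self-contained sketch of that mapping using stand-in objects (the real StreamPartitionSpec and transform classes live under deltacat/storage/model/ and are not shown in this diff):

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class IdentityTransformStub:  # stand-in for deltacat's identity transform
        column_name: str
        name: str = "identity"

    def spec_to_partition_keys(ordered_transforms: List[IdentityTransformStub]) -> List[str]:
        # Mirrors create_table_version: reject anything that is not an identity
        # transform, otherwise collect the source column names as partition keys.
        keys = []
        for transform in ordered_transforms:
            assert transform.name == "identity", "non-identity transforms unsupported"
            keys.append(transform.column_name)
        return keys

    print(spec_to_partition_keys([IdentityTransformStub("region"), IdentityTransformStub("day")]))
    # -> ['region', 'day']
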
@@ -758,7 +831,7 @@ def delete_stream(
 
 
 def stage_partition(
-    stream: Stream, partition_values: Optional[List[Any]] = None, *args, **kwargs
+    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
 ) -> Partition:
     cur, con = _get_sqlite3_cursor_con(kwargs)
     partition_id = uuid.uuid4().__str__()
@@ -820,19 +893,19 @@ def commit_partition(
         ).all_items()
         or []
     )
+
     partition_deltas: Optional[List[Delta]] = (
         list_partition_deltas(
             partition, ascending_order=False, *args, **kwargs
         ).all_items()
         or []
     )
-    previous_partition_deltas_spos_gt: List[Delta] = [
-        delta
-        for delta in previous_partition_deltas
-        if delta.stream_position > partition_deltas[0].stream_position
-    ]
-    # handle the case if the previous partition deltas have a greater stream position than the partition_delta
-    partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
+
+    # if previous_partition is passed in, table is in-place compacted and we need to run merge-and-promote
+    if previous_partition:
+        partition_deltas = _merge_and_promote(
+            partition_deltas, previous_partition_deltas
+        )
 
     stream_position = (
         partition_deltas[0].stream_position
@@ -840,13 +913,14 @@ def commit_partition(
         else partition.stream_position
     )
 
-    partition.state = CommitState.COMMITTED
     partition.stream_position = stream_position
+    if partition_deltas:
+        partition.locator = partition_deltas[0].partition_locator
+
+    partition.state = CommitState.COMMITTED
     partition.previous_stream_position = (
         pv_partition.stream_position if pv_partition else None
     )
-    if partition_deltas:
-        partition.locator = partition_deltas[0].partition_locator
     params = (json.dumps(partition), partition.locator.canonical_string())
     cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
     con.commit()
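
For reference, the _merge_and_promote helper introduced near the top of this file keeps only those deltas from the previous (replaced) partition whose stream positions are newer than the compacted partition's latest delta, then prepends them. A runnable toy illustration with a stand-in delta type (deltas are listed newest-first, matching ascending_order=False above):

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class DeltaStub:  # stand-in for deltacat.storage.Delta
        stream_position: int

    def merge_and_promote(partition_deltas: List[DeltaStub],
                          previous_partition_deltas: List[DeltaStub]) -> List[DeltaStub]:
        # Same shape as _merge_and_promote: previous deltas newer than the compacted
        # partition's head delta are promoted ahead of it.
        newer = [
            d for d in previous_partition_deltas
            if d.stream_position > partition_deltas[0].stream_position
        ]
        return newer + partition_deltas

    compacted = [DeltaStub(5), DeltaStub(3)]               # compacted partition, newest first
    previous = [DeltaStub(7), DeltaStub(6), DeltaStub(4)]  # previous partition kept writing
    print([d.stream_position for d in merge_and_promote(compacted, previous)])
    # -> [7, 6, 5, 3]
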
@@ -858,7 +932,7 @@ def delete_partition(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> None:
@@ -875,7 +949,7 @@ def delete_partition(
 
 
 def get_partition(
     stream_locator: StreamLocator,
-    partition_values: Optional[List[Any]] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Optional[Partition]:
@@ -916,12 +990,14 @@ def stage_delta(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     content_type: ContentType = ContentType.PARQUET,
     delete_parameters: Optional[DeleteParameters] = None,
+    partition_spec: Optional[DeltaPartitionSpec] = None,
+    partition_values: Optional[PartitionValues] = None,
     *args,
     **kwargs,
 ) -> Delta:
     cur, con = _get_sqlite3_cursor_con(kwargs)
-    manifest_entry_id = uuid.uuid4().__str__()
-    uri = _get_manifest_entry_uri(manifest_entry_id)
+    manifest_id = uuid.uuid4().__str__()
+    uri = _get_manifest_entry_uri(manifest_id)
 
     if data is None:
         delta = create_empty_delta(
@@ -929,7 +1005,7 @@ def stage_delta(
             delta_type,
             author,
             properties=properties,
-            manifest_entry_id=manifest_entry_id,
+            manifest_entry_id=manifest_id,
         )
         cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, None))
         params = (delta.locator.canonical_string(), "staged_delta", json.dumps(delta))
@@ -937,6 +1013,12 @@ def stage_delta(
         con.commit()
         return delta
 
+    if partition_spec:
+        assert partition_values is not None, (
+            "partition_values must be provided as local "
+            "storage does not support computing it from input data"
+        )
+
     serialized_data = None
     if content_type == ContentType.PARQUET:
         buffer = io.BytesIO()
@@ -961,18 +1043,19 @@ def stage_delta(
         content_type=content_type,
         content_encoding=ContentEncoding.IDENTITY,
         source_content_length=data.nbytes,
+        partition_values=partition_values,
     )
 
     manifest = Manifest.of(
         entries=ManifestEntryList.of(
             [
                 ManifestEntry.of(
-                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=manifest_entry_id
+                    uri=uri, url=uri, meta=meta, mandatory=True, uuid=manifest_id
                 )
             ]
         ),
         author=author,
-        uuid=manifest_entry_id,
+        uuid=manifest_id,
     )
 
     delta = Delta.of(
@@ -1162,3 +1245,15 @@ def get_table_version_column_names(
     **kwargs,
 ) -> Optional[List[str]]:
     raise NotImplementedError("Fetching column names is not supported")
+
+
+def can_categorize(e: BaseException, **kwargs) -> bool:
+    if isinstance(e, InvalidNamespaceError):
+        return True
+    else:
+        return False
+
+
+def raise_categorized_error(e: BaseException, **kwargs):
+    if isinstance(e, InvalidNamespaceError):
+        raise LocalStorageValidationError("Namespace provided is invalid!")
deltacat/tests/local_deltacat_storage/exceptions.py ADDED
@@ -0,0 +1,10 @@
+class InvalidNamespaceError(Exception):
+    error_name = "InvalidNamespaceError"
+
+
+class LocalStorageValidationError(Exception):
+    error_name = "LocalStorageValidationError"
+
+
+class LocalStorageError(Exception):
+    error_name = "LocalStorageError"
deltacat/tests/test_exceptions.py ADDED
@@ -0,0 +1,100 @@
+import unittest
+from deltacat.exceptions import categorize_errors
+import ray
+from deltacat.exceptions import (
+    DependencyPyarrowCapacityError,
+    NonRetryableDownloadTableError,
+    RetryableError,
+    NonRetryableError,
+    DeltaCatTransientError,
+    DependencyDaftTransientError,
+    UnclassifiedDeltaCatError,
+)
+from daft.exceptions import DaftTransientError
+from deltacat.tests.local_deltacat_storage.exceptions import (
+    InvalidNamespaceError,
+    LocalStorageValidationError,
+)
+from botocore.exceptions import NoCredentialsError
+from tenacity import retry, retry_if_exception_type, stop_after_attempt
+
+from pyarrow.lib import ArrowCapacityError
+import deltacat.tests.local_deltacat_storage as ds
+
+
+class MockUnknownException(Exception):
+    pass
+
+
+@categorize_errors
+def mock_raise_exception(exception_to_raise, deltacat_storage=ds):
+    raise exception_to_raise
+
+
+@retry(retry=retry_if_exception_type(NoCredentialsError), stop=stop_after_attempt(2))
+def mock_tenacity_wrapped_method(exception_to_raise):
+    mock_raise_exception(exception_to_raise)
+
+
+@ray.remote
+def mock_remote_task(exception_to_raise):
+    mock_raise_exception(exception_to_raise)
+
+
+class TestCategorizeErrors(unittest.TestCase):
+    def test_pyarrow_exception_categorizer(self):
+        self.assertRaises(
+            DependencyPyarrowCapacityError,
+            lambda: mock_raise_exception(ArrowCapacityError),
+        )
+
+    def test_storage_exception_categorizer(self):
+        self.assertRaises(
+            LocalStorageValidationError,
+            lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
+        )
+
+    def test_non_retryable_error(self):
+        self.assertRaises(
+            NonRetryableError,
+            lambda: mock_raise_exception(NonRetryableDownloadTableError),
+        )
+
+    def test_retryable_error(self):
+        self.assertRaises(RetryableError, lambda: mock_raise_exception(ConnectionError))
+
+    def test_ray_task_returns_wrapped_exception(self):
+        self.assertRaises(
+            DeltaCatTransientError,
+            lambda: ray.get(mock_remote_task.remote(ConnectionError)),
+        )
+
+    def test_daft_transient_error(self):
+        self.assertRaises(
+            DependencyDaftTransientError,
+            lambda: ray.get(mock_remote_task.remote(DaftTransientError)),
+        )
+
+    def test_tenacity_underlying_error_returned(self):
+        self.assertRaises(
+            DeltaCatTransientError,
+            lambda: mock_tenacity_wrapped_method(NoCredentialsError),
+        )
+
+    def test_unclassified_error_when_error_cannot_be_categorized(self):
+        self.assertRaises(
+            UnclassifiedDeltaCatError,
+            lambda: ray.get(mock_remote_task.remote(MockUnknownException)),
+        )
+
+    def test_deltacat_exception_contains_attributes(self):
+
+        try:
+            mock_raise_exception(ConnectionError)
+        except DeltaCatTransientError as e:
+            self.assertTrue(hasattr(e, "is_retryable"))
+            self.assertTrue(hasattr(e, "error_name"))
+            assert e.error_name == "DeltaCatTransientError"
+            return
+
+        self.assertFalse(True)
deltacat/tests/test_logs.py CHANGED
@@ -38,6 +38,7 @@ class TestJsonFormatter(unittest.TestCase):
         self.assertEqual({"message": "test_message"}, result)
 
     def test_format_sanity(self):
+        ray.shutdown()
        formatter = JsonFormatter({"message": "msg"})
 
         record = LogRecord(
deltacat/tests/test_utils/pyarrow.py CHANGED
@@ -66,7 +66,10 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 
 def commit_delta_to_partition(
-    partition: Partition, file_paths: List[str], *args, **kwargs
+    partition: Union[Partition, PartitionLocator],
+    file_paths: List[str],
+    *args,
+    **kwargs,
 ) -> Delta:
     tables = []
 
deltacat/tests/utils/ray_utils/test_dataset.py ADDED
@@ -0,0 +1,66 @@
+from ray.data import from_items
+from typing import Any
+import pytest
+import fsspec
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+from deltacat.types.media import ContentType
+import ray
+
+
+class TestDatasetToFile:
+
+    BASE_PATH = "/tmp"
+    SUB_PATH = "abcd"
+
+    @pytest.fixture(autouse=True, scope="module")
+    def ensure_ray_down(self):
+        # ray.data fails when ray is instantiated in local mode
+        ray.shutdown()
+
+    @pytest.fixture(scope="module")
+    def mock_dataset(self):
+        return from_items([{"col1": i, "col2": i * 2} for i in range(1000)])
+
+    @pytest.fixture(scope="module")
+    def mock_filename_provider(self):
+        class MockFilenameProvider(FilenameProvider):
+            def get_filename_for_block(
+                self, block: Any, task_index: int, block_index: int
+            ) -> str:
+                return TestDatasetToFile.SUB_PATH
+
+        return MockFilenameProvider()
+
+    def test_parquet_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)
+
+    def test_csv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("local")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            file_system=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.CSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+        fs.delete(file_expected_at)
deltacat/tests/utils/test_daft.py CHANGED
@@ -1,7 +1,6 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
-
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
 import pyarrow as pa
deltacat/tests/utils/test_resources.py CHANGED
@@ -1,8 +1,6 @@
 import unittest
 from unittest import mock
 import time
-from multiprocessing import Pool
-import platform
 
 
 class TestGetCurrentClusterUtilization(unittest.TestCase):
@@ -72,29 +70,3 @@ class TestProcessUtilizationOverTimeRange(unittest.TestCase):
         nu.schedule_callback(test_callback, 1)
         time.sleep(3)
         self.assertTrue(nu.test_field_set)
-
-
-class TestTimeoutDecorator(unittest.TestCase):
-    from deltacat.utils.resources import timeout
-
-    @staticmethod
-    @timeout(2)
-    def something_that_runs_xs(x, *args, **kwargs):
-        time.sleep(x)
-
-    def test_timeout(self):
-        if platform.system() != "Windows":
-            self.assertRaises(
-                TimeoutError, lambda: self.something_that_runs_xs(3, test=10)
-            )
-
-    def test_sanity_in_multiprocess(self):
-        if platform.system() != "Windows":
-            # An alarm works per process
-            # https://pubs.opengroup.org/onlinepubs/9699919799/functions/alarm.html
-            with Pool(3) as p:
-                p.map(self.something_that_runs_xs, [1, 1.1, 1.2])
-
-    def test_sanity(self):
-        if platform.system() != "Windows":
-            self.something_that_runs_xs(1, test=10)
deltacat/utils/daft.py CHANGED
@@ -16,6 +16,7 @@ from deltacat.aws.constants import (
     BOTO_MAX_RETRIES,
     DAFT_MAX_S3_CONNECTIONS_PER_FILE,
     AWS_REGION,
+    DEFAULT_FILE_READ_TIMEOUT_MS,
 )
 from deltacat.utils.performance import timed_invocation
 
@@ -112,6 +113,7 @@ def daft_s3_file_to_table(
     coerce_int96_timestamp_unit = TimeUnit.from_str(
         kwargs.get("coerce_int96_timestamp_unit", "ms")
     )
+    file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)
 
     row_groups = None
     if (
@@ -132,6 +134,7 @@ def daft_s3_file_to_table(
         io_config=io_config,
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
         multithreaded_io=False,
+        file_timeout_ms=file_timeout_ms,
     )
 
     logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
deltacat/utils/numpy.py CHANGED
@@ -1,10 +1,10 @@
-from typing import List, Optional
+from typing import List, Optional, Callable, Union
 
 import numpy as np
 import pyarrow as pa
 from fsspec import AbstractFileSystem
-from ray.data.datasource import BlockWritePathProvider
 
+from ray.data.datasource import FilenameProvider
 from deltacat.types.media import ContentType
 from deltacat.utils import pandas as pd_utils
 from deltacat.utils import pyarrow as pa_utils
@@ -52,7 +52,7 @@ def ndarray_to_file(
     np_array: np.ndarray,
     path: str,
     file_system: AbstractFileSystem,
-    block_path_provider: BlockWritePathProvider,
+    block_path_provider: Union[FilenameProvider, Callable],
     content_type: str = ContentType.PARQUET.value,
     **kwargs
 ) -> None:
deltacat/utils/pandas.py CHANGED
@@ -2,12 +2,12 @@ import csv
 import io
 import logging
 import math
-from typing import Any, Callable, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import pandas as pd
 import pyarrow as pa
 from fsspec import AbstractFileSystem
-from ray.data.datasource import BlockWritePathProvider
+from ray.data.datasource import FilenameProvider
 
 from deltacat import logs
 from deltacat.types.media import (
@@ -262,7 +262,7 @@ def dataframe_to_file(
     dataframe: pd.DataFrame,
     base_path: str,
     file_system: AbstractFileSystem,
-    block_path_provider: BlockWritePathProvider,
+    block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
     **kwargs,
 ) -> None:
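
The numpy, pandas, and ray_utils.dataset writers now type block_path_provider as either Ray's FilenameProvider or a plain callable, replacing BlockWritePathProvider; the new deltacat/tests/utils/ray_utils/test_dataset.py above exercises the FilenameProvider path end to end. A minimal provider along the lines of the one in that test (the callable form's exact signature is an assumption here, not shown in this diff):

    from typing import Any
    from ray.data.datasource import FilenameProvider

    class IndexedFilenameProvider(FilenameProvider):
        # Names each output file after its task and block index, mirroring the
        # MockFilenameProvider pattern in the new test.
        def get_filename_for_block(self, block: Any, task_index: int, block_index: int) -> str:
            return f"block_{task_index}_{block_index}"

    # Either of these can now be passed as block_path_provider:
    provider = IndexedFilenameProvider()
    legacy_callable = lambda base_path: f"{base_path}/block"  # assumed callable signature
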