deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +6 -0
  3. deltacat/aws/redshift/model/manifest.py +16 -0
  4. deltacat/aws/s3u.py +65 -38
  5. deltacat/compute/compactor/compaction_session.py +5 -1
  6. deltacat/compute/compactor/model/compact_partition_params.py +12 -1
  7. deltacat/compute/compactor/model/materialize_result.py +0 -4
  8. deltacat/compute/compactor/repartition_session.py +1 -0
  9. deltacat/compute/compactor/utils/round_completion_file.py +39 -9
  10. deltacat/compute/compactor_v2/compaction_session.py +26 -16
  11. deltacat/compute/compactor_v2/constants.py +5 -11
  12. deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
  13. deltacat/compute/compactor_v2/model/merge_input.py +6 -0
  14. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
  15. deltacat/compute/compactor_v2/steps/merge.py +12 -12
  16. deltacat/compute/compactor_v2/utils/merge.py +1 -0
  17. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  18. deltacat/compute/compactor_v2/utils/task_options.py +2 -12
  19. deltacat/exceptions.py +342 -7
  20. deltacat/io/dataset.py +5 -17
  21. deltacat/io/memcached_object_store.py +7 -4
  22. deltacat/storage/__init__.py +24 -0
  23. deltacat/storage/interface.py +56 -6
  24. deltacat/storage/model/delta.py +23 -3
  25. deltacat/storage/model/partition.py +6 -7
  26. deltacat/storage/model/partition_spec.py +71 -0
  27. deltacat/storage/model/stream.py +38 -1
  28. deltacat/storage/model/transform.py +127 -0
  29. deltacat/tests/aws/test_s3u.py +2 -0
  30. deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
  31. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
  32. deltacat/tests/compute/compact_partition_test_cases.py +4 -2
  33. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
  34. deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
  35. deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
  36. deltacat/tests/compute/test_util_common.py +19 -4
  37. deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
  38. deltacat/tests/io/test_memcached_object_store.py +5 -2
  39. deltacat/tests/local_deltacat_storage/__init__.py +124 -29
  40. deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
  41. deltacat/tests/test_exceptions.py +100 -0
  42. deltacat/tests/test_logs.py +1 -0
  43. deltacat/tests/test_utils/pyarrow.py +4 -1
  44. deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
  45. deltacat/tests/utils/test_daft.py +0 -1
  46. deltacat/tests/utils/test_resources.py +0 -28
  47. deltacat/utils/daft.py +3 -0
  48. deltacat/utils/numpy.py +3 -3
  49. deltacat/utils/pandas.py +3 -3
  50. deltacat/utils/pyarrow.py +11 -8
  51. deltacat/utils/ray_utils/dataset.py +7 -7
  52. deltacat/utils/ray_utils/runtime.py +2 -2
  53. deltacat/utils/resources.py +0 -45
  54. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
  55. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
  56. deltacat/io/aws/redshift/redshift_datasource.py +0 -578
  57. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
  58. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
  59. {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/model/merge_input.py CHANGED
@@ -47,6 +47,7 @@ class MergeInput(Dict):
  deltacat_storage=unimplemented_deltacat_storage,
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  memory_logs_enabled: Optional[bool] = None,
+ disable_copy_by_reference: Optional[bool] = None,
  ) -> MergeInput:

  result = MergeInput()
@@ -69,6 +70,7 @@ class MergeInput(Dict):
  result["deltacat_storage"] = deltacat_storage
  result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
  result["memory_logs_enabled"] = memory_logs_enabled
+ result["disable_copy_by_reference"] = disable_copy_by_reference
  return result

  @property
@@ -148,3 +150,7 @@ class MergeInput(Dict):
  @property
  def delete_strategy(self) -> Optional[DeltaCatStorage]:
  return self.get("delete_strategy")
+
+ @property
+ def disable_copy_by_reference(self) -> bool:
+ return self["disable_copy_by_reference"]
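For orientation, MergeInput is a plain Dict subclass, so the new disable_copy_by_reference option travels as an ordinary key with a property accessor, exactly as the hunk above shows. A minimal standalone sketch of that pattern (ExampleInput and its of() builder are illustrative names, not DeltaCAT APIs):

```python
from typing import Optional


class ExampleInput(dict):
    """Minimal sketch of the Dict-backed parameter-object pattern used by MergeInput."""

    @staticmethod
    def of(disable_copy_by_reference: Optional[bool] = None) -> "ExampleInput":
        result = ExampleInput()
        result["disable_copy_by_reference"] = disable_copy_by_reference
        return result

    @property
    def disable_copy_by_reference(self) -> bool:
        return self["disable_copy_by_reference"]


print(ExampleInput.of(disable_copy_by_reference=True).disable_copy_by_reference)  # True
```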
deltacat/compute/compactor_v2/steps/hash_bucket.py CHANGED
@@ -29,14 +29,15 @@ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_m
  from deltacat.utils.resources import (
  get_current_process_peak_memory_usage_in_bytes,
  ProcessUtilizationOverTimeRange,
- timeout,
  )
  from deltacat.constants import BYTES_PER_GIBIBYTE
  from deltacat.compute.compactor_v2.constants import (
  HASH_BUCKET_TIME_IN_SECONDS,
  HASH_BUCKET_FAILURE_COUNT,
  HASH_BUCKET_SUCCESS_COUNT,
- HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS,
+ )
+ from deltacat.exceptions import (
+ categorize_errors,
  )

  if importlib.util.find_spec("memray"):
@@ -79,7 +80,7 @@ def _group_file_records_by_pk_hash_bucket(
  logger.info("Grouping by pk hash bucket")
  group_start = time.monotonic()
  hash_bucket_to_table = group_by_pk_hash_bucket(
- dfe.table, num_hash_buckets, primary_keys
+ table=dfe.table, num_buckets=num_hash_buckets, primary_keys=primary_keys
  )
  group_end = time.monotonic()
  logger.info(f"Grouping took: {group_end - group_start}")
@@ -98,12 +99,9 @@ def _group_file_records_by_pk_hash_bucket(
  return hb_to_delta_file_envelopes, total_record_count, total_size_bytes


- # TODO: use timeout parameter in ray.remote
- # https://github.com/ray-project/ray/issues/18916
- # Note: order of decorators is important
  @success_metric(name=HASH_BUCKET_SUCCESS_COUNT)
  @failure_metric(name=HASH_BUCKET_FAILURE_COUNT)
- @timeout(HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS)
+ @categorize_errors
  def _timed_hash_bucket(input: HashBucketInput):
  task_id = get_current_ray_task_id()
  worker_id = get_current_ray_worker_id()
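In the new decorator stack, @categorize_errors is applied closest to the task body, so exceptions are normalized into DeltaCAT error types before the failure/success metric decorators observe them. A generic illustration of that evaluation order (the decorator names below are placeholders, not DeltaCAT APIs):

```python
import functools


def outer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Applied last, so it only ever sees what the inner layers raise or return.
        return func(*args, **kwargs)
    return wrapper


def inner(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Applied first (closest to the body); normalizes errors before they propagate.
        try:
            return func(*args, **kwargs)
        except ValueError as e:
            raise RuntimeError("normalized") from e
    return wrapper


@outer
@inner
def task():
    raise ValueError("raw failure")


try:
    task()
except RuntimeError as e:
    print(e)  # "normalized" -- the outer decorator never sees the raw ValueError
```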
deltacat/compute/compactor_v2/steps/merge.py CHANGED
@@ -28,7 +28,6 @@ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_m
  from deltacat.utils.resources import (
  get_current_process_peak_memory_usage_in_bytes,
  ProcessUtilizationOverTimeRange,
- timeout,
  )
  from deltacat.compute.compactor_v2.utils.primary_key_index import (
  generate_pk_hash_column,
@@ -47,9 +46,10 @@ from deltacat.compute.compactor_v2.constants import (
  MERGE_TIME_IN_SECONDS,
  MERGE_SUCCESS_COUNT,
  MERGE_FAILURE_COUNT,
- MERGE_TASK_TIMEOUT_IN_SECONDS,
  )
-
+ from deltacat.exceptions import (
+ categorize_errors,
+ )

  if importlib.util.find_spec("memray"):
  import memray
@@ -284,16 +284,19 @@ def _can_copy_by_reference(
  Can copy by reference only if there are no deletes to merge in
  and previous compacted stream id matches that of new stream
  """
- return (
+ copy_by_ref = (
  not has_delete
  and not merge_file_group.dfe_groups
  and input.round_completion_info is not None
- and (
- input.write_to_partition.stream_id
- == input.round_completion_info.compacted_delta_locator.stream_id
- )
  )

+ if input.disable_copy_by_reference:
+ copy_by_ref = False
+
+ logger.info(f"Copy by reference is {copy_by_ref} for {merge_file_group.hb_index}")
+
+ return copy_by_ref
+

  def _flatten_dfe_list(
  df_envelopes_list: List[List[DeltaFileEnvelope]],
@@ -486,12 +489,9 @@ def _copy_manifests_from_hash_bucketing(
  return materialized_results


- # TODO: use timeout parameter in ray.remote
- # https://github.com/ray-project/ray/issues/18916
- # Note: order of decorators is important
  @success_metric(name=MERGE_SUCCESS_COUNT)
  @failure_metric(name=MERGE_FAILURE_COUNT)
- @timeout(MERGE_TASK_TIMEOUT_IN_SECONDS)
+ @categorize_errors
  def _timed_merge(input: MergeInput) -> MergeResult:
  task_id = get_current_ray_task_id()
  worker_id = get_current_ray_worker_id()
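Taken together, the reworked _can_copy_by_reference only reuses previously compacted files when there are no deletes to merge, the hash bucket has no new delta file envelope groups, a prior round completion exists, and the caller has not set disable_copy_by_reference. A self-contained restatement of that gate (the function and argument names below are illustrative, not the DeltaCAT signature):

```python
from typing import Any, List, Optional


def can_copy_by_reference(
    has_delete: bool,
    dfe_groups: Optional[List[Any]],
    round_completion_info: Optional[Any],
    disable_copy_by_reference: Optional[bool],
) -> bool:
    # Mirrors the gate added in merge.py: untouched hash buckets may reference
    # previously compacted files instead of rewriting them.
    copy_by_ref = (
        not has_delete
        and not dfe_groups
        and round_completion_info is not None
    )
    if disable_copy_by_reference:
        copy_by_ref = False
    return copy_by_ref


assert can_copy_by_reference(False, None, object(), None) is True
assert can_copy_by_reference(False, None, object(), True) is False
assert can_copy_by_reference(True, None, object(), None) is False
```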
deltacat/compute/compactor_v2/utils/merge.py CHANGED
@@ -133,4 +133,5 @@ def generate_local_merge_input(
  deltacat_storage_kwargs=params.deltacat_storage_kwargs,
  delete_strategy=delete_strategy,
  delete_file_envelopes=delete_file_envelopes,
+ disable_copy_by_reference=params.disable_copy_by_reference,
  )
deltacat/compute/compactor_v2/utils/primary_key_index.py CHANGED
@@ -157,7 +157,12 @@ def _optimized_group_record_batches_by_hash_bucket(
  def group_by_pk_hash_bucket(
  table: pa.Table, num_buckets: int, primary_keys: List[str]
  ) -> np.ndarray:
- table = generate_pk_hash_column([table], primary_keys, requires_sha1=True)[0]
+ new_tables = generate_pk_hash_column([table], primary_keys, requires_hash=True)
+ assert (
+ len(new_tables) == 1
+ ), f"Expected only 1 table in the result but found {len(new_tables)}"
+
+ table = generate_pk_hash_column([table], primary_keys, requires_hash=True)[0]

  # group hash bucket record indices
  result = group_record_indices_by_hash_bucket(
@@ -171,7 +176,7 @@ def group_by_pk_hash_bucket(
  def generate_pk_hash_column(
  tables: List[pa.Table],
  primary_keys: Optional[List[str]] = None,
- requires_sha1: bool = False,
+ requires_hash: bool = False,
  ) -> List[pa.Table]:
  """
  Returns a new table list after generating the primary key hash if desired.
@@ -203,12 +208,12 @@
  if primary_keys:
  hash_column_list = [_generate_pk_hash(table) for table in tables]

- can_sha1 = requires_sha1 or _is_sha1_desired(hash_column_list)
+ can_sha1 = requires_hash or _is_sha1_desired(hash_column_list)
  else:
  hash_column_list = [_generate_uuid(table) for table in tables]

  logger.info(
- f"can_generate_sha1={can_sha1} for the table and requires_sha1={requires_sha1}"
+ f"can_generate_sha1={can_sha1} for the table and requires_sha1={requires_hash}"
  )

  result = []
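For reference, the keyword-argument call style adopted in hash_bucket.py matches the signature shown in this hunk. A hedged usage sketch (assumes deltacat is installed and that group_by_pk_hash_bucket lives in deltacat.compute.compactor_v2.utils.primary_key_index, per the file list above):

```python
import pyarrow as pa

from deltacat.compute.compactor_v2.utils.primary_key_index import (
    group_by_pk_hash_bucket,
)

# A tiny table with a single primary key column, purely for illustration.
table = pa.table({"pk": ["a", "b", "c", "d"], "val": [1, 2, 3, 4]})

# Keyword arguments mirror the call style adopted above.
buckets = group_by_pk_hash_bucket(table=table, num_buckets=2, primary_keys=["pk"])
# Per the signature above, buckets is a numpy ndarray of bucketed records.
```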
deltacat/compute/compactor_v2/utils/task_options.py CHANGED
@@ -1,6 +1,4 @@
- import botocore
  import logging
- import tenacity
  from typing import Dict, Optional, List, Tuple, Any
  from deltacat import logs
  from deltacat.compute.compactor_v2.model.merge_file_group import (
@@ -21,8 +19,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
  from deltacat.compute.compactor_v2.constants import (
  PARQUET_TO_PYARROW_INFLATION,
  )
- from daft.exceptions import DaftTransientError
-
+ from deltacat.exceptions import RetryableError

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -79,14 +76,7 @@ def get_task_options(

  # List of possible botocore exceptions are available at
  # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
- task_opts["retry_exceptions"] = [
- botocore.exceptions.ConnectionError,
- botocore.exceptions.HTTPClientError,
- ConnectionError,
- TimeoutError,
- DaftTransientError,
- tenacity.RetryError,
- ]
+ task_opts["retry_exceptions"] = [RetryableError]

  return task_opts
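Collapsing the retry list to the single RetryableError base class works because Ray's retry_exceptions task option accepts a list of exception types and matches raised errors against them, so any error DeltaCAT categorizes as a RetryableError subclass becomes eligible for automatic task retries. A hedged sketch of how a task could opt in (illustrative, not DeltaCAT code):

```python
import ray

from deltacat.exceptions import RetryableError


# Illustrative: a task whose failure is (or derives from) RetryableError
# will be retried by Ray up to max_retries times.
@ray.remote(max_retries=3, retry_exceptions=[RetryableError])
def merge_step() -> str:
    return "ok"


# The same option can also be supplied at call time:
# merge_step.options(retry_exceptions=[RetryableError]).remote()
```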
deltacat/exceptions.py CHANGED
@@ -1,14 +1,349 @@
- class RetryableError(Exception):
- pass
+ from __future__ import annotations
+ from enum import Enum
+ import botocore
+ import ray
+ import logging
+ import tenacity
+ from deltacat import logs
+ from ray.exceptions import (
+ RayError,
+ RayTaskError,
+ RuntimeEnvSetupError,
+ WorkerCrashedError,
+ NodeDiedError,
+ OutOfMemoryError,
+ )
+ from deltacat.storage import interface as DeltaCatStorage
+ from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
+ from botocore.exceptions import BotoCoreError
+ from typing import Callable
+ from deltacat.utils.ray_utils.runtime import (
+ get_current_ray_task_id,
+ )
+ from daft.exceptions import DaftTransientError, DaftCoreException

+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

- class NonRetryableError(Exception):
- pass
+ DELTACAT_STORAGE_PARAM = "deltacat_storage"
+ DELTACAT_STORAGE_KWARGS_PARAM = "deltacat_storage_kwargs"


- class ConcurrentModificationError(Exception):
- pass
+ class DeltaCatErrorNames(str, Enum):
+
+ DEPENDENCY_RAY_ERROR = "DependencyRayError"
+ DEPENDENCY_RAY_WORKER_DIED_ERROR = "DependencyRayWorkerDiedError"
+ DEPENDENCY_RAY_OUT_OF_MEMORY_ERROR = "DependencyRayOOMError"
+ DEPENDENCY_RAY_RUNTIME_SETUP_ERROR = "DependencyRayRuntimeSetupError"
+ DEPENDENCY_BOTOCORE_ERROR = "DependencyBotocoreError"
+ DEPENDENCY_BOTOCORE_CONNECTION_ERROR = "DependencyBotocoreConnectionError"
+ DEPENDENCY_BOTOCORE_CREDENTIAL_ERROR = "DependencyBotocoreCredentialError"
+ DEPENDENCY_BOTOCORE_TIMEOUT_ERROR = "DependencyBotocoreTimeoutError"
+ NON_RETRYABLE_DOWNLOAD_TABLE_ERROR = "NonRetryableDownloadTableError"
+ NON_RETRYABLE_DOWNLOAD_FILE_ERROR = "NonRetryableDownloadFileError"
+ NON_RETRYABLE_UPLOAD_TABLE_ERROR = "NonRetryableUploadTableError"
+ NON_RETRYABLE_UPLOAD_FILE_ERROR = "NonRetryableUploadFileError"
+ DEPENDENCY_PYARROW_ERROR = "DependencyPyarrowError"
+ DEPENDENCY_PYARROW_INVALID_ERROR = "DependencyPyarrowInvalidError"
+ DEPENDENCY_PYARROW_CAPACITY_ERROR = "DependencyPyarrowCapacityError"
+ PYMEMCACHED_PUT_OBJECT_ERROR = "PymemcachedPutObjectError"
+ DEPENDENCY_DAFT_ERROR = "DependencyDaftError"
+
+ GENERAL_THROTTLING_ERROR = "GeneralThrottlingError"
+ RETRYABLE_UPLOAD_TABLE_ERROR = "RetryableUploadTableError"
+ RETRYABLE_UPLOAD_FILE_ERROR = "RetryableUploadFileError"
+ RETRYABLE_DOWNLOAD_FILE_ERROR = "RetryableDownloadFileError"
+ RETRYABLE_DOWNLOAD_TABLE_ERROR = "RetryableDownloadTableError"
+ RETRYABLE_TIMEOUT_ERROR = "RetryableTimeoutError"
+ DEPENDENCY_DAFT_TRANSIENT_ERROR = "DependencyDaftTransientError"
+
+ VALIDATION_ERROR = "ValidationError"
+ CONTENT_TYPE_VALIDATION_ERROR = "ContentTypeValidationError"
+
+ DELTACAT_SYSTEM_ERROR = "DeltaCatSystemError"
+ DELTACAT_TRANSIENT_ERROR = "DeltaCatTransientError"
+ UNCLASSIFIED_DELTACAT_ERROR = "UnclassifiedDeltaCatError"
+ UNRECOGNIZED_RAY_TASK_ERROR = "UnrecognizedRayTaskError"
+
+
+ class DeltaCatError(Exception):
+ def __init__(self, *args, **kwargs):
+ task_id, node_ip = self._get_ray_task_id_and_node_ip()
+ self.task_id = task_id
+ self.node_ip = node_ip
+ super().__init__(*args, **kwargs)
+
+ def _get_ray_task_id_and_node_ip(self):
+ task_id = get_current_ray_task_id()
+ node_ip = ray.util.get_node_ip_address()
+ return task_id, node_ip
+
+
+ class NonRetryableError(DeltaCatError):
+ is_retryable = False
+
+
+ class RetryableError(DeltaCatError):
+ is_retryable = True


  class ValidationError(NonRetryableError):
- pass
+ error_name = DeltaCatErrorNames.VALIDATION_ERROR.value
+
+
+ class UnclassifiedDeltaCatError(NonRetryableError):
+ error_name = DeltaCatErrorNames.UNCLASSIFIED_DELTACAT_ERROR.value
+
+
+ class DependencyRayError(NonRetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_RAY_ERROR.value
+
+
+ class DeltaCatTransientError(RetryableError):
+ error_name = DeltaCatErrorNames.DELTACAT_TRANSIENT_ERROR.value
+
+
+ class DependencyDaftError(NonRetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_DAFT_ERROR.value
+
+
+ class DependencyRayWorkerDiedError(RetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_RAY_WORKER_DIED_ERROR.value
+
+
+ class DependencyRayOutOfMemoryError(RetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_RAY_OUT_OF_MEMORY_ERROR.value
+
+
+ class DependencyRayRuntimeSetupError(RetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_RAY_RUNTIME_SETUP_ERROR.value
+
+
+ class DependencyPyarrowError(NonRetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_PYARROW_ERROR.value
+
+
+ class DependencyPyarrowInvalidError(NonRetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_PYARROW_INVALID_ERROR.value
+
+
+ class DependencyPyarrowCapacityError(NonRetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_PYARROW_CAPACITY_ERROR.value
+
+
+ class PymemcachedPutObjectError(RetryableError):
+ error_name = DeltaCatErrorNames.PYMEMCACHED_PUT_OBJECT_ERROR.value
+
+
+ class ContentTypeValidationError(NonRetryableError):
+ error_name = DeltaCatErrorNames.CONTENT_TYPE_VALIDATION_ERROR.value
+
+
+ class DependencyBotocoreError(NonRetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_ERROR.value
+
+
+ class DependencyBotocoreConnectionError(DeltaCatTransientError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_CONNECTION_ERROR.value
+
+
+ class DependencyBotocoreCredentialError(DeltaCatTransientError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_CREDENTIAL_ERROR.value
+
+
+ class DependencyBotocoreTimeoutError(DeltaCatTransientError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_TIMEOUT_ERROR.value
+
+
+ class NonRetryableDownloadFileError(NonRetryableError):
+ error_name = DeltaCatErrorNames.NON_RETRYABLE_DOWNLOAD_FILE_ERROR.value
+
+
+ class NonRetryableDownloadTableError(NonRetryableDownloadFileError):
+ error_name = DeltaCatErrorNames.NON_RETRYABLE_DOWNLOAD_TABLE_ERROR.value
+
+
+ class NonRetryableUploadFileError(NonRetryableError):
+ error_name = DeltaCatErrorNames.NON_RETRYABLE_UPLOAD_FILE_ERROR.value
+
+
+ class NonRetryableUploadTableError(NonRetryableUploadFileError):
+ error_name = DeltaCatErrorNames.NON_RETRYABLE_UPLOAD_TABLE_ERROR.value
+
+
+ class GeneralThrottlingError(RetryableError):
+ error_name = DeltaCatErrorNames.GENERAL_THROTTLING_ERROR.value
+
+
+ class RetryableUploadFileError(RetryableError):
+ error_name = DeltaCatErrorNames.RETRYABLE_UPLOAD_FILE_ERROR.value
+
+
+ class RetryableUploadTableError(RetryableUploadFileError):
+ error_name = DeltaCatErrorNames.RETRYABLE_UPLOAD_TABLE_ERROR.value
+
+
+ class RetryableDownloadFileError(RetryableError):
+ error_name = DeltaCatErrorNames.RETRYABLE_DOWNLOAD_FILE_ERROR.value
+
+
+ class RetryableDownloadTableError(RetryableDownloadFileError):
+ error_name = DeltaCatErrorNames.RETRYABLE_DOWNLOAD_TABLE_ERROR.value
+
+
+ class RetryableTimeoutError(RetryableError):
+ error_name = DeltaCatErrorNames.RETRYABLE_TIMEOUT_ERROR.value
+
+
+ class DependencyDaftTransientError(RetryableError):
+ error_name = DeltaCatErrorNames.DEPENDENCY_DAFT_TRANSIENT_ERROR.value
+
+
+ class DeltaCatSystemError(NonRetryableError):
+ error_name = DeltaCatErrorNames.DELTACAT_SYSTEM_ERROR.value
+
+
+ class UnrecognizedRayTaskError(NonRetryableError):
+ error_name = DeltaCatErrorNames.UNRECOGNIZED_RAY_TASK_ERROR.value
+
+
+ def categorize_errors(func: Callable):
+ def wrapper(*args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except BaseException as e:
+ deltacat_storage = None
+ deltacat_storage_kwargs = {}
+ if kwargs:
+ deltacat_storage = kwargs.get(DELTACAT_STORAGE_PARAM)
+ deltacat_storage_kwargs = kwargs.get(DELTACAT_STORAGE_KWARGS_PARAM, {})
+ if not deltacat_storage and args:
+ for arg in args:
+ if (
+ isinstance(arg, dict)
+ and arg.get(DELTACAT_STORAGE_PARAM) is not None
+ ):
+ deltacat_storage = arg.get(DELTACAT_STORAGE_PARAM)
+ deltacat_storage_kwargs = arg.get(
+ DELTACAT_STORAGE_KWARGS_PARAM, {}
+ )
+ break
+
+ categorize_deltacat_exception(e, deltacat_storage, deltacat_storage_kwargs)
+
+ return wrapper
+
+
+ def categorize_deltacat_exception(
+ e: BaseException,
+ deltacat_storage: DeltaCatStorage = None,
+ deltacat_storage_kwargs: dict = None,
+ ):
+ if deltacat_storage_kwargs is None:
+ deltacat_storage_kwargs = {}
+
+ if isinstance(e, DeltaCatError):
+ raise e
+ elif deltacat_storage and deltacat_storage.can_categorize(
+ e, **deltacat_storage_kwargs
+ ):
+ deltacat_storage.raise_categorized_error(e, **deltacat_storage_kwargs)
+ elif isinstance(e, RayError):
+ _categorize_ray_error(e)
+ elif isinstance(e, tenacity.RetryError):
+ _categorize_tenacity_error(e)
+ elif isinstance(e, ArrowException):
+ _categorize_dependency_pyarrow_error(e)
+ elif isinstance(e, AssertionError):
+ _categorize_assertion_error(e)
+ elif isinstance(e, DaftCoreException):
+ _categorize_daft_error(e)
+ elif isinstance(e, BotoCoreError):
+ _categorize_botocore_error(e)
+ else:
+ _categorize_all_remaining_errors(e)
+
+ logger.error(f"Error categorization failed for {e}.", exc_info=True)
+ raise UnclassifiedDeltaCatError(
+ "Error could not categorized into DeltaCat error"
+ ) from e
+
+
+ def _categorize_ray_error(e: RayError):
+ if isinstance(e, RuntimeEnvSetupError):
+ raise DependencyRayRuntimeSetupError("Ray failed to setup runtime env.") from e
+ elif isinstance(e, WorkerCrashedError) or isinstance(e, NodeDiedError):
+ raise DependencyRayWorkerDiedError("Ray worker died unexpectedly.") from e
+ elif isinstance(e, OutOfMemoryError):
+ raise DependencyRayOutOfMemoryError("Ray worker Out Of Memory.") from e
+ elif isinstance(e, RayTaskError):
+ if e.cause is not None and isinstance(e.cause, Exception):
+ categorize_deltacat_exception(e.cause)
+ else:
+ raise UnrecognizedRayTaskError(
+ "Unrecognized underlying error detected in a Ray task."
+ ) from e
+ else:
+ raise DependencyRayError("Dependency Ray error occurred.") from e
+
+
+ def _categorize_tenacity_error(e: tenacity.RetryError):
+ if e.__cause__ is not None and isinstance(e.__cause__, Exception):
+ categorize_deltacat_exception(e.__cause__)
+ else:
+ raise RetryableError("Unrecognized retryable error occurred.") from e
+
+
+ def _categorize_dependency_pyarrow_error(e: ArrowException):
+ if isinstance(e, ArrowInvalid):
+ raise DependencyPyarrowInvalidError(
+ f"Pyarrow Invalid error occurred. Reason: {e}"
+ ) from e
+ elif isinstance(e, ArrowCapacityError):
+ raise DependencyPyarrowCapacityError("Pyarrow Capacity error occurred.") from e
+ else:
+ raise DependencyPyarrowError("Pyarrow error occurred.") from e
+
+
+ def _categorize_assertion_error(e: BaseException):
+ raise ValidationError(
+ f"One of the assertions in DeltaCAT has failed. Reason: {e}"
+ ) from e
+
+
+ def _categorize_daft_error(e: DaftCoreException):
+ if isinstance(e, DaftTransientError):
+ raise DependencyDaftTransientError("Daft Transient error occurred.") from e
+ elif isinstance(e, DaftCoreException):
+ raise DependencyDaftError("Daft error occurred.") from e
+
+
+ def _categorize_botocore_error(e: BotoCoreError):
+ if isinstance(e, botocore.exceptions.ConnectionError) or isinstance(
+ e, botocore.exceptions.HTTPClientError
+ ):
+ raise DependencyBotocoreConnectionError(
+ "Botocore connection error occurred."
+ ) from e
+ elif isinstance(e, botocore.exceptions.CredentialRetrievalError) or isinstance(
+ e, botocore.exceptions.NoCredentialsError
+ ):
+ raise DependencyBotocoreCredentialError(
+ "Botocore credential retrieval failed"
+ ) from e
+ elif isinstance(e, botocore.exceptions.ReadTimeoutError) or isinstance(
+ e, botocore.exceptions.ConnectTimeoutError
+ ):
+ raise DependencyBotocoreTimeoutError("Botocore connection timed out.") from e
+ else:
+ raise DependencyBotocoreError("Botocore error occurred.") from e
+
+
+ def _categorize_all_remaining_errors(e: BaseException):
+ if isinstance(e, ConnectionError):
+ raise DeltaCatTransientError("Connection error has occurred.") from e
+ elif isinstance(e, TimeoutError):
+ raise DeltaCatTransientError("Timeout error has occurred.") from e
+ elif isinstance(e, OSError):
+ raise DeltaCatTransientError("OSError occurred.") from e
+ elif isinstance(e, SystemExit):
+ raise DeltaCatSystemError("Unexpected System error occurred.") from e
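Usage-wise, the new module centers on the categorize_errors decorator: any unhandled exception raised by the wrapped function is re-raised as a DeltaCatError subclass, and callers can branch on the retryable/non-retryable split. A small illustrative example (not taken from the package's own call sites):

```python
from deltacat.exceptions import DeltaCatError, RetryableError, categorize_errors


@categorize_errors
def flaky_step() -> None:
    # Per _categorize_all_remaining_errors above, a ConnectionError is
    # re-raised as DeltaCatTransientError, which is a RetryableError.
    raise ConnectionError("socket closed")


try:
    flaky_step()
except DeltaCatError as e:
    print(type(e).__name__, "retryable:", isinstance(e, RetryableError))
```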
deltacat/io/dataset.py CHANGED
@@ -6,9 +6,6 @@ from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
  import pyarrow as pa
  import s3fs
  from ray.data import Dataset
- from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider
-
- from deltacat.io.aws.redshift.redshift_datasource import RedshiftDatasource

  T = TypeVar("T")

@@ -27,7 +24,6 @@ class DeltacatDataset(Dataset[T]):
  filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
  try_create_dir: bool = True,
  arrow_open_stream_args: Optional[Dict[str, Any]] = None,
- block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
  arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
  **arrow_parquet_args,
  ) -> None:
@@ -59,9 +55,8 @@ class DeltacatDataset(Dataset[T]):
  if True. Does nothing if all directories already exist.
  arrow_open_stream_args: kwargs passed to
  pyarrow.fs.FileSystem.open_output_stream
- block_path_provider: BlockWritePathProvider implementation
- to write each dataset block to a custom output path. Uses
- DefaultBlockWritePathProvider if None.
+ filename_provider: FilenameProvider implementation
+ to write each dataset block to a custom output path.
  arrow_parquet_args_fn: Callable that returns a dictionary of write
  arguments to use when writing each block to a file. Overrides
  any duplicate keys from arrow_parquet_args. This should be used
@@ -72,14 +67,7 @@ class DeltacatDataset(Dataset[T]):
  pyarrow.parquet.write_table(), which is used to write out each
  block to a file.
  """
- self.write_datasource(
- RedshiftDatasource(),
- path=path,
- dataset_uuid=self._uuid,
- filesystem=filesystem,
- try_create_dir=try_create_dir,
- open_stream_args=arrow_open_stream_args,
- block_path_provider=block_path_provider,
- write_args_fn=arrow_parquet_args_fn,
- **arrow_parquet_args,
+ raise NotImplementedError(
+ "Writing to Redshift is not yet supported. "
+ "Please use DeltacatDataset.write_parquet() instead."
  )
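With the Redshift write path now raising NotImplementedError, the error message points callers at write_parquet, which DeltacatDataset inherits from Ray Data's Dataset. A hedged sketch of that fallback (the output path below is a placeholder):

```python
import ray

# Any Ray Dataset (and therefore DeltacatDataset) exposes write_parquet.
ds = ray.data.range(10)
ds.write_parquet("/tmp/deltacat_parquet_out")  # placeholder local path
```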
deltacat/io/memcached_object_store.py CHANGED
@@ -12,6 +12,9 @@ from pymemcache.client.retrying import RetryingClient
  from pymemcache.exceptions import MemcacheUnexpectedCloseError
  from pymemcache.client.rendezvous import RendezvousHash
  from deltacat.utils.cloudpickle import dump_into_chunks
+ from deltacat.exceptions import (
+ PymemcachedPutObjectError,
+ )

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -72,7 +75,7 @@ class MemcachedObjectStore(IObjectStore):
  for create_ref_ip, ref_to_object in input.items():
  client = self._get_client_by_ip(create_ref_ip)
  if client.set_many(ref_to_object, noreply=self.noreply):
- raise RuntimeError("Unable to write few keys to cache")
+ raise PymemcachedPutObjectError("Unable to write a few keys to cache")

  return result

@@ -87,10 +90,10 @@ class MemcachedObjectStore(IObjectStore):

  try:
  if not client.set(ref, chunk, noreply=self.noreply):
- raise RuntimeError(f"Unable to write {ref} to cache")
+ raise PymemcachedPutObjectError(f"Unable to write {ref} to cache")
  except BaseException as e:
- raise RuntimeError(
- f"Received {e} while writing ref={ref} and obj size={len(chunk)}"
+ raise PymemcachedPutObjectError(
+ f"Received {e} while writing ref={ref} and obj size={len(chunk)}",
  )

  return self._create_ref(uid, create_ref_ip, len(serialized_list))
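Because PymemcachedPutObjectError derives from RetryableError, failed cache writes now participate in the same retry classification as other transient failures instead of surfacing as a bare RuntimeError. A small illustrative snippet:

```python
from deltacat.exceptions import PymemcachedPutObjectError, RetryableError

try:
    raise PymemcachedPutObjectError("Unable to write ref-123 to cache")
except RetryableError as e:
    print("eligible for retry:", e)
```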
deltacat/storage/__init__.py CHANGED
@@ -14,6 +14,20 @@ from deltacat.storage.model.stream import Stream, StreamLocator
  from deltacat.storage.model.table import Table, TableLocator
  from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
  from deltacat.storage.model.delete_parameters import DeleteParameters
+ from deltacat.storage.model.partition_spec import (
+ PartitionFilter,
+ PartitionValues,
+ DeltaPartitionSpec,
+ StreamPartitionSpec,
+ )
+ from deltacat.storage.model.transform import (
+ Transform,
+ TransformName,
+ TransformParameters,
+ BucketingStrategy,
+ BucketTransformParameters,
+ IdentityTransformParameters,
+ )

  from deltacat.storage.model.types import (
  CommitState,
@@ -56,4 +70,14 @@ __all__ = [
  "TableVersionLocator",
  "SortKey",
  "SortOrder",
+ "PartitionFilter",
+ "PartitionValues",
+ "DeltaPartitionSpec",
+ "StreamPartitionSpec",
+ "Transform",
+ "TransformName",
+ "TransformParameters",
+ "BucketingStrategy",
+ "BucketTransformParameters",
+ "IdentityTransformParameters",
  ]
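The expanded __all__ means the new partition-spec and transform models can be imported directly from the top-level storage package, for example:

```python
from deltacat.storage import (
    BucketTransformParameters,
    DeltaPartitionSpec,
    IdentityTransformParameters,
    PartitionFilter,
    StreamPartitionSpec,
)
```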