deltacat 1.1.8__py3-none-any.whl → 1.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/redshift/model/manifest.py +16 -0
- deltacat/aws/s3u.py +65 -38
- deltacat/compute/compactor/compaction_session.py +5 -1
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor/repartition_session.py +1 -0
- deltacat/compute/compactor/utils/round_completion_file.py +39 -9
- deltacat/compute/compactor_v2/compaction_session.py +26 -16
- deltacat/compute/compactor_v2/constants.py +5 -11
- deltacat/compute/compactor_v2/model/{compaction_session.py → evaluate_compaction_result.py} +1 -2
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/dataset.py +5 -17
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/__init__.py +24 -0
- deltacat/storage/interface.py +56 -6
- deltacat/storage/model/delta.py +23 -3
- deltacat/storage/model/partition.py +6 -7
- deltacat/storage/model/partition_spec.py +71 -0
- deltacat/storage/model/stream.py +38 -1
- deltacat/storage/model/transform.py +127 -0
- deltacat/tests/aws/test_s3u.py +2 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +209 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +204 -37
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_common.py +19 -4
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +124 -29
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/test_utils/pyarrow.py +4 -1
- deltacat/tests/utils/ray_utils/test_dataset.py +66 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/numpy.py +3 -3
- deltacat/utils/pandas.py +3 -3
- deltacat/utils/pyarrow.py +11 -8
- deltacat/utils/ray_utils/dataset.py +7 -7
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/METADATA +6 -5
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/RECORD +58 -51
- deltacat/io/aws/redshift/redshift_datasource.py +0 -578
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.10.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/model/merge_input.py CHANGED

@@ -47,6 +47,7 @@ class MergeInput(Dict):
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
         memory_logs_enabled: Optional[bool] = None,
+        disable_copy_by_reference: Optional[bool] = None,
     ) -> MergeInput:
 
         result = MergeInput()
@@ -69,6 +70,7 @@ class MergeInput(Dict):
         result["deltacat_storage"] = deltacat_storage
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
         result["memory_logs_enabled"] = memory_logs_enabled
+        result["disable_copy_by_reference"] = disable_copy_by_reference
         return result
 
     @property
@@ -148,3 +150,7 @@ class MergeInput(Dict):
     @property
     def delete_strategy(self) -> Optional[DeleteStrategy]:
         return self.get("delete_strategy")
+
+    @property
+    def disable_copy_by_reference(self) -> bool:
+        return self["disable_copy_by_reference"]
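MergeInput gains an optional disable_copy_by_reference flag, stored and exposed through the dict-backed property pattern shown above. A minimal self-contained sketch of that pattern (a toy stand-in; the real factory takes many more arguments than reproduced here):

from typing import Dict, Optional

class MergeInputSketch(Dict):
    # Toy stand-in for deltacat's MergeInput; only the new field is modeled.
    @staticmethod
    def of(disable_copy_by_reference: Optional[bool] = None) -> "MergeInputSketch":
        result = MergeInputSketch()
        result["disable_copy_by_reference"] = disable_copy_by_reference
        return result

    @property
    def disable_copy_by_reference(self) -> bool:
        return self["disable_copy_by_reference"]

merge_input = MergeInputSketch.of(disable_copy_by_reference=True)
assert merge_input.disable_copy_by_reference is True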
deltacat/compute/compactor_v2/steps/hash_bucket.py CHANGED

@@ -29,14 +29,15 @@ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_m
 from deltacat.utils.resources import (
     get_current_process_peak_memory_usage_in_bytes,
     ProcessUtilizationOverTimeRange,
-    timeout,
 )
 from deltacat.constants import BYTES_PER_GIBIBYTE
 from deltacat.compute.compactor_v2.constants import (
     HASH_BUCKET_TIME_IN_SECONDS,
     HASH_BUCKET_FAILURE_COUNT,
     HASH_BUCKET_SUCCESS_COUNT,
-
+)
+from deltacat.exceptions import (
+    categorize_errors,
 )
 
 if importlib.util.find_spec("memray"):
@@ -79,7 +80,7 @@ def _group_file_records_by_pk_hash_bucket(
     logger.info("Grouping by pk hash bucket")
     group_start = time.monotonic()
     hash_bucket_to_table = group_by_pk_hash_bucket(
-        dfe.table, num_hash_buckets, primary_keys
+        table=dfe.table, num_buckets=num_hash_buckets, primary_keys=primary_keys
     )
     group_end = time.monotonic()
     logger.info(f"Grouping took: {group_end - group_start}")
@@ -98,12 +99,9 @@ def _group_file_records_by_pk_hash_bucket(
     return hb_to_delta_file_envelopes, total_record_count, total_size_bytes
 
 
-# TODO: use timeout parameter in ray.remote
-# https://github.com/ray-project/ray/issues/18916
-# Note: order of decorators is important
 @success_metric(name=HASH_BUCKET_SUCCESS_COUNT)
 @failure_metric(name=HASH_BUCKET_FAILURE_COUNT)
-@
+@categorize_errors
 def _timed_hash_bucket(input: HashBucketInput):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
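The removed timeout decorator slot is taken by @categorize_errors, which sits closest to the task body so the failure metric above it observes the already-categorized error type. A toy sketch of that ordering (simplified stand-ins, not the deltacat implementations):

import functools

class CategorizedError(Exception):
    pass

def failure_metric(name):
    # Toy metric decorator: observes whatever exception type reaches it.
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"{name} incremented for {type(e).__name__}")
                raise
        return wrapper
    return deco

def categorize_errors(func):
    # Toy categorizer: wraps any raw error in a single known type.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            raise CategorizedError(str(e)) from e
    return wrapper

@failure_metric(name="HASH_BUCKET_FAILURE_COUNT")
@categorize_errors
def _timed_task():
    raise ValueError("raw task error")

# Calling _timed_task() prints
# "HASH_BUCKET_FAILURE_COUNT incremented for CategorizedError" and re-raises.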
deltacat/compute/compactor_v2/steps/merge.py CHANGED

@@ -28,7 +28,6 @@ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_m
 from deltacat.utils.resources import (
     get_current_process_peak_memory_usage_in_bytes,
     ProcessUtilizationOverTimeRange,
-    timeout,
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
@@ -47,9 +46,10 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
-    MERGE_TASK_TIMEOUT_IN_SECONDS,
 )
-
+from deltacat.exceptions import (
+    categorize_errors,
+)
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -284,16 +284,19 @@ def _can_copy_by_reference(
     Can copy by reference only if there are no deletes to merge in
     and previous compacted stream id matches that of new stream
     """
-
+    copy_by_ref = (
         not has_delete
         and not merge_file_group.dfe_groups
         and input.round_completion_info is not None
-        and (
-            input.write_to_partition.stream_id
-            == input.round_completion_info.compacted_delta_locator.stream_id
-        )
     )
 
+    if input.disable_copy_by_reference:
+        copy_by_ref = False
+
+    logger.info(f"Copy by reference is {copy_by_ref} for {merge_file_group.hb_index}")
+
+    return copy_by_ref
+
 
 def _flatten_dfe_list(
     df_envelopes_list: List[List[DeltaFileEnvelope]],
@@ -486,12 +489,9 @@ def _copy_manifests_from_hash_bucketing(
     return materialized_results
 
 
-# TODO: use timeout parameter in ray.remote
-# https://github.com/ray-project/ray/issues/18916
-# Note: order of decorators is important
 @success_metric(name=MERGE_SUCCESS_COUNT)
 @failure_metric(name=MERGE_FAILURE_COUNT)
-@
+@categorize_errors
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
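_can_copy_by_reference now computes the eligibility flag first and then lets the new disable_copy_by_reference input veto it. The same decision logic as a pure function (parameter names are illustrative, not the real signature):

from typing import Optional

def can_copy_by_reference(
    has_delete: bool,
    has_dfe_groups: bool,
    has_round_completion_info: bool,
    disable_copy_by_reference: Optional[bool],
) -> bool:
    # Eligible only when there is nothing to merge in and a prior round exists.
    copy_by_ref = (
        not has_delete and not has_dfe_groups and has_round_completion_info
    )
    # The new input flag acts as an unconditional veto.
    if disable_copy_by_reference:
        copy_by_ref = False
    return copy_by_ref

assert can_copy_by_reference(False, False, True, None) is True
assert can_copy_by_reference(False, False, True, True) is False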
deltacat/compute/compactor_v2/utils/primary_key_index.py CHANGED

@@ -157,7 +157,12 @@ def _optimized_group_record_batches_by_hash_bucket(
 def group_by_pk_hash_bucket(
     table: pa.Table, num_buckets: int, primary_keys: List[str]
 ) -> np.ndarray:
-
+    new_tables = generate_pk_hash_column([table], primary_keys, requires_hash=True)
+    assert (
+        len(new_tables) == 1
+    ), f"Expected only 1 table in the result but found {len(new_tables)}"
+
+    table = generate_pk_hash_column([table], primary_keys, requires_hash=True)[0]
 
     # group hash bucket record indices
     result = group_record_indices_by_hash_bucket(
@@ -171,7 +176,7 @@ def group_by_pk_hash_bucket(
 def generate_pk_hash_column(
     tables: List[pa.Table],
     primary_keys: Optional[List[str]] = None,
-
+    requires_hash: bool = False,
 ) -> List[pa.Table]:
     """
     Returns a new table list after generating the primary key hash if desired.
@@ -203,12 +208,12 @@ def generate_pk_hash_column(
     if primary_keys:
         hash_column_list = [_generate_pk_hash(table) for table in tables]
 
-        can_sha1 =
+        can_sha1 = requires_hash or _is_sha1_desired(hash_column_list)
     else:
         hash_column_list = [_generate_uuid(table) for table in tables]
 
     logger.info(
-        f"can_generate_sha1={can_sha1} for the table and requires_sha1={
+        f"can_generate_sha1={can_sha1} for the table and requires_sha1={requires_hash}"
     )
 
     result = []
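group_by_pk_hash_bucket now forces a digest-based primary key hash via requires_hash=True instead of leaving the choice to _is_sha1_desired. An illustrative toy of what forcing the digest means (not the deltacat implementation; the helper name and column layout are assumptions):

import hashlib
import pyarrow as pa

def generate_pk_hash(
    table: pa.Table, primary_keys: list, requires_hash: bool
) -> pa.Table:
    # Join each row's primary key values, then optionally digest them.
    pk_strings = [
        "|".join(str(table.column(k)[i].as_py()) for k in primary_keys)
        for i in range(table.num_rows)
    ]
    if requires_hash:
        pk_strings = [hashlib.sha1(s.encode()).hexdigest() for s in pk_strings]
    return table.append_column("pk_hash", pa.array(pk_strings))

t = pa.table({"id": [1, 2], "region": ["us", "eu"]})
print(generate_pk_hash(t, ["id", "region"], requires_hash=True).column_names)
# -> ['id', 'region', 'pk_hash']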
deltacat/compute/compactor_v2/utils/task_options.py CHANGED

@@ -1,6 +1,4 @@
-import botocore
 import logging
-import tenacity
 from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
 from deltacat.compute.compactor_v2.model.merge_file_group import (
@@ -21,8 +19,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
 from deltacat.compute.compactor_v2.constants import (
     PARQUET_TO_PYARROW_INFLATION,
 )
-from
-
+from deltacat.exceptions import RetryableError
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -79,14 +76,7 @@ def get_task_options(
 
     # List of possible botocore exceptions are available at
     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
-    task_opts["retry_exceptions"] = [
-        botocore.exceptions.ConnectionError,
-        botocore.exceptions.HTTPClientError,
-        ConnectionError,
-        TimeoutError,
-        DaftTransientError,
-        tenacity.RetryError,
-    ]
+    task_opts["retry_exceptions"] = [RetryableError]
 
     return task_opts
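Instead of enumerating individual botocore, daft, and tenacity exceptions, Ray's retry_exceptions option now matches the single RetryableError base class, so anything the categorizer re-raises as retryable gets retried. A sketch of the mechanism (assumes a running Ray cluster; the task and subclass names are illustrative):

import ray

class RetryableError(Exception):
    # Stand-in for deltacat.exceptions.RetryableError.
    pass

class ThrottlingError(RetryableError):
    pass

@ray.remote(max_retries=3, retry_exceptions=[RetryableError])
def flaky_task():
    # Subclasses of RetryableError match the retry filter too.
    raise ThrottlingError("transient throttle")

# ray.get(flaky_task.remote())  # retried up to 3 times before surfacing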
deltacat/exceptions.py CHANGED

@@ -1,14 +1,349 @@
-
-
+from __future__ import annotations
+from enum import Enum
+import botocore
+import ray
+import logging
+import tenacity
+from deltacat import logs
+from ray.exceptions import (
+    RayError,
+    RayTaskError,
+    RuntimeEnvSetupError,
+    WorkerCrashedError,
+    NodeDiedError,
+    OutOfMemoryError,
+)
+from deltacat.storage import interface as DeltaCatStorage
+from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
+from botocore.exceptions import BotoCoreError
+from typing import Callable
+from deltacat.utils.ray_utils.runtime import (
+    get_current_ray_task_id,
+)
+from daft.exceptions import DaftTransientError, DaftCoreException
 
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-
-
+DELTACAT_STORAGE_PARAM = "deltacat_storage"
+DELTACAT_STORAGE_KWARGS_PARAM = "deltacat_storage_kwargs"
 
 
-class
-
+class DeltaCatErrorNames(str, Enum):
+
+    DEPENDENCY_RAY_ERROR = "DependencyRayError"
+    DEPENDENCY_RAY_WORKER_DIED_ERROR = "DependencyRayWorkerDiedError"
+    DEPENDENCY_RAY_OUT_OF_MEMORY_ERROR = "DependencyRayOOMError"
+    DEPENDENCY_RAY_RUNTIME_SETUP_ERROR = "DependencyRayRuntimeSetupError"
+    DEPENDENCY_BOTOCORE_ERROR = "DependencyBotocoreError"
+    DEPENDENCY_BOTOCORE_CONNECTION_ERROR = "DependencyBotocoreConnectionError"
+    DEPENDENCY_BOTOCORE_CREDENTIAL_ERROR = "DependencyBotocoreCredentialError"
+    DEPENDENCY_BOTOCORE_TIMEOUT_ERROR = "DependencyBotocoreTimeoutError"
+    NON_RETRYABLE_DOWNLOAD_TABLE_ERROR = "NonRetryableDownloadTableError"
+    NON_RETRYABLE_DOWNLOAD_FILE_ERROR = "NonRetryableDownloadFileError"
+    NON_RETRYABLE_UPLOAD_TABLE_ERROR = "NonRetryableUploadTableError"
+    NON_RETRYABLE_UPLOAD_FILE_ERROR = "NonRetryableUploadFileError"
+    DEPENDENCY_PYARROW_ERROR = "DependencyPyarrowError"
+    DEPENDENCY_PYARROW_INVALID_ERROR = "DependencyPyarrowInvalidError"
+    DEPENDENCY_PYARROW_CAPACITY_ERROR = "DependencyPyarrowCapacityError"
+    PYMEMCACHED_PUT_OBJECT_ERROR = "PymemcachedPutObjectError"
+    DEPENDENCY_DAFT_ERROR = "DependencyDaftError"
+
+    GENERAL_THROTTLING_ERROR = "GeneralThrottlingError"
+    RETRYABLE_UPLOAD_TABLE_ERROR = "RetryableUploadTableError"
+    RETRYABLE_UPLOAD_FILE_ERROR = "RetryableUploadFileError"
+    RETRYABLE_DOWNLOAD_FILE_ERROR = "RetryableDownloadFileError"
+    RETRYABLE_DOWNLOAD_TABLE_ERROR = "RetryableDownloadTableError"
+    RETRYABLE_TIMEOUT_ERROR = "RetryableTimeoutError"
+    DEPENDENCY_DAFT_TRANSIENT_ERROR = "DependencyDaftTransientError"
+
+    VALIDATION_ERROR = "ValidationError"
+    CONTENT_TYPE_VALIDATION_ERROR = "ContentTypeValidationError"
+
+    DELTACAT_SYSTEM_ERROR = "DeltaCatSystemError"
+    DELTACAT_TRANSIENT_ERROR = "DeltaCatTransientError"
+    UNCLASSIFIED_DELTACAT_ERROR = "UnclassifiedDeltaCatError"
+    UNRECOGNIZED_RAY_TASK_ERROR = "UnrecognizedRayTaskError"
+
+
+class DeltaCatError(Exception):
+    def __init__(self, *args, **kwargs):
+        task_id, node_ip = self._get_ray_task_id_and_node_ip()
+        self.task_id = task_id
+        self.node_ip = node_ip
+        super().__init__(*args, **kwargs)
+
+    def _get_ray_task_id_and_node_ip(self):
+        task_id = get_current_ray_task_id()
+        node_ip = ray.util.get_node_ip_address()
+        return task_id, node_ip
+
+
+class NonRetryableError(DeltaCatError):
+    is_retryable = False
+
+
+class RetryableError(DeltaCatError):
+    is_retryable = True
 
 
 class ValidationError(NonRetryableError):
-
+    error_name = DeltaCatErrorNames.VALIDATION_ERROR.value
+
+
+class UnclassifiedDeltaCatError(NonRetryableError):
+    error_name = DeltaCatErrorNames.UNCLASSIFIED_DELTACAT_ERROR.value
+
+
+class DependencyRayError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_RAY_ERROR.value
+
+
+class DeltaCatTransientError(RetryableError):
+    error_name = DeltaCatErrorNames.DELTACAT_TRANSIENT_ERROR.value
+
+
+class DependencyDaftError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_DAFT_ERROR.value
+
+
+class DependencyRayWorkerDiedError(RetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_RAY_WORKER_DIED_ERROR.value
+
+
+class DependencyRayOutOfMemoryError(RetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_RAY_OUT_OF_MEMORY_ERROR.value
+
+
+class DependencyRayRuntimeSetupError(RetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_RAY_RUNTIME_SETUP_ERROR.value
+
+
+class DependencyPyarrowError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_PYARROW_ERROR.value
+
+
+class DependencyPyarrowInvalidError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_PYARROW_INVALID_ERROR.value
+
+
+class DependencyPyarrowCapacityError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_PYARROW_CAPACITY_ERROR.value
+
+
+class PymemcachedPutObjectError(RetryableError):
+    error_name = DeltaCatErrorNames.PYMEMCACHED_PUT_OBJECT_ERROR.value
+
+
+class ContentTypeValidationError(NonRetryableError):
+    error_name = DeltaCatErrorNames.CONTENT_TYPE_VALIDATION_ERROR.value
+
+
+class DependencyBotocoreError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_ERROR.value
+
+
+class DependencyBotocoreConnectionError(DeltaCatTransientError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_CONNECTION_ERROR.value
+
+
+class DependencyBotocoreCredentialError(DeltaCatTransientError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_CREDENTIAL_ERROR.value
+
+
+class DependencyBotocoreTimeoutError(DeltaCatTransientError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_BOTOCORE_TIMEOUT_ERROR.value
+
+
+class NonRetryableDownloadFileError(NonRetryableError):
+    error_name = DeltaCatErrorNames.NON_RETRYABLE_DOWNLOAD_FILE_ERROR.value
+
+
+class NonRetryableDownloadTableError(NonRetryableDownloadFileError):
+    error_name = DeltaCatErrorNames.NON_RETRYABLE_DOWNLOAD_TABLE_ERROR.value
+
+
+class NonRetryableUploadFileError(NonRetryableError):
+    error_name = DeltaCatErrorNames.NON_RETRYABLE_UPLOAD_FILE_ERROR.value
+
+
+class NonRetryableUploadTableError(NonRetryableUploadFileError):
+    error_name = DeltaCatErrorNames.NON_RETRYABLE_UPLOAD_TABLE_ERROR.value
+
+
+class GeneralThrottlingError(RetryableError):
+    error_name = DeltaCatErrorNames.GENERAL_THROTTLING_ERROR.value
+
+
+class RetryableUploadFileError(RetryableError):
+    error_name = DeltaCatErrorNames.RETRYABLE_UPLOAD_FILE_ERROR.value
+
+
+class RetryableUploadTableError(RetryableUploadFileError):
+    error_name = DeltaCatErrorNames.RETRYABLE_UPLOAD_TABLE_ERROR.value
+
+
+class RetryableDownloadFileError(RetryableError):
+    error_name = DeltaCatErrorNames.RETRYABLE_DOWNLOAD_FILE_ERROR.value
+
+
+class RetryableDownloadTableError(RetryableDownloadFileError):
+    error_name = DeltaCatErrorNames.RETRYABLE_DOWNLOAD_TABLE_ERROR.value
+
+
+class RetryableTimeoutError(RetryableError):
+    error_name = DeltaCatErrorNames.RETRYABLE_TIMEOUT_ERROR.value
+
+
+class DependencyDaftTransientError(RetryableError):
+    error_name = DeltaCatErrorNames.DEPENDENCY_DAFT_TRANSIENT_ERROR.value
+
+
+class DeltaCatSystemError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DELTACAT_SYSTEM_ERROR.value
+
+
+class UnrecognizedRayTaskError(NonRetryableError):
+    error_name = DeltaCatErrorNames.UNRECOGNIZED_RAY_TASK_ERROR.value
+
+
+def categorize_errors(func: Callable):
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except BaseException as e:
+            deltacat_storage = None
+            deltacat_storage_kwargs = {}
+            if kwargs:
+                deltacat_storage = kwargs.get(DELTACAT_STORAGE_PARAM)
+                deltacat_storage_kwargs = kwargs.get(DELTACAT_STORAGE_KWARGS_PARAM, {})
+            if not deltacat_storage and args:
+                for arg in args:
+                    if (
+                        isinstance(arg, dict)
+                        and arg.get(DELTACAT_STORAGE_PARAM) is not None
+                    ):
+                        deltacat_storage = arg.get(DELTACAT_STORAGE_PARAM)
+                        deltacat_storage_kwargs = arg.get(
+                            DELTACAT_STORAGE_KWARGS_PARAM, {}
+                        )
+                        break
+
+            categorize_deltacat_exception(e, deltacat_storage, deltacat_storage_kwargs)
+
+    return wrapper
+
+
+def categorize_deltacat_exception(
+    e: BaseException,
+    deltacat_storage: DeltaCatStorage = None,
+    deltacat_storage_kwargs: dict = None,
+):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
+    if isinstance(e, DeltaCatError):
+        raise e
+    elif deltacat_storage and deltacat_storage.can_categorize(
+        e, **deltacat_storage_kwargs
+    ):
+        deltacat_storage.raise_categorized_error(e, **deltacat_storage_kwargs)
+    elif isinstance(e, RayError):
+        _categorize_ray_error(e)
+    elif isinstance(e, tenacity.RetryError):
+        _categorize_tenacity_error(e)
+    elif isinstance(e, ArrowException):
+        _categorize_dependency_pyarrow_error(e)
+    elif isinstance(e, AssertionError):
+        _categorize_assertion_error(e)
+    elif isinstance(e, DaftCoreException):
+        _categorize_daft_error(e)
+    elif isinstance(e, BotoCoreError):
+        _categorize_botocore_error(e)
+    else:
+        _categorize_all_remaining_errors(e)
+
+    logger.error(f"Error categorization failed for {e}.", exc_info=True)
+    raise UnclassifiedDeltaCatError(
+        "Error could not categorized into DeltaCat error"
+    ) from e
+
+
+def _categorize_ray_error(e: RayError):
+    if isinstance(e, RuntimeEnvSetupError):
+        raise DependencyRayRuntimeSetupError("Ray failed to setup runtime env.") from e
+    elif isinstance(e, WorkerCrashedError) or isinstance(e, NodeDiedError):
+        raise DependencyRayWorkerDiedError("Ray worker died unexpectedly.") from e
+    elif isinstance(e, OutOfMemoryError):
+        raise DependencyRayOutOfMemoryError("Ray worker Out Of Memory.") from e
+    elif isinstance(e, RayTaskError):
+        if e.cause is not None and isinstance(e.cause, Exception):
+            categorize_deltacat_exception(e.cause)
+        else:
+            raise UnrecognizedRayTaskError(
+                "Unrecognized underlying error detected in a Ray task."
+            ) from e
+    else:
+        raise DependencyRayError("Dependency Ray error occurred.") from e
+
+
+def _categorize_tenacity_error(e: tenacity.RetryError):
+    if e.__cause__ is not None and isinstance(e.__cause__, Exception):
+        categorize_deltacat_exception(e.__cause__)
+    else:
+        raise RetryableError("Unrecognized retryable error occurred.") from e
+
+
+def _categorize_dependency_pyarrow_error(e: ArrowException):
+    if isinstance(e, ArrowInvalid):
+        raise DependencyPyarrowInvalidError(
+            f"Pyarrow Invalid error occurred. Reason: {e}"
+        ) from e
+    elif isinstance(e, ArrowCapacityError):
+        raise DependencyPyarrowCapacityError("Pyarrow Capacity error occurred.") from e
+    else:
+        raise DependencyPyarrowError("Pyarrow error occurred.") from e
+
+
+def _categorize_assertion_error(e: BaseException):
+    raise ValidationError(
+        f"One of the assertions in DeltaCAT has failed. Reason: {e}"
+    ) from e
+
+
+def _categorize_daft_error(e: DaftCoreException):
+    if isinstance(e, DaftTransientError):
+        raise DependencyDaftTransientError("Daft Transient error occurred.") from e
+    elif isinstance(e, DaftCoreException):
+        raise DependencyDaftError("Daft error occurred.") from e
+
+
+def _categorize_botocore_error(e: BotoCoreError):
+    if isinstance(e, botocore.exceptions.ConnectionError) or isinstance(
+        e, botocore.exceptions.HTTPClientError
+    ):
+        raise DependencyBotocoreConnectionError(
+            "Botocore connection error occurred."
+        ) from e
+    elif isinstance(e, botocore.exceptions.CredentialRetrievalError) or isinstance(
+        e, botocore.exceptions.NoCredentialsError
+    ):
+        raise DependencyBotocoreCredentialError(
+            "Botocore credential retrieval failed"
+        ) from e
+    elif isinstance(e, botocore.exceptions.ReadTimeoutError) or isinstance(
+        e, botocore.exceptions.ConnectTimeoutError
+    ):
+        raise DependencyBotocoreTimeoutError("Botocore connection timed out.") from e
+    else:
+        raise DependencyBotocoreError("Botocore error occurred.") from e
+
+
+def _categorize_all_remaining_errors(e: BaseException):
+    if isinstance(e, ConnectionError):
+        raise DeltaCatTransientError("Connection error has occurred.") from e
+    elif isinstance(e, TimeoutError):
+        raise DeltaCatTransientError("Timeout error has occurred.") from e
+    elif isinstance(e, OSError):
+        raise DeltaCatTransientError("OSError occurred.") from e
+    elif isinstance(e, SystemExit):
+        raise DeltaCatSystemError("Unexpected System error occurred.") from e
deltacat/io/dataset.py CHANGED

@@ -6,9 +6,6 @@ from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
 import pyarrow as pa
 import s3fs
 from ray.data import Dataset
-from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider
-
-from deltacat.io.aws.redshift.redshift_datasource import RedshiftDatasource
 
 T = TypeVar("T")
 
@@ -27,7 +24,6 @@ class DeltacatDataset(Dataset[T]):
         filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
         try_create_dir: bool = True,
         arrow_open_stream_args: Optional[Dict[str, Any]] = None,
-        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
         arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
         **arrow_parquet_args,
     ) -> None:
@@ -59,9 +55,8 @@ class DeltacatDataset(Dataset[T]):
                 if True. Does nothing if all directories already exist.
             arrow_open_stream_args: kwargs passed to
                 pyarrow.fs.FileSystem.open_output_stream
-
-                to write each dataset block to a custom output path.
-                DefaultBlockWritePathProvider if None.
+            filename_provider: FilenameProvider implementation
+                to write each dataset block to a custom output path.
             arrow_parquet_args_fn: Callable that returns a dictionary of write
                 arguments to use when writing each block to a file. Overrides
                 any duplicate keys from arrow_parquet_args. This should be used
@@ -72,14 +67,7 @@ class DeltacatDataset(Dataset[T]):
                 pyarrow.parquet.write_table(), which is used to write out each
                 block to a file.
         """
-
-
-
-            dataset_uuid=self._uuid,
-            filesystem=filesystem,
-            try_create_dir=try_create_dir,
-            open_stream_args=arrow_open_stream_args,
-            block_path_provider=block_path_provider,
-            write_args_fn=arrow_parquet_args_fn,
-            **arrow_parquet_args,
+        raise NotImplementedError(
+            "Writing to Redshift is not yet supported. "
+            "Please use DeltacatDataset.write_parquet() instead."
         )
deltacat/io/memcached_object_store.py CHANGED

@@ -12,6 +12,9 @@ from pymemcache.client.retrying import RetryingClient
 from pymemcache.exceptions import MemcacheUnexpectedCloseError
 from pymemcache.client.rendezvous import RendezvousHash
 from deltacat.utils.cloudpickle import dump_into_chunks
+from deltacat.exceptions import (
+    PymemcachedPutObjectError,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -72,7 +75,7 @@ class MemcachedObjectStore(IObjectStore):
         for create_ref_ip, ref_to_object in input.items():
             client = self._get_client_by_ip(create_ref_ip)
             if client.set_many(ref_to_object, noreply=self.noreply):
-                raise
+                raise PymemcachedPutObjectError("Unable to write a few keys to cache")
 
         return result
 
@@ -87,10 +90,10 @@ class MemcachedObjectStore(IObjectStore):
 
         try:
             if not client.set(ref, chunk, noreply=self.noreply):
-                raise
+                raise PymemcachedPutObjectError(f"Unable to write {ref} to cache")
         except BaseException as e:
-            raise
-                f"Received {e} while writing ref={ref} and obj size={len(chunk)}"
+            raise PymemcachedPutObjectError(
+                f"Received {e} while writing ref={ref} and obj size={len(chunk)}",
             )
 
         return self._create_ref(uid, create_ref_ip, len(serialized_list))
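Callers of the object store can now handle a typed, retryable failure instead of a bare raise. A sketch (the constructor arguments and put signature here are assumed from context, not verified against the deltacat API):

from deltacat.exceptions import PymemcachedPutObjectError
from deltacat.io.memcached_object_store import MemcachedObjectStore

object_store = MemcachedObjectStore()  # assumed default construction
try:
    ref = object_store.put({"some": "payload"})
except PymemcachedPutObjectError as e:
    # PymemcachedPutObjectError subclasses RetryableError, so retrying is safe.
    print(f"cache write failed, retrying may succeed: {e}")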
deltacat/storage/__init__.py CHANGED

@@ -14,6 +14,20 @@ from deltacat.storage.model.stream import Stream, StreamLocator
 from deltacat.storage.model.table import Table, TableLocator
 from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
 from deltacat.storage.model.delete_parameters import DeleteParameters
+from deltacat.storage.model.partition_spec import (
+    PartitionFilter,
+    PartitionValues,
+    DeltaPartitionSpec,
+    StreamPartitionSpec,
+)
+from deltacat.storage.model.transform import (
+    Transform,
+    TransformName,
+    TransformParameters,
+    BucketingStrategy,
+    BucketTransformParameters,
+    IdentityTransformParameters,
+)
 
 from deltacat.storage.model.types import (
     CommitState,
@@ -56,4 +70,14 @@ __all__ = [
     "TableVersionLocator",
     "SortKey",
     "SortOrder",
+    "PartitionFilter",
+    "PartitionValues",
+    "DeltaPartitionSpec",
+    "StreamPartitionSpec",
+    "Transform",
+    "TransformName",
+    "TransformParameters",
+    "BucketingStrategy",
+    "BucketTransformParameters",
+    "IdentityTransformParameters",
 ]