cledar_sdk-2.0.2-py3-none-any.whl → cledar_sdk-2.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cledar/__init__.py +1 -0
- cledar/kafka/README.md +239 -0
- cledar/kafka/__init__.py +42 -0
- cledar/kafka/clients/base.py +117 -0
- cledar/kafka/clients/consumer.py +138 -0
- cledar/kafka/clients/producer.py +97 -0
- cledar/kafka/config/schemas.py +262 -0
- cledar/kafka/exceptions.py +17 -0
- cledar/kafka/handlers/dead_letter.py +88 -0
- cledar/kafka/handlers/parser.py +83 -0
- cledar/kafka/logger.py +5 -0
- cledar/kafka/models/input.py +17 -0
- cledar/kafka/models/message.py +14 -0
- cledar/kafka/models/output.py +12 -0
- cledar/kafka/tests/.env.test.kafka +3 -0
- cledar/kafka/tests/README.md +216 -0
- cledar/kafka/tests/conftest.py +104 -0
- cledar/kafka/tests/integration/__init__.py +1 -0
- cledar/kafka/tests/integration/conftest.py +78 -0
- cledar/kafka/tests/integration/helpers.py +47 -0
- cledar/kafka/tests/integration/test_consumer_integration.py +375 -0
- cledar/kafka/tests/integration/test_integration.py +394 -0
- cledar/kafka/tests/integration/test_producer_consumer_interaction.py +388 -0
- cledar/kafka/tests/integration/test_producer_integration.py +217 -0
- cledar/kafka/tests/unit/__init__.py +1 -0
- cledar/kafka/tests/unit/test_base_kafka_client.py +391 -0
- cledar/kafka/tests/unit/test_config_validation.py +609 -0
- cledar/kafka/tests/unit/test_dead_letter_handler.py +443 -0
- cledar/kafka/tests/unit/test_error_handling.py +674 -0
- cledar/kafka/tests/unit/test_input_parser.py +310 -0
- cledar/kafka/tests/unit/test_input_parser_comprehensive.py +489 -0
- cledar/kafka/tests/unit/test_utils.py +25 -0
- cledar/kafka/tests/unit/test_utils_comprehensive.py +408 -0
- cledar/kafka/utils/callbacks.py +28 -0
- cledar/kafka/utils/messages.py +39 -0
- cledar/kafka/utils/topics.py +15 -0
- cledar/kserve/README.md +352 -0
- cledar/kserve/__init__.py +5 -0
- cledar/kserve/tests/__init__.py +0 -0
- cledar/kserve/tests/test_utils.py +64 -0
- cledar/kserve/utils.py +30 -0
- cledar/logging/README.md +53 -0
- cledar/logging/__init__.py +5 -0
- cledar/logging/tests/test_universal_plaintext_formatter.py +249 -0
- cledar/logging/universal_plaintext_formatter.py +99 -0
- cledar/monitoring/README.md +71 -0
- cledar/monitoring/__init__.py +5 -0
- cledar/monitoring/monitoring_server.py +156 -0
- cledar/monitoring/tests/integration/test_monitoring_server_int.py +162 -0
- cledar/monitoring/tests/test_monitoring_server.py +59 -0
- cledar/nonce/README.md +99 -0
- cledar/nonce/__init__.py +5 -0
- cledar/nonce/nonce_service.py +62 -0
- cledar/nonce/tests/__init__.py +0 -0
- cledar/nonce/tests/test_nonce_service.py +136 -0
- cledar/redis/README.md +536 -0
- cledar/redis/__init__.py +17 -0
- cledar/redis/async_example.py +112 -0
- cledar/redis/example.py +67 -0
- cledar/redis/exceptions.py +25 -0
- cledar/redis/logger.py +5 -0
- cledar/redis/model.py +14 -0
- cledar/redis/redis.py +764 -0
- cledar/redis/redis_config_store.py +333 -0
- cledar/redis/tests/test_async_integration_redis.py +158 -0
- cledar/redis/tests/test_async_redis_service.py +380 -0
- cledar/redis/tests/test_integration_redis.py +119 -0
- cledar/redis/tests/test_redis_service.py +319 -0
- cledar/storage/README.md +529 -0
- cledar/storage/__init__.py +6 -0
- cledar/storage/constants.py +5 -0
- cledar/storage/exceptions.py +79 -0
- cledar/storage/models.py +41 -0
- cledar/storage/object_storage.py +1274 -0
- cledar/storage/tests/conftest.py +18 -0
- cledar/storage/tests/test_abfs.py +164 -0
- cledar/storage/tests/test_integration_filesystem.py +359 -0
- cledar/storage/tests/test_integration_s3.py +453 -0
- cledar/storage/tests/test_local.py +384 -0
- cledar/storage/tests/test_s3.py +521 -0
- {cledar_sdk-2.0.2.dist-info → cledar_sdk-2.1.0.dist-info}/METADATA +1 -1
- cledar_sdk-2.1.0.dist-info/RECORD +84 -0
- cledar_sdk-2.0.2.dist-info/RECORD +0 -4
- {cledar_sdk-2.0.2.dist-info → cledar_sdk-2.1.0.dist-info}/WHEEL +0 -0
- {cledar_sdk-2.0.2.dist-info → cledar_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
cledar/storage/object_storage.py (new file)
@@ -0,0 +1,1274 @@
"""Service for interacting with S3, ABFS, and local filesystem storage."""

import io
import logging
from typing import Any, cast

import fsspec
from fsspec.exceptions import FSTimeoutError

from .constants import (
    ABFS_PATH_PREFIX,
    ABFSS_PATH_PREFIX,
    S3_PATH_PREFIX,
)
from .exceptions import (
    CheckFileExistenceError,
    CopyFileError,
    DeleteFileError,
    DownloadFileError,
    GetFileInfoError,
    GetFileSizeError,
    ListObjectsError,
    MoveFileError,
    ReadFileError,
    RequiredBucketNotFoundError,
    UploadBufferError,
    UploadFileError,
)
from .models import ObjectStorageServiceConfig, TransferPath

logger = logging.getLogger("object_storage_service")


class ObjectStorageService:
    """Service for managing object storage operations across multiple backends."""

    client: Any = None

    def __init__(self, config: ObjectStorageServiceConfig) -> None:
        """Initialize the object storage service.

        Args:
            config: Configuration for S3 and Azure storage backends.

        """
        self.client = fsspec.filesystem(
            "s3",
            key=config.s3_access_key,
            secret=config.s3_secret_key,
            client_kwargs={"endpoint_url": config.s3_endpoint_url},
            max_concurrency=config.s3_max_concurrency,
        )
        logger.info(
            "Initiated filesystem", extra={"endpoint_url": config.s3_endpoint_url}
        )
        self.local_client = fsspec.filesystem("file")

        if config.azure_account_name and config.azure_account_key:
            self.azure_client = fsspec.filesystem(
                "abfs",
                account_name=config.azure_account_name,
                account_key=config.azure_account_key,
            )
        else:
            self.azure_client = None

    @staticmethod
    def _is_s3_path(path: str | None) -> bool:
        """Check if a path is an S3 path.

        Args:
            path (str | None): Path to check.

        Returns:
            bool: True if the path starts with s3:// prefix.

        """
        if path is None:
            return False
        return path.lower().startswith(S3_PATH_PREFIX)

    @staticmethod
    def _is_abfs_path(path: str | None) -> bool:
        """Check if a path is an Azure Blob Storage path.

        Args:
            path: Path to check.

        Returns:
            bool: True if the path starts with abfs:// or abfss:// prefix.

        """
        if path is None:
            return False
        lower = path.lower()
        return lower.startswith((ABFS_PATH_PREFIX, ABFSS_PATH_PREFIX))

    def is_alive(self) -> bool:
        """Check if the storage service is accessible.

        Returns:
            bool: True if the service is accessible, False otherwise.

        """
        try:
            self.client.ls(path="")
            return True
        except (OSError, PermissionError, TimeoutError, FSTimeoutError):
            return False

    def _write_buffer_to_s3_key(
        self, buffer: io.BytesIO, bucket: str, key: str
    ) -> None:
        """Write buffer to S3 using bucket and key.

        Args:
            buffer: Buffer containing data to write.
            bucket: S3 bucket name.
            key: S3 object key.

        """
        buffer.seek(0)
        with self.client.open(
            path=f"{S3_PATH_PREFIX}{bucket}/{key}", mode="wb"
        ) as fobj:
            fobj.write(buffer.getbuffer())

    def _write_buffer_to_s3_path(
        self, buffer: io.BytesIO, destination_path: str
    ) -> None:
        """Write buffer to S3 using full path.

        Args:
            buffer: Buffer containing data to write.
            destination_path: Full S3 path (e.g., s3://bucket/key).

        """
        buffer.seek(0)
        with self.client.open(path=destination_path, mode="wb") as fobj:
            fobj.write(buffer.getbuffer())

    def _write_buffer_to_abfs_path(
        self, buffer: io.BytesIO, destination_path: str
    ) -> None:
        """Write buffer to Azure Blob Storage.

        Args:
            buffer: Buffer containing data to write.
            destination_path: Full ABFS path (e.g., abfs://container/path).

        """
        buffer.seek(0)
        with self.azure_client.open(path=destination_path, mode="wb") as fobj:
            fobj.write(buffer.getbuffer())

    def _write_buffer_to_local_path(
        self, buffer: io.BytesIO, destination_path: str
    ) -> None:
        """Write buffer to local filesystem.

        Args:
            buffer: Buffer containing data to write.
            destination_path: Local filesystem path.

        """
        buffer.seek(0)
        with self.local_client.open(path=destination_path, mode="wb") as fobj:
            fobj.write(buffer.getbuffer())

    def _read_from_s3_key(self, bucket: str, key: str) -> bytes:
        """Read file from S3 using bucket and key.

        Args:
            bucket: S3 bucket name.
            key: S3 object key.

        Returns:
            bytes: File contents as bytes.

        """
        with self.client.open(
            path=f"{S3_PATH_PREFIX}{bucket}/{key}", mode="rb"
        ) as fobj:
            data: bytes = fobj.read()
        return data

    def _read_from_s3_path(self, path: str) -> bytes:
        """Read file from S3 using full path.

        Args:
            path: Full S3 path (e.g., s3://bucket/key).

        Returns:
            bytes: File contents as bytes.

        """
        with self.client.open(path=path, mode="rb") as fobj:
            data: bytes = fobj.read()
        return data

    def _read_from_abfs_path(self, path: str) -> bytes:
        """Read file from Azure Blob Storage.

        Args:
            path: Full ABFS path (e.g., abfs://container/path).

        Returns:
            bytes: File contents as bytes.

        """
        with self.azure_client.open(path=path, mode="rb") as fobj:
            data: bytes = fobj.read()
        return data

    def _read_from_local_path(self, path: str) -> bytes:
        """Read file from local filesystem.

        Args:
            path: Local filesystem path.

        Returns:
            bytes: File contents as bytes.

        """
        with self.local_client.open(path=path, mode="rb") as fobj:
            data: bytes = fobj.read()
        return data

    def _put_file(self, fs: Any, lpath: str, rpath: str) -> None:
        """Upload local file to remote storage.

        Args:
            fs: Filesystem client.
            lpath: Local file path.
            rpath: Remote file path.

        """
        fs.put(lpath=lpath, rpath=rpath)

    def _get_file(self, fs: Any, src: str, dst: str) -> None:
        """Download remote file to local storage.

        Args:
            fs: Filesystem client.
            src: Remote source path.
            dst: Local destination path.

        """
        fs.get(src, dst)

    def _list_via_find_or_ls(self, fs: Any, path: str, recursive: bool) -> list[str]:
        """List files using find (recursive) or ls (non-recursive).

        Args:
            fs: Filesystem client.
            path: Path to list.
            recursive: Whether to list recursively.

        Returns:
            list[str]: List of file paths.

        """
        if recursive:
            return cast(list[str], fs.find(path))
        return cast(list[str], fs.ls(path, detail=False))

    def _normalize_s3_keys(self, bucket: str, objects: list[str]) -> list[str]:
        """Normalize S3 object paths to keys by removing bucket prefix.

        Args:
            bucket: S3 bucket name.
            objects: List of full S3 paths.

        Returns:
            list[str]: List of normalized keys without bucket prefix.

        """
        keys: list[str] = []
        for obj in objects:
            if obj.startswith(f"{S3_PATH_PREFIX}{bucket}/"):
                keys.append(obj.replace(f"{S3_PATH_PREFIX}{bucket}/", ""))
            elif obj.startswith(f"{bucket}/"):
                keys.append(obj.replace(f"{bucket}/", ""))
            else:
                keys.append(obj)
        return keys

    def _size_from_info(self, info: dict[str, Any]) -> int:
        """Extract file size from file info dictionary.

        Args:
            info: File info dictionary.

        Returns:
            int: File size in bytes.

        """
        return int(info.get("size", 0))

    def _copy_with_backend(self, backend: str, src: str, dst: str) -> None:
        """Copy file using the appropriate backend.

        Args:
            backend: Backend type (s3, abfs, local, or mixed).
            src: Source path.
            dst: Destination path.

        """
        if backend == "s3":
            self.client.copy(src, dst)
            return
        if backend == "abfs":
            self.azure_client.copy(src, dst)
            return
        if backend == "local":
            self.local_client.copy(src, dst)
            return
        with (
            fsspec.open(src, mode="rb") as src_f,
            fsspec.open(dst, mode="wb") as dst_f,
        ):
            dst_f.write(src_f.read())

    def _move_with_backend(self, backend: str, src: str, dst: str) -> None:
        """Move file using the appropriate backend.

        Args:
            backend: Backend type (s3, abfs, local, or mixed).
            src: Source path.
            dst: Destination path.

        """
        if backend == "s3":
            self.client.move(src, dst)
            return
        if backend == "abfs":
            self.azure_client.move(src, dst)
            return
        if backend == "local":
            self.local_client.move(src, dst)
            return
        with (
            fsspec.open(src, mode="rb") as src_f,
            fsspec.open(dst, mode="wb") as dst_f,
        ):
            dst_f.write(src_f.read())
        if self._is_s3_path(src):
            self.client.rm(src)
        elif self._is_abfs_path(src):
            self.azure_client.rm(src)
        else:
            self.local_client.rm(src)

    def _get_fs_for_backend(self, backend: str) -> Any:
        """Get filesystem client for the specified backend.

        Args:
            backend: Backend type (s3, abfs, or local).

        Returns:
            Any: Filesystem client for the backend.

        """
        if backend == "s3":
            return self.client
        if backend == "abfs":
            return self.azure_client
        return self.local_client

    def _resolve_source_backend_and_path(
        self, bucket: str | None, key: str | None, path: str | None
    ) -> TransferPath:
        """Resolve source backend and path from various input formats.

        Args:
            bucket: S3 bucket name.
            key: S3 object key.
            path: Full path (can be S3, ABFS, or local).

        Returns:
            TransferPath: Transfer path with backend type and resolved path.

        Raises:
            ValueError: If neither path nor bucket+key are provided.

        """
        if bucket and key:
            return TransferPath(backend="s3", path=f"{S3_PATH_PREFIX}{bucket}/{key}")
        if path and self._is_s3_path(path):
            return TransferPath(backend="s3", path=path)
        if path and self._is_abfs_path(path):
            return TransferPath(backend="abfs", path=path)
        if path:
            return TransferPath(backend="local", path=path)
        raise ValueError("Either path or bucket and key must be provided")

    def _resolve_dest_backend_and_path(
        self, bucket: str | None, key: str | None, destination_path: str | None
    ) -> TransferPath:
        """Resolve destination backend and path from various input formats.

        Args:
            bucket: S3 bucket name.
            key: S3 object key.
            destination_path: Full destination path (can be S3, ABFS, or local).

        Returns:
            TransferPath: Transfer path with backend type and resolved path.

        Raises:
            ValueError: If neither destination_path nor bucket+key are provided.

        """
        if bucket and key:
            return TransferPath(backend="s3", path=f"{S3_PATH_PREFIX}{bucket}/{key}")
        if destination_path and self._is_s3_path(destination_path):
            return TransferPath(backend="s3", path=destination_path)
        if destination_path and self._is_abfs_path(destination_path):
            return TransferPath(backend="abfs", path=destination_path)
        if destination_path:
            return TransferPath(backend="local", path=destination_path)
        raise ValueError("Either destination_path or bucket and key must be provided")

    def _resolve_path_backend(self, path: str | None) -> TransferPath:
        """Resolve backend type from path.

        Args:
            path: Full path (can be S3, ABFS, or local).

        Returns:
            Transfer path with backend type and path.

        Raises:
            ValueError: If path is not provided.

        """
        if path and self._is_s3_path(path):
            return TransferPath(backend="s3", path=path)
        if path and self._is_abfs_path(path):
            return TransferPath(backend="abfs", path=path)
        if path:
            return TransferPath(backend="local", path=path)
        raise ValueError("Path must be provided")

    def _read_from_backend_path(self, backend: str, src_path: str) -> bytes:
        """Read file from the specified backend.

        Args:
            backend: Backend type (s3, abfs, or local).
            src_path: Source path to read from.

        Returns:
            bytes: File contents as bytes.

        """
        if backend == "s3":
            return self._read_from_s3_path(src_path)
        if backend == "abfs":
            return self._read_from_abfs_path(src_path)
        return self._read_from_local_path(src_path)

    def has_bucket(self, bucket: str, throw: bool = False) -> bool:
        """Check if an S3 bucket exists and is accessible.

        Args:
            bucket: S3 bucket name.
            throw: Whether to raise an exception if bucket is not found.

        Returns:
            bool: True if bucket exists and is accessible, False otherwise.

        Raises:
            RequiredBucketNotFoundError: If throw=True and bucket is not found.

        """
        try:
            self.client.ls(path=bucket)
            return True
        except (
            FileNotFoundError,
            PermissionError,
            OSError,
            TimeoutError,
            FSTimeoutError,
        ) as exception:
            if throw:
                logger.exception("Bucket not found", extra={"bucket": bucket})
                raise RequiredBucketNotFoundError from exception
            return False

    def upload_buffer(
        self,
        buffer: io.BytesIO,
        bucket: str | None = None,
        key: str | None = None,
        destination_path: str | None = None,
    ) -> None:
        """Upload a buffer to storage.

        Args:
            buffer: Buffer containing data to upload.
            bucket: S3 bucket name (for S3 destination).
            key: S3 object key (for S3 destination).
            destination_path: Full destination path (can be S3, ABFS, or local).

        Raises:
            UploadBufferError: If upload fails.
            ValueError: If neither destination_path nor bucket+key are provided.

        """
        try:
            if bucket and key:
                self._write_buffer_to_s3_key(buffer=buffer, bucket=bucket, key=key)
                logger.debug(
                    "Uploaded file from buffer", extra={"bucket": bucket, "key": key}
                )
            elif destination_path and self._is_s3_path(destination_path):
                logger.debug(
                    "Uploading file from buffer to S3 via path",
                    extra={"destination_path": destination_path},
                )
                self._write_buffer_to_s3_path(
                    buffer=buffer, destination_path=destination_path
                )
            elif destination_path and self._is_abfs_path(destination_path):
                logger.debug(
                    "Uploading file from buffer to ABFS via path",
                    extra={"destination_path": destination_path},
                )
                self._write_buffer_to_abfs_path(
                    buffer=buffer, destination_path=destination_path
                )
            elif destination_path:
                logger.debug(
                    "Uploading file from buffer to local filesystem",
                    extra={"destination_path": destination_path},
                )
                self._write_buffer_to_local_path(
                    buffer=buffer, destination_path=destination_path
                )
            else:
                raise ValueError(
                    "Either destination_path or bucket and key must be provided"
                )
        except (OSError, PermissionError, TimeoutError, FSTimeoutError) as exception:
            logger.exception(
                "Failed to upload buffer",
                extra={
                    "bucket": bucket,
                    "key": key,
                    "destination_path": destination_path,
                },
            )
            raise UploadBufferError(
                f"Failed to upload buffer (bucket={bucket}, key={key}, "
                f"destination_path={destination_path})"
            ) from exception

    def read_file(
        self,
        bucket: str | None = None,
        key: str | None = None,
        path: str | None = None,
        max_tries: int = 3,
    ) -> bytes:
        """Read file from storage.

        Args:
            bucket: S3 bucket name (for S3 source).
            key: S3 object key (for S3 source).
            path: Full source path (can be S3, ABFS, or local).
            max_tries: Number of retry attempts on failure.

        Returns:
            bytes: File contents as bytes.

        Raises:
            ReadFileError: If read fails after all retries.
            NotImplementedError: If this should never be reached.

        """
        transfer_path: TransferPath = self._resolve_source_backend_and_path(
            bucket=bucket, key=key, path=path
        )
        backend_name: str = transfer_path.backend
        src_path: str = transfer_path.path
        for attempt in range(max_tries):
            try:
                logger.debug(
                    "Reading file",
                    extra={"backend": backend_name, "source": src_path},
                )
                content = self._read_from_backend_path(backend_name, src_path)
                logger.debug(
                    "File read",
                    extra={"backend": backend_name, "source": src_path},
                )
                return content
            except OSError as exception:
                if attempt == max_tries - 1:
                    logger.exception(
                        "Failed to read file after %d retries",
                        max_tries,
                        extra={"bucket": bucket, "key": key, "path": path},
                    )
                    raise ReadFileError(
                        f"Failed to read file after {max_tries} retries "
                        f"(bucket={bucket}, key={key}, path={path})"
                    ) from exception
                logger.warning(
                    "Failed to read file, retrying...",
                    extra={"attempt": attempt + 1},
                )
        raise NotImplementedError("This should never be reached")

    def upload_file(
        self,
        file_path: str,
        bucket: str | None = None,
        key: str | None = None,
        destination_path: str | None = None,
    ) -> None:
        """Upload a local file to storage.

        Args:
            file_path: Local file path to upload.
            bucket: S3 bucket name (for S3 destination).
            key: S3 object key (for S3 destination).
            destination_path: Full destination path (can be S3, ABFS, or local).

        Raises:
            UploadFileError: If upload fails.

        """
        try:
            transfer_path: TransferPath = self._resolve_dest_backend_and_path(
                bucket=bucket, key=key, destination_path=destination_path
            )
            backend_name: str = transfer_path.backend
            dst_path: str = transfer_path.path
            logger.debug(
                "Uploading file",
                extra={
                    "backend": backend_name,
                    "destination": dst_path,
                    "file": file_path,
                },
            )
            fs = self._get_fs_for_backend(backend_name)
            self._put_file(fs, lpath=file_path, rpath=dst_path)
            logger.debug(
                "Uploaded file",
                extra={
                    "backend": backend_name,
                    "destination": dst_path,
                    "file": file_path,
                },
            )
        except (OSError, PermissionError, TimeoutError, FSTimeoutError) as exception:
            logger.exception(
                "Failed to upload file",
                extra={
                    "bucket": bucket,
                    "key": key,
                    "destination_path": destination_path,
                    "file_path": file_path,
                },
            )
            raise UploadFileError(
                f"Failed to upload file {file_path} "
                f"(bucket={bucket}, key={key}, destination_path={destination_path})"
            ) from exception

    def list_objects(
        self,
        bucket: str | None = None,
        prefix: str = "",
        path: str | None = None,
        recursive: bool = True,
    ) -> list[str]:
        """List objects in storage with optional prefix filtering.

        Args:
            bucket: The bucket name (for S3)
            prefix: Optional prefix to filter objects (for S3)
            path: The filesystem path. Uses S3 if starts with s3://, otherwise local
            recursive: If True, list all objects recursively

        Returns:
            list[str]: List of object keys/paths

        Raises:
            ListObjectsError: If listing objects fails.
            ValueError: If neither path nor bucket are provided.

        """
        try:
            if path:
                transfer_path: TransferPath = self._resolve_path_backend(path)
                backend_name: str = transfer_path.backend
                resolved_path: str = transfer_path.path
                logger.debug(
                    "Listing objects",
                    extra={
                        "backend": backend_name,
                        "path": resolved_path,
                        "recursive": recursive,
                    },
                )
                fs = self._get_fs_for_backend(backend_name)
                objects = self._list_via_find_or_ls(fs, resolved_path, recursive)
                logger.debug(
                    "Listed objects",
                    extra={
                        "backend": backend_name,
                        "path": resolved_path,
                        "count": len(objects),
                    },
                )
                return objects
            if bucket:
                s3_path = (
                    f"{S3_PATH_PREFIX}{bucket}/{prefix}"
                    if prefix
                    else f"{S3_PATH_PREFIX}{bucket}/"
                )
                logger.debug(
                    "Listing objects from S3",
                    extra={"bucket": bucket, "prefix": prefix, "recursive": recursive},
                )
                objects = self._list_via_find_or_ls(self.client, s3_path, recursive)
                keys = self._normalize_s3_keys(bucket, objects)
                logger.debug(
                    "Listed objects from S3",
                    extra={
                        "bucket": bucket,
                        "prefix": prefix,
                        "count": len(keys),
                    },
                )
                return keys
            raise ValueError("Either path or bucket must be provided")
        except (
            FileNotFoundError,
            PermissionError,
            OSError,
            TimeoutError,
            FSTimeoutError,
        ) as exception:
            logger.exception(
                "Failed to list objects",
                extra={"bucket": bucket, "prefix": prefix, "path": path},
            )
            raise ListObjectsError(
                f"Failed to list objects (bucket={bucket}, prefix={prefix}, "
                f"path={path})"
            ) from exception

    def delete_file(
        self, bucket: str | None = None, key: str | None = None, path: str | None = None
    ) -> None:
        """Delete a single object from storage.

        Args:
            bucket: The bucket name (for S3)
            key: The object key to delete (for S3)
            path: The filesystem path. Uses S3 if starts with s3://, otherwise local

        Raises:
            DeleteFileError: If deleting the file fails.
            ValueError: If neither path nor bucket+key are provided.

        """
        try:
            if bucket and key:
                s3_path = f"{S3_PATH_PREFIX}{bucket}/{key}"
                logger.debug(
                    "Deleting file from S3", extra={"bucket": bucket, "key": key}
                )
                self.client.rm(s3_path)
                logger.debug(
                    "Deleted file from S3", extra={"bucket": bucket, "key": key}
                )
            elif path and self._is_s3_path(path):
                logger.debug("Deleting file from S3 via path", extra={"path": path})
                self.client.rm(path)
                logger.debug("Deleted file from S3 via path", extra={"path": path})
            elif path and self._is_abfs_path(path):
                logger.debug("Deleting file from ABFS via path", extra={"path": path})
                self.azure_client.rm(path)
                logger.debug("Deleted file from ABFS via path", extra={"path": path})
            elif path:
                logger.debug(
                    "Deleting file from local filesystem", extra={"path": path}
                )
                self.local_client.rm(path)
                logger.debug("Deleted file from local filesystem", extra={"path": path})
            else:
                raise ValueError("Either path or bucket and key must be provided")
        except (
            FileNotFoundError,
            PermissionError,
            OSError,
            TimeoutError,
            FSTimeoutError,
        ) as exception:
            logger.exception(
                "Failed to delete file",
                extra={"bucket": bucket, "key": key, "path": path},
            )
            raise DeleteFileError(
                f"Failed to delete file (bucket={bucket}, key={key}, path={path})"
            ) from exception

    def file_exists(
        self, bucket: str | None = None, key: str | None = None, path: str | None = None
    ) -> bool:
        """Check if a specific file exists in storage.

        Args:
            bucket: The bucket name (for S3)
            key: The object key to check (for S3)
            path: The filesystem path. Uses S3 if starts with s3://, otherwise local

        Returns:
            bool: True if the file exists, False otherwise

        Raises:
            CheckFileExistenceError: If checking file existence fails.
            ValueError: If neither path nor bucket+key are provided.

        """
        try:
            if bucket and key:
                s3_path = f"{S3_PATH_PREFIX}{bucket}/{key}"
                exists = self.client.exists(s3_path)
                logger.debug(
                    "Checked file existence in S3",
                    extra={"bucket": bucket, "key": key, "exists": exists},
                )
                return bool(exists)
            if path and self._is_s3_path(path):
                exists = self.client.exists(path)
                logger.debug(
                    "Checked file existence in S3 via path",
                    extra={"path": path, "exists": exists},
                )
                return bool(exists)
            if path and self._is_abfs_path(path):
                exists = self.azure_client.exists(path)
                logger.debug(
                    "Checked file existence in ABFS via path",
                    extra={"path": path, "exists": exists},
                )
                return bool(exists)
            if path:
                exists = self.local_client.exists(path)
                logger.debug(
                    "Checked file existence in local filesystem",
                    extra={"path": path, "exists": exists},
                )
                return bool(exists)
            raise ValueError("Either path or bucket and key must be provided")
        except (OSError, PermissionError, TimeoutError, FSTimeoutError) as exception:
            logger.exception(
                "Failed to check file existence",
                extra={"bucket": bucket, "key": key, "path": path},
            )
            raise CheckFileExistenceError(
                f"Failed to check file existence (bucket={bucket}, key={key}, "
                f"path={path})"
            ) from exception

    def download_file(
        self,
        dest_path: str,
        bucket: str | None = None,
        key: str | None = None,
        source_path: str | None = None,
        max_tries: int = 3,
    ) -> None:
        """Download a file from storage to local filesystem.

        Args:
            dest_path: The destination local path where the file should be saved
            bucket: The bucket name (for S3)
            key: The object key to download (for S3)
            source_path: The source path. Uses S3 if starts with s3://, otherwise local
            max_tries: Number of retry attempts on failure

        Raises:
            DownloadFileError: If download fails after all retries.

        """
        transfer_path: TransferPath = self._resolve_source_backend_and_path(
            bucket=bucket, key=key, path=source_path
        )
        backend_name: str = transfer_path.backend
        src_path: str = transfer_path.path
        for attempt in range(max_tries):
            try:
                logger.debug(
                    "Downloading file",
                    extra={
                        "backend": backend_name,
                        "source": src_path,
                        "dest_path": dest_path,
                    },
                )
                fs = self._get_fs_for_backend(backend_name)
                self._get_file(fs, src_path, dest_path)
                logger.debug(
                    "Downloaded file",
                    extra={
                        "backend": backend_name,
                        "source": src_path,
                        "dest_path": dest_path,
                    },
                )
                return
            except OSError as exception:
                if attempt == max_tries - 1:
                    logger.exception(
                        "Failed to download file after %d retries",
                        max_tries,
                        extra={
                            "bucket": bucket,
                            "key": key,
                            "source_path": source_path,
                            "dest_path": dest_path,
                        },
                    )
                    raise DownloadFileError(
                        f"Failed to download file after {max_tries} retries "
                        f"(bucket={bucket}, key={key}, source_path={source_path}, "
                        f"dest_path={dest_path})"
                    ) from exception
                logger.warning(
                    "Failed to download file, retrying...",
                    extra={"attempt": attempt + 1},
                )

    def get_file_size(
        self, bucket: str | None = None, key: str | None = None, path: str | None = None
    ) -> int:
        """Get the size of a file without downloading it.

        Args:
            bucket: The bucket name (for S3)
            key: The object key (for S3)
            path: The filesystem path. Uses S3 if starts with s3://, otherwise local

        Returns:
            int: File size in bytes

        Raises:
            GetFileSizeError: If getting file size fails.
            ValueError: If neither path nor bucket+key are provided.

        """
        try:
            if bucket and key:
                s3_path = f"s3://{bucket}/{key}"
                logger.debug(
                    "Getting file size from S3", extra={"bucket": bucket, "key": key}
                )
                info = cast(dict[str, Any], self.client.info(s3_path))
                size = self._size_from_info(info)
                logger.debug(
                    "Got file size from S3",
                    extra={"bucket": bucket, "key": key, "size": size},
                )
                return size
            if path and self._is_s3_path(path):
                logger.debug("Getting file size from S3 via path", extra={"path": path})
                info = cast(dict[str, Any], self.client.info(path))
                size = self._size_from_info(info)
                logger.debug(
                    "Got file size from S3 via path",
                    extra={"path": path, "size": size},
                )
                return size
            if path and self._is_abfs_path(path):
                logger.debug(
                    "Getting file size from ABFS via path", extra={"path": path}
                )
                info = self.azure_client.info(path)
                size = self._size_from_info(info)
                logger.debug(
                    "Got file size from ABFS via path",
                    extra={"path": path, "size": size},
                )
                return size
            if path:
                logger.debug(
                    "Getting file size from local filesystem", extra={"path": path}
                )
                info = cast(dict[str, Any], self.local_client.info(path))
                size = self._size_from_info(info)
                logger.debug(
                    "Got file size from local filesystem",
                    extra={"path": path, "size": size},
                )
                return size

            raise ValueError("Either path or bucket and key must be provided")
        except (
            FileNotFoundError,
            PermissionError,
            OSError,
            TimeoutError,
            FSTimeoutError,
        ) as exception:
            logger.exception(
                "Failed to get file size",
                extra={"bucket": bucket, "key": key, "path": path},
            )
            raise GetFileSizeError(
                f"Failed to get file size (bucket={bucket}, key={key}, path={path})"
            ) from exception

    def get_file_info(
        self, bucket: str | None = None, key: str | None = None, path: str | None = None
    ) -> dict[str, Any]:
        """Get metadata information about a file.

        Args:
            bucket: The bucket name (for S3)
            key: The object key (for S3)
            path: The filesystem path. Uses S3 if starts with s3://, otherwise local

        Returns:
            dict[str, Any]: Dictionary containing file metadata (size, modified
                time, etc.)

        Raises:
            GetFileInfoError: If getting file info fails.
            ValueError: If neither path nor bucket+key are provided.

        """
        try:
            if bucket and key:
                s3_path = f"{S3_PATH_PREFIX}{bucket}/{key}"
                logger.debug(
                    "Getting file info from S3", extra={"bucket": bucket, "key": key}
                )
                info = cast(dict[str, Any], self.client.info(s3_path))
                logger.debug(
                    "Got file info from S3",
                    extra={"bucket": bucket, "key": key},
                )
                return info
            if path and self._is_s3_path(path):
                logger.debug("Getting file info from S3 via path", extra={"path": path})
                info = cast(dict[str, Any], self.client.info(path))
                logger.debug(
                    "Got file info from S3 via path",
                    extra={"path": path},
                )
                return info
            if path and self._is_abfs_path(path):
                logger.debug(
                    "Getting file info from ABFS via path", extra={"path": path}
                )
                info = cast(dict[str, Any], self.azure_client.info(path))
                logger.debug(
                    "Got file info from ABFS via path",
                    extra={"path": path},
                )
                return info
            if path:
                logger.debug(
                    "Getting file info from local filesystem", extra={"path": path}
                )
                info = cast(dict[str, Any], self.local_client.info(path))
                logger.debug(
                    "Got file info from local filesystem",
                    extra={"path": path},
                )
                return info

            raise ValueError("Either path or bucket and key must be provided")
        except (
            FileNotFoundError,
            PermissionError,
            OSError,
            TimeoutError,
            FSTimeoutError,
        ) as exception:
            logger.exception(
                "Failed to get file info",
                extra={"bucket": bucket, "key": key, "path": path},
            )
            raise GetFileInfoError(
                f"Failed to get file info (bucket={bucket}, key={key}, path={path})"
            ) from exception

    def _resolve_transfer_paths(
        self,
        source_bucket: str | None,
        source_key: str | None,
        source_path: str | None,
        dest_bucket: str | None,
        dest_key: str | None,
        dest_path: str | None,
    ) -> tuple[str, str, str]:
        """Resolve source and destination paths for copy/move operations.

        Args:
            source_bucket: Source S3 bucket name.
            source_key: Source S3 object key.
            source_path: Full source path (can be S3, ABFS, or local).
            dest_bucket: Destination S3 bucket name.
            dest_key: Destination S3 object key.
            dest_path: Full destination path (can be S3, ABFS, or local).

        Returns:
            tuple[str, str, str]: Tuple of (source_path, destination_path,
                backend_type).

        Raises:
            ValueError: If source or destination parameters are missing.

        """
        src_is_s3 = False
        src_is_abfs = False
        if source_bucket and source_key:
            src: str = f"{S3_PATH_PREFIX}{source_bucket}/{source_key}"
            src_is_s3 = True
        elif self._is_s3_path(source_path):
            src = cast(str, source_path)
            src_is_s3 = True
        elif self._is_abfs_path(source_path):
            src = cast(str, source_path)
            src_is_abfs = True
        elif source_path:
            src = source_path
        else:
            raise ValueError(
                "Either source_path or source_bucket and source_key must be provided"
            )

        dst_is_s3 = False
        dst_is_abfs = False
        if dest_bucket and dest_key:
            dst: str = f"{S3_PATH_PREFIX}{dest_bucket}/{dest_key}"
            dst_is_s3 = True
        elif self._is_s3_path(dest_path):
            dst = cast(str, dest_path)
            dst_is_s3 = True
        elif self._is_abfs_path(dest_path):
            dst = cast(str, dest_path)
            dst_is_abfs = True
        elif dest_path:
            dst = dest_path
        else:
            raise ValueError(
                "Either dest_path or dest_bucket and dest_key must be provided"
            )

        if (src_is_s3 or dst_is_s3) and not (src_is_abfs or dst_is_abfs):
            backend = "s3"
        elif (src_is_abfs or dst_is_abfs) and not (src_is_s3 or dst_is_s3):
            backend = "abfs"
        elif (src_is_s3 or dst_is_s3) and (src_is_abfs or dst_is_abfs):
            backend = "mixed"
        else:
            backend = "local"

        return src, dst, backend

    def copy_file(
        self,
        source_bucket: str | None = None,
        source_key: str | None = None,
        source_path: str | None = None,
        dest_bucket: str | None = None,
        dest_key: str | None = None,
        dest_path: str | None = None,
    ) -> None:
        """Copy a file from one location to another.

        Args:
            source_bucket: Source bucket name (for S3 source)
            source_key: Source object key (for S3 source)
            source_path: Source path. Uses S3 if starts with s3://, otherwise local
            dest_bucket: Destination bucket name (for S3 destination)
            dest_key: Destination object key (for S3 destination)
            dest_path: Destination path. Uses S3 if starts with s3://, otherwise local

        Raises:
            CopyFileError: If copying the file fails.

        """
        try:
            src, dst, backend = self._resolve_transfer_paths(
                source_bucket=source_bucket,
                source_key=source_key,
                source_path=source_path,
                dest_bucket=dest_bucket,
                dest_key=dest_key,
                dest_path=dest_path,
            )

            logger.debug("Copying file", extra={"source": src, "destination": dst})
            self._copy_with_backend(backend=backend, src=src, dst=dst)

            logger.debug("Copied file", extra={"source": src, "destination": dst})
        except (
            FileNotFoundError,
            PermissionError,
            OSError,
            TimeoutError,
            FSTimeoutError,
        ) as exception:
            logger.exception(
                "Failed to copy file",
                extra={"source": src, "destination": dst},
            )
            raise CopyFileError(
                f"Failed to copy file (source={src}, destination={dst})"
            ) from exception

    def move_file(
        self,
        source_bucket: str | None = None,
        source_key: str | None = None,
        source_path: str | None = None,
        dest_bucket: str | None = None,
        dest_key: str | None = None,
        dest_path: str | None = None,
    ) -> None:
        """Move/rename a file from one location to another.

        Args:
            source_bucket: Source bucket name (for S3 source)
            source_key: Source object key (for S3 source)
            source_path: Source path. Uses S3 if starts with s3://, otherwise local
            dest_bucket: Destination bucket name (for S3 destination)
            dest_key: Destination object key (for S3 destination)
            dest_path: Destination path. Uses S3 if starts with s3://, otherwise local

        Raises:
            MoveFileError: If moving the file fails.

        """
        try:
            src, dst, backend = self._resolve_transfer_paths(
                source_bucket=source_bucket,
                source_key=source_key,
                source_path=source_path,
                dest_bucket=dest_bucket,
                dest_key=dest_key,
                dest_path=dest_path,
            )

            logger.debug("Moving file", extra={"source": src, "destination": dst})
            self._move_with_backend(backend=backend, src=src, dst=dst)

            logger.debug("Moved file", extra={"source": src, "destination": dst})
        except (
            FileNotFoundError,
            PermissionError,
            OSError,
            TimeoutError,
            FSTimeoutError,
        ) as exception:
            logger.exception(
                "Failed to move file",
                extra={"source": src, "destination": dst},
            )
            raise MoveFileError(
                f"Failed to move file (source={src}, destination={dst})"
            ) from exception
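For orientation, a minimal usage sketch based only on the constructor and method signatures visible in the diff above. The endpoint, credentials, bucket name, and the assumption that `ObjectStorageServiceConfig` (defined in `cledar/storage/models.py`, not shown here) accepts these fields as keyword arguments are illustrative, not taken from the package.

```python
import io

from cledar.storage.models import ObjectStorageServiceConfig
from cledar.storage.object_storage import ObjectStorageService

# Hypothetical config values; the real field defaults live in cledar/storage/models.py.
config = ObjectStorageServiceConfig(
    s3_access_key="example-access-key",
    s3_secret_key="example-secret-key",
    s3_endpoint_url="http://localhost:9000",
    s3_max_concurrency=4,
    azure_account_name=None,  # with no Azure credentials, azure_client stays None
    azure_account_key=None,
)
service = ObjectStorageService(config)

# Upload an in-memory buffer, read it back, and check existence,
# using the bucket/key form and the s3:// path form shown in the docstrings.
service.upload_buffer(io.BytesIO(b"hello"), bucket="example-bucket", key="demo/hello.txt")
data = service.read_file(bucket="example-bucket", key="demo/hello.txt")
assert data == b"hello"
assert service.file_exists(path="s3://example-bucket/demo/hello.txt")
```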