tracdap-runtime 0.5.29__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracdap/rt/_exec/dev_mode.py +2 -1
- tracdap/rt/_impl/data.py +1 -28
- tracdap/rt/_impl/static_api.py +5 -1
- tracdap/rt/_impl/storage.py +586 -10
- tracdap/rt/_impl/util.py +24 -3
- tracdap/rt/_plugins/_helpers.py +26 -25
- tracdap/rt/_plugins/storage_aws.py +162 -76
- tracdap/rt/_plugins/storage_azure.py +155 -0
- tracdap/rt/_plugins/storage_gcp.py +183 -0
- tracdap/rt/_plugins/storage_local.py +249 -98
- tracdap/rt/_version.py +1 -1
- tracdap/rt/api/static_api.py +2 -1
- tracdap/rt/config/__init__.py +8 -13
- tracdap/rt/config/common.py +10 -0
- tracdap/rt/config/common_pb2.py +38 -31
- tracdap/rt/config/job_pb2.py +21 -20
- tracdap/rt/config/platform.py +60 -25
- tracdap/rt/config/platform_pb2.py +52 -45
- tracdap/rt/config/result_pb2.py +15 -14
- tracdap/rt/config/runtime.py +0 -1
- tracdap/rt/config/runtime_pb2.py +24 -24
- tracdap/rt/exceptions.py +9 -0
- tracdap/rt/ext/plugins.py +0 -12
- tracdap/rt/ext/storage.py +47 -29
- tracdap/rt/metadata/__init__.py +19 -19
- tracdap/rt/metadata/common_pb2.py +15 -14
- tracdap/rt/metadata/custom_pb2.py +9 -8
- tracdap/rt/metadata/data_pb2.py +31 -30
- tracdap/rt/metadata/file_pb2.py +9 -8
- tracdap/rt/metadata/flow_pb2.py +33 -32
- tracdap/rt/metadata/job_pb2.py +55 -54
- tracdap/rt/metadata/model_pb2.py +31 -30
- tracdap/rt/metadata/object_id_pb2.py +13 -12
- tracdap/rt/metadata/object_pb2.py +9 -8
- tracdap/rt/metadata/search_pb2.py +19 -18
- tracdap/rt/metadata/stoarge_pb2.py +31 -30
- tracdap/rt/metadata/tag_pb2.py +13 -12
- tracdap/rt/metadata/tag_update_pb2.py +11 -10
- tracdap/rt/metadata/type_pb2.py +29 -28
- {tracdap_runtime-0.5.29.dist-info → tracdap_runtime-0.6.0.dist-info}/METADATA +27 -15
- {tracdap_runtime-0.5.29.dist-info → tracdap_runtime-0.6.0.dist-info}/RECORD +44 -44
- {tracdap_runtime-0.5.29.dist-info → tracdap_runtime-0.6.0.dist-info}/WHEEL +1 -1
- tracdap/rt/config/gateway.py +0 -104
- tracdap/rt/config/gateway_pb2.py +0 -45
- {tracdap_runtime-0.5.29.dist-info → tracdap_runtime-0.6.0.dist-info}/LICENSE +0 -0
- {tracdap_runtime-0.5.29.dist-info → tracdap_runtime-0.6.0.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/storage.py
CHANGED
```diff
@@ -12,11 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime as dt
+import enum
 import pathlib
 import re
+import sys
 import typing as tp
+import traceback as tb
 
 import pyarrow as pa
+import pyarrow.fs as pa_fs
+import pyarrow.lib as pa_lib
 
 import tracdap.rt.metadata as _meta
 import tracdap.rt.config as _cfg
@@ -26,7 +32,7 @@ import tracdap.rt._impl.data as _data
 import tracdap.rt._impl.util as _util
 
 # Import storage interfaces
-from tracdap.rt.ext.storage import
+from tracdap.rt.ext.storage import *
 
 
 class FormatManager:
@@ -69,10 +75,8 @@ class StorageManager:
         self.__data_storage: tp.Dict[str, IDataStorage] = dict()
         self.__settings = sys_config.storage
 
-        storage_options = dict()
-
         for storage_key, storage_config in sys_config.storage.buckets.items():
-            self.create_storage(storage_key, storage_config
+            self.create_storage(storage_key, storage_config)
 
     def default_storage_key(self):
         return self.__settings.defaultBucket
@@ -80,7 +84,44 @@
     def default_storage_format(self):
         return self.__settings.defaultFormat
 
-    def create_storage(self, storage_key: str, storage_config: _cfg.PluginConfig
+    def create_storage(self, storage_key: str, storage_config: _cfg.PluginConfig):
+
+        if plugins.PluginManager.is_plugin_available(IStorageProvider, storage_config.protocol):
+            self._create_storage_from_provider(storage_key, storage_config)
+        else:
+            self._create_storage_from_impl(storage_key, storage_config)
+
+    def _create_storage_from_provider(self, storage_key: str, storage_config: _cfg.PluginConfig):
+
+        provider = plugins.PluginManager.load_plugin(IStorageProvider, storage_config)
+
+        if provider.has_file_storage():
+            file_storage = provider.get_file_storage()
+        elif provider.has_arrow_native():
+            fs = provider.get_arrow_native()
+            file_storage = CommonFileStorage(storage_key, storage_config, fs)
+        else:
+            file_storage = None
+
+        if provider.has_data_storage():
+            data_storage = provider.get_data_storage()
+        elif file_storage is not None:
+            data_storage = CommonDataStorage(storage_config, file_storage)
+        else:
+            data_storage = None
+
+        if file_storage is None and data_storage is None:
+            err = f"Storage type [{storage_config.protocol}] is not available"
+            self.__log.error(err)
+            raise _ex.EStorageConfig(err)
+
+        if file_storage is not None:
+            self.__file_storage[storage_key] = file_storage
+
+        if data_storage is not None:
+            self.__data_storage[storage_key] = data_storage
+
+    def _create_storage_from_impl(self, storage_key: str, storage_config: _cfg.PluginConfig):
 
         if storage_config is None:
             err = f"Missing config for storage key [{storage_key}]"
```
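The dispatch above means a storage backend now only has to expose one of three capabilities: a ready-made `IFileStorage`, an Arrow-native filesystem, or a ready-made `IDataStorage`; the common wrappers fill in whatever is missing. A minimal sketch of a provider taking the Arrow-native route, where the capability method names come from the diff but the class itself and its registration are hypothetical:

```python
import pyarrow.fs as pa_fs

class LocalDiskProvider:  # hypothetical IStorageProvider implementation

    def __init__(self, root_path: str):
        self._root_path = root_path

    def has_arrow_native(self) -> bool:
        return True

    def get_arrow_native(self) -> pa_fs.SubTreeFileSystem:
        # CommonFileStorage expects a filesystem rooted at the storage root
        return pa_fs.SubTreeFileSystem(self._root_path, pa_fs.LocalFileSystem())

    def has_file_storage(self) -> bool:
        return False  # let CommonFileStorage wrap the Arrow filesystem

    def has_data_storage(self) -> bool:
        return False  # let CommonDataStorage be layered on the file storage
```

With a provider like this, `_create_storage_from_provider()` takes the `has_arrow_native()` branch, wraps the filesystem in `CommonFileStorage` and layers `CommonDataStorage` on top.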
```diff
@@ -97,6 +138,9 @@ class StorageManager:
             self.__log.error(err)
             raise _ex.EStorageConfig(err)
 
+        # Unused
+        storage_options = dict()
+
         file_storage = file_impl(storage_config, storage_options)
         data_storage = data_impl(storage_config, file_storage)
 
@@ -131,7 +175,536 @@ class StorageManager:
 
 
 # ----------------------------------------------------------------------------------------------------------------------
-# COMMON STORAGE IMPLEMENTATION
+# COMMON FILE STORAGE IMPLEMENTATION
+# ----------------------------------------------------------------------------------------------------------------------
+
+
+class _NativeFileContext(tp.ContextManager[tp.BinaryIO]):
+
+    def __init__(self, nf: pa_lib.NativeFile, close_func: tp.Callable):
+        super().__init__()
+        self.__nf = nf
+        self.__close_func = close_func
+
+    def __enter__(self):
+        return self.__nf
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        try:
+            self.__close_func()
+        finally:
+            self.__nf.close()
+
+
```
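`_NativeFileContext` lets the storage layer run its own close bookkeeping before the underlying `NativeFile` is closed, and still guarantees `close()` via the `finally` block. A small usage sketch, with an illustrative close hook and a throwaway temp file:

```python
import tempfile
import pyarrow.fs as pa_fs

path = tempfile.mktemp()
fs = pa_fs.LocalFileSystem()

with fs.open_output_stream(path) as out:
    out.write(b"hello")

nf = fs.open_input_file(path)
with _NativeFileContext(nf, lambda: print("close hook ran")) as stream:
    print(stream.read())  # b'hello'
# On exit the close hook runs first, then nf.close() runs in the finally block
```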
```diff
+class CommonFileStorage(IFileStorage):
+
+    _TRAC_DIR_MARKER = "/.trac_dir"
+
+    FILE_SEMANTICS_FS_TYPES = ["local"]
+    BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs", "abfs"]
+
+    def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, fs: pa_fs.SubTreeFileSystem):
+
+        self._log = _util.logger_for_object(self)
+        self._key = storage_key
+        self._config = storage_config
+        self._fs = fs
+
+        fs_type = fs.base_fs.type_name
+        fs_impl = "arrow"
+        fs_root = fs.base_path
+
+        # If this is an FSSpec implementation, take the protocol from FSSpec as the FS type
+        base_fs = fs.base_fs
+        if isinstance(base_fs, pa_fs.PyFileSystem):
+            handler = base_fs.handler
+            if isinstance(handler, pa_fs.FSSpecHandler):
+                fs_type = handler.fs.protocol[0] if isinstance(handler.fs.protocol, tuple) else handler.fs.protocol
+                fs_impl = "fsspec"
+
+        # Some optimization is possible if the underlying storage semantics are known
+        self._file_semantics = True if fs_type in self.FILE_SEMANTICS_FS_TYPES else False
+        self._bucket_semantics = True if fs_type in self.BUCKET_SEMANTICS_FS_TYPES else False
+        self._explicit_dir_semantics = True if self._bucket_semantics and fs_impl == "fsspec" else False
+
+        self._log.info(
+            f"INIT [{self._key}]: Common file storage, " +
+            f"fs = [{fs_type}], " +
+            f"impl = [{fs_impl}], " +
+            f"root = [{fs_root}]")
+
```
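`CommonFileStorage` only ever sees a `SubTreeFileSystem`, so the same code path serves local disk and cloud buckets. A sketch of the two wiring styles the constructor detects, using an in-memory fsspec filesystem as a stand-in for a real bucket:

```python
import fsspec
import pyarrow.fs as pa_fs

# Arrow-native wiring: reported as impl = [arrow], fs = [local]
native_fs = pa_fs.SubTreeFileSystem("/tmp", pa_fs.LocalFileSystem())

# FSSpec wiring: reported as impl = [fsspec], with the FS type taken from
# the fsspec protocol ("memory" here, "s3" / "gcs" / "abfs" in production)
mem_fs = fsspec.filesystem("memory")
wrapped = pa_fs.PyFileSystem(pa_fs.FSSpecHandler(mem_fs))
bucket_fs = pa_fs.SubTreeFileSystem("some-bucket/prefix", wrapped)
```

The `_explicit_dir_semantics` flag then singles out fsspec-backed bucket storage, which is the combination that needs the `.trac_dir` marker files seen below.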
```diff
+    def exists(self, storage_path: str) -> bool:
+
+        return self._wrap_operation(self._exists, "EXISTS", storage_path)
+
+    def _exists(self, operation_name: str, storage_path: str) -> bool:
+
+        resolved_path = self._resolve_path(operation_name, storage_path, True)
+
+        file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+        return file_info.type != pa_fs.FileType.NotFound
+
+    def size(self, storage_path: str) -> int:
+
+        return self._wrap_operation(self._size, "SIZE", storage_path)
+
+    def _size(self, operation_name: str, storage_path: str) -> int:
+
+        resolved_path = self._resolve_path(operation_name, storage_path, True)
+        file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+
+        if file_info.type == pa_fs.FileType.NotFound:
+            raise self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
+
+        if not file_info.is_file:
+            raise self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
+
+        return file_info.size
+
+    def stat(self, storage_path: str) -> FileStat:
+
+        return self._wrap_operation(self._stat, "STAT", storage_path)
+
+    def _stat(self, operation_name: str, storage_path: str) -> FileStat:
+
+        resolved_path = self._resolve_path(operation_name, storage_path, True)
+
+        file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+
+        if file_info.type == pa_fs.FileType.NotFound:
+            raise self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
+
+        if file_info.type != pa_fs.FileType.File and file_info.type != pa_fs.FileType.Directory:
+            raise self._explicit_error(self.ExplicitError.NOT_A_FILE_OR_DIRECTORY, operation_name, storage_path)
+
+        return self._info_to_stat(file_info)
+
+    @staticmethod
+    def _info_to_stat(file_info: pa_fs.FileInfo):
+
+        if file_info.path == "":
+            file_name = "."
+            storage_path = "."
+        elif file_info.path.startswith("./"):
+            file_name = file_info.base_name
+            storage_path = file_info.path[2:]
+        else:
+            file_name = file_info.base_name
+            storage_path = file_info.path
+
+        file_type = FileType.FILE if file_info.is_file else FileType.DIRECTORY
+        file_size = file_info.size if file_info.is_file else 0
+
+        # Normalization in case the impl gives back directory entries with a trailing slash
+        if file_type == FileType.DIRECTORY and storage_path.endswith("/"):
+            storage_path = storage_path[:-1]
+            separator = storage_path.rfind("/")
+            file_name = storage_path[separator+1:]
+
+        mtime = file_info.mtime.astimezone(dt.timezone.utc) if file_info.mtime is not None else None
+
+        return FileStat(
+            file_name,
+            file_type,
+            storage_path,
+            file_size,
+            mtime=mtime,
+            atime=None)
+
```
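A detail these methods rely on: Arrow's `get_file_info()` does not raise for a missing path, it returns a `FileInfo` whose type is `NotFound`, which is why `_exists()`, `_size()` and `_stat()` all branch on the returned type instead of catching exceptions:

```python
import pyarrow.fs as pa_fs

fs = pa_fs.LocalFileSystem()
info = fs.get_file_info("/no/such/path")

print(info.type == pa_fs.FileType.NotFound)  # True - no exception is raised
```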
```diff
+    def ls(self, storage_path: str, recursive: bool = False) -> tp.List[FileStat]:
+
+        return self._wrap_operation(self._ls, "LS", storage_path, recursive)
+
+    def _ls(self, operation_name: str, storage_path: str, recursive: bool) -> tp.List[FileStat]:
+
+        resolved_path = self._resolve_path(operation_name, storage_path, True)
+
+        # _stat() will fail for file not found, or if the path is not a file/directory
+        stat = self._stat(operation_name, storage_path)
+
+        # Calling LS on a file should return a list with one entry for just that file
+        if stat.file_type == FileType.FILE:
+            return [stat]
+
+        # Otherwise do a normal directory listing
+        else:
+            # A trailing slash prevents some implementations including the directory in its own listing
+            selector = pa_fs.FileSelector(resolved_path + "/", recursive=recursive)  # noqa
+            file_infos = self._fs.get_file_info(selector)
+            file_infos = filter(lambda fi: not fi.path.endswith(self._TRAC_DIR_MARKER), file_infos)
+            return list(map(self._info_to_stat, file_infos))
+
```
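The `FileSelector` used in `_ls()` is the standard Arrow directory enumeration; the trailing slash stops some backends from including the directory in its own listing. The call in isolation:

```python
import pathlib
import tempfile
import pyarrow.fs as pa_fs

root = tempfile.mkdtemp()
pathlib.Path(root, "sub").mkdir()
pathlib.Path(root, "sub", "a.txt").write_text("x")

fs = pa_fs.LocalFileSystem()
selector = pa_fs.FileSelector(root + "/", recursive=True)

for info in fs.get_file_info(selector):
    print(info.path, info.type)  # one entry for "sub", one for "sub/a.txt"
```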
```diff
+    def mkdir(self, storage_path: str, recursive: bool = False):
+
+        return self._wrap_operation(self._mkdir, "MKDIR", storage_path, recursive)
+
+    def _mkdir(self, operation_name: str, storage_path: str, recursive: bool):
+
+        resolved_path = self._resolve_path(operation_name, storage_path, False)
+
+        # Try to prevent MKDIR if a file or file-like object already exists
+        # In cloud bucket semantics a file and dir can both exist with the same name - very confusing!
+        # There is a race condition here because a file could be created by another process
+        # But, given the very structured way TRAC uses file storage, this is extremely unlikely
+
+        prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+        if prior_stat.type == pa_fs.FileType.File or prior_stat.type == pa_fs.FileType.Unknown:
+            raise self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
+
+        # For most FS types, it is fine to use the Arrow create_dir() method
+        # For bucket-like storage, this will normally create an empty blob with a name like "my_dir/"
+
+        if not self._explicit_dir_semantics:
+            self._fs.create_dir(resolved_path, recursive=recursive)
+            return
+
+        # Some FS backends for bucket-like storage do not allow empty blobs as directories
+        # For these backends, we have to create an explicit marker file inside the directory
+        # In this case it is also necessary to check parents explicitly for non-recursive requests
+
+        if not recursive and prior_stat.type == pa_fs.FileType.NotFound:
+            parent_path = self._resolve_parent(resolved_path)
+            if parent_path is not None:
+                parent_stat: pa_fs.FileInfo = self._fs.get_file_info(parent_path)
+                if parent_stat.type != pa_fs.FileType.Directory:
+                    raise FileNotFoundError
+
+        dir_marker = resolved_path + self._TRAC_DIR_MARKER
+        with self._fs.open_output_stream(dir_marker) as stream:
+            stream.write(b"")
+
```
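The hand-rolled parent check in the marker-file branch reproduces what `create_dir()` already enforces on filesystems with real directories, namely that a non-recursive MKDIR fails when the parent is missing:

```python
import tempfile
import pyarrow.fs as pa_fs

fs = pa_fs.LocalFileSystem()
root = tempfile.mkdtemp()

fs.create_dir(f"{root}/a/b", recursive=True)       # parents created as needed
try:
    fs.create_dir(f"{root}/x/y", recursive=False)  # parent "x" does not exist
except OSError as e:
    # typically surfaces as FileNotFoundError, an OSError subclass
    print("non-recursive create_dir needs an existing parent:", e)
```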
```diff
+    def rm(self, storage_path: str):
+
+        return self._wrap_operation(self._rm, "RM", storage_path)
+
+    def _rm(self, operation_name: str, storage_path: str):
+
+        resolved_path = self._resolve_path(operation_name, storage_path, False)
+
+        file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+        if file_info.type == pa_fs.FileType.Directory:
+            raise self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
+
+        self._fs.delete_file(resolved_path)
+
+    def rmdir(self, storage_path: str):
+
+        return self._wrap_operation(self._rmdir, "RMDIR", storage_path)
+
+    def _rmdir(self, operation_name: str, storage_path: str):
+
+        resolved_path = self._resolve_path(operation_name, storage_path, False)
+
+        file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+        if file_info.type == pa_fs.FileType.File:
+            raise self._explicit_error(self.ExplicitError.NOT_A_DIRECTORY, operation_name, storage_path)
+
+        self._fs.delete_dir(resolved_path)
+
+    def read_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
+
+        return self._wrap_operation(self._read_byte_stream, "OPEN BYTE STREAM (READ)", storage_path)
+
+    def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
+
+        resolved_path = self._resolve_path(operation_name, storage_path, False)
+
+        # Check some information about the file before attempting the read
+        # There is a race condition here so open_input_file() can still fail
+        # Even so, prior_stat gives more meaningful error information in the common case
+        # If the file is changed before open_input_file, errors will be raised but might be less meaningful
+        prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+        if prior_stat.type == pa_fs.FileType.NotFound:
+            raise self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
+        if prior_stat.type != pa_fs.FileType.File:
+            raise self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
+
+        # Since the size is known, log it now rather than calling stream.seek() and stream.tell()
+        self._log.info(f"File size [{self._key}]: {prior_stat.size} [{storage_path}]")
+
+        # Open the stream
+        stream = self._fs.open_input_file(resolved_path)
+
+        # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, False))  # noqa
+
+    def write_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
+
+        return self._wrap_operation(self._write_byte_stream, "OPEN BYTE STREAM (WRITE)", storage_path)
+
+    def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
+
+        resolved_path = self._resolve_path(operation_name, storage_path, False)
+
+        # Make sure the parent directory exists
+        # In bucket semantics this is not needed and creating a 0-byte object for every real object is a bad idea
+        # For file semantics, or if semantics are not known, create the parent dir to avoid failures
+        if not self._bucket_semantics:
+            parent_path = self._resolve_parent(resolved_path)
+            if parent_path is not None:
+                self._mkdir(operation_name, parent_path, recursive=True)
+
+        # Try to prevent WRITE if the object is already defined as a directory or other non-file object
+        # In cloud bucket semantics a file and dir can both exist with the same name - very confusing!
+        # There is a race condition here because a directory could be created by another process
+        # But, given the very structured way TRAC uses file storage, this is extremely unlikely
+        prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
+        if prior_stat.type != pa_fs.FileType.NotFound and prior_stat.type != pa_fs.FileType.File:
+            raise self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
+
+        # If the file does not already exist and the write operation fails, try to clean it up
+        delete_on_error = prior_stat.type == pa_fs.FileType.NotFound
+
+        # Open the stream
+        stream = self._fs.open_output_stream(resolved_path)
+
+        # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, True, delete_on_error))  # noqa
+
```
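Together the two stream methods give symmetric, context-managed byte IO. A usage sketch over a temporary local directory; note the constructor shown above only records `storage_config`, so passing `None` is assumed to be safe for a quick illustration:

```python
import tempfile
import pyarrow.fs as pa_fs

root = tempfile.mkdtemp()
fs = pa_fs.SubTreeFileSystem(root, pa_fs.LocalFileSystem())
storage = CommonFileStorage("EXAMPLE", storage_config=None, fs=fs)

with storage.write_byte_stream("data/example.bin") as out:
    out.write(b"\x00\x01\x02")

with storage.read_byte_stream("data/example.bin") as inp:
    assert inp.read() == b"\x00\x01\x02"

print(storage.size("data/example.bin"))  # 3
```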
```diff
+    def _close_byte_stream(self, storage_path: str, stream: tp.BinaryIO, is_write: bool, delete_on_error: bool = False):
+
+        # If there has been an error, log it
+        exc_info = sys.exc_info()
+        error = exc_info[1] if exc_info is not None else None
+
+        if error is not None:
+            self._log.exception(str(error))
+
+        # For successful write streams, log the total size written
+        if is_write and not error:
+            file_size = _util.format_file_size(stream.tell())
+            self._log.info(f"File size [{self._key}]: {file_size} [{storage_path}]")
+
+        # Close the stream - this may take time for write streams that are not flushed
+        # Closing here gives better logs, because any pause is before the close message
+        # As a fail-safe, _NativeFileResource always calls close() in a "finally" block
+        if not stream.closed:
+            stream.close()
+
+        # Log closing of the stream
+        if is_write:
+            self._log.info(f"CLOSE BYTE STREAM (WRITE) [{self._key}]: [{storage_path}]")
+
+        else:
+            self._log.info(f"CLOSE BYTE STREAM (READ) [{self._key}]: [{storage_path}]")
+
+        # If there is an error and cleanup is requested, try to remove the partially written file
+        # This is best-efforts, don't blow up if the cleanup fails
+        if error is not None and delete_on_error:
+            try:
+                file_info = self._fs.get_file_info(storage_path)
+                if file_info.type != pa_fs.FileType.NotFound:
+                    self._fs.delete_file(storage_path)
+            # different implementations can throw different errors here
+            except Exception:  # noqa
+                pass
+
+        # Stream implementations can raise various types of error during stream operations
+        # Errors can have different causes (access, communication, missing / duplicate files etc.)
+        # Also, other errors can occur inside the stream context manager, unrelated to IO
+
+        # In the case of an IO error we want to raise EStorage, other errors should propagate as they are
+        # This handler tries to spot IO errors from inside the PyArrow library, it is probably not fail-safe
+        # If an IO error is not spotted, the original error will propagate and get reported as EUnexpected
+        # Anyway this handler is only for errors that happen after the stream is opened
+
+        # The alternative is to override every method in _NativeFileResource and try to catch there
+        # However, different implementations raise different error types, so we still need some kind of inspection
+
+        if error is not None:
+
+            if isinstance(error, OSError):
+                raise _ex.EStorage from error
+
+            stack = tb.extract_tb(exc_info[2])
+            stack = filter(lambda frame: frame.filename is not None, stack)
+
+            if any(filter(lambda frame: frame.filename.startswith("pyarrow/"), stack)):
+                raise _ex.EStorage from error
+
+    def _wrap_operation(self, func: tp.Callable, operation_name: str, storage_path: str, *args, **kwargs) -> tp.Any:
+
+        operation = f"{operation_name} {self._key} [{storage_path}]"
+
+        try:
+            self._log.info(operation)
+            return func(operation_name, storage_path, *args, **kwargs)
+
+        # ETrac means the error is already handled, log the message as-is
+
+        except _ex.ETrac as e:
+            self._log.exception(f"{operation}: {str(e)}")
+            raise
+
+        # Arrow maps filesystem errors into native Python OS errors
+
+        except FileNotFoundError as e:
+            error = self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
+            self._log.exception(f"{operation}: {str(error)}")
+            raise error from e
+
+        except FileExistsError as e:
+            error = self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
+            self._log.exception(f"{operation}: {str(error)}")
+            raise error from e
+
+        except IsADirectoryError as e:
+            error = self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
+            self._log.exception(f"{operation}: {str(error)}")
+            raise error from e
+
+        except NotADirectoryError as e:
+            error = self._explicit_error(self.ExplicitError.NOT_A_DIRECTORY, operation_name, storage_path)
+            self._log.exception(f"{operation}: {str(error)}")
+            raise error from e
+
+        except PermissionError as e:
+            error = self._explicit_error(self.ExplicitError.ACCESS_DENIED, operation_name, storage_path)
+            self._log.exception(f"{operation}: {str(error)}")
+            raise error from e
+
+        # OSError is the top-level error for IO exceptions
+        # This is raised on some platforms if there is not a recognized errno from the low-level operation
+
+        except OSError as e:
+            error = self._explicit_error(self.ExplicitError.IO_ERROR, operation_name, storage_path)
+            self._log.error(f"{operation}: {str(e)}")
+            self._log.exception(f"{operation}: {str(error)}")
+            raise error from e
+
+        # Other types of exception are not expected - report these as internal errors
+
+        except Exception as e:
+            error = self._explicit_error(self.ExplicitError.UNKNOWN_ERROR, operation_name, storage_path)
+            self._log.exception(f"{operation}: {str(error)}")
+            raise error from e
+
```
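`_wrap_operation()` leans on Arrow surfacing filesystem failures through the standard Python `OSError` hierarchy, so a plain except ladder is enough to classify them. The same effect in miniature:

```python
import pyarrow.fs as pa_fs

fs = pa_fs.LocalFileSystem()

try:
    fs.open_input_file("/definitely/missing/file")
except FileNotFoundError as e:
    print("maps to OBJECT_NOT_FOUND:", e)  # an EStorageRequest in TRAC terms
except OSError as e:
    print("maps to IO_ERROR:", e)          # the catch-all for unrecognized errno
```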
```diff
+    def _resolve_path(self, operation_name: str, storage_path: str, allow_root_dir: bool) -> str:
+
+        try:
+
+            if storage_path is None or len(storage_path.strip()) == 0:
+                raise self._explicit_error(self.ExplicitError.STORAGE_PATH_NULL_OR_BLANK, operation_name, storage_path)
+
+            if self._ILLEGAL_PATH_CHARS.match(storage_path):
+                raise self._explicit_error(self.ExplicitError.STORAGE_PATH_INVALID, operation_name, storage_path)
+
+            relative_path = pathlib.Path(storage_path)
+
+            if relative_path.is_absolute():
+                raise self._explicit_error(self.ExplicitError.STORAGE_PATH_NOT_RELATIVE, operation_name, storage_path)
+
+            root_path = pathlib.Path("C:\\root") if _util.is_windows() else pathlib.Path("/root")
+            absolute_path = root_path.joinpath(relative_path).resolve(False)
+
+            if absolute_path == root_path:
+                if not allow_root_dir:
+                    raise self._explicit_error(self.ExplicitError.STORAGE_PATH_IS_ROOT, operation_name, storage_path)
+                else:
+                    return ""
+
+            # is_relative_to only supported in Python 3.9+, we need to support 3.7
+            if root_path not in absolute_path.parents:
+                raise self._explicit_error(self.ExplicitError.STORAGE_PATH_OUTSIDE_ROOT, operation_name, storage_path)
+            else:
+                return absolute_path.relative_to(root_path).as_posix()
+
+        except ValueError as e:
+
+            raise self._explicit_error(self.ExplicitError.STORAGE_PATH_INVALID, operation_name, storage_path) from e
+
```
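The fake-root trick in `_resolve_path()` keeps every storage path inside the tree without Python 3.9's `is_relative_to()`: join the candidate onto an imaginary root, resolve it, then require the root to appear among the parents. The core check in isolation (POSIX shown, the real code substitutes `C:\root` on Windows):

```python
import pathlib

root = pathlib.Path("/root")

safe = root.joinpath("data/file.csv").resolve(False)
escape = root.joinpath("data/../../etc/passwd").resolve(False)

print(root in safe.parents)    # True  - accepted as "data/file.csv"
print(root in escape.parents)  # False - rejected as outside the storage root
```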
```diff
+    @staticmethod
+    def _resolve_parent(storage_path: str) -> tp.Optional[str]:
+
+        root_path = pathlib.Path("C:\\root") if _util.is_windows() else pathlib.Path("/root")
+        absolute_path = root_path.joinpath(storage_path).resolve(False)
+
+        if absolute_path == root_path or absolute_path.parent == root_path:
+            return None
+
+        else:
+            return pathlib.Path(storage_path).parent.as_posix()
+
+    def _explicit_error(self, error, operation_name, storage_path):
+
+        message_template = self._ERROR_MESSAGE_MAP.get(error)
+        message = message_template.format(operation_name, self._key, storage_path)
+
+        err_type = self._ERROR_TYPE_MAP.get(error)
+        err = err_type(message)
+
+        return err
+
+    _ILLEGAL_PATH_CHARS_WINDOWS = re.compile(r".*[\x00<>:\"\'|?*].*")
+    _ILLEGAL_PATH_CHARS_POSIX = re.compile(r".*[\x00<>:\"\'|?*\\].*")
+    _ILLEGAL_PATH_CHARS = _ILLEGAL_PATH_CHARS_WINDOWS if _util.is_windows() else _ILLEGAL_PATH_CHARS_POSIX
+
+    class ExplicitError(enum.Enum):
+
+        # Validation failures
+        STORAGE_PATH_NULL_OR_BLANK = 1
+        STORAGE_PATH_NOT_RELATIVE = 2
+        STORAGE_PATH_OUTSIDE_ROOT = 3
+        STORAGE_PATH_IS_ROOT = 4
+        STORAGE_PATH_INVALID = 5
+
+        # Exceptions
+        OBJECT_NOT_FOUND = 10
+        OBJECT_ALREADY_EXISTS = 11
+        NOT_A_FILE = 12
+        NOT_A_DIRECTORY = 13
+        NOT_A_FILE_OR_DIRECTORY = 14
+        IO_ERROR = 15
+
+        # Permissions
+        ACCESS_DENIED = 20
+
+        # Unhandled / unexpected error
+        UNKNOWN_ERROR = 30
+
+    _ERROR_MESSAGE_MAP = {
+
+        ExplicitError.STORAGE_PATH_NULL_OR_BLANK: "Requested storage path is null or blank: {} {} [{}]",
+        ExplicitError.STORAGE_PATH_NOT_RELATIVE: "Requested storage path is not a relative path: {} {} [{}]",
+        ExplicitError.STORAGE_PATH_OUTSIDE_ROOT: "Requested storage path is outside the storage root directory: {} {} [{}]",  # noqa
+        ExplicitError.STORAGE_PATH_IS_ROOT: "Requested operation not allowed on the storage root directory: {} {} [{}]",
+        ExplicitError.STORAGE_PATH_INVALID: "Requested storage path is invalid: {} {} [{}]",
+
+        ExplicitError.OBJECT_NOT_FOUND: "Object not found in storage layer: {} {} [{}]",
+        ExplicitError.OBJECT_ALREADY_EXISTS: "Object already exists in storage layer: {} {} [{}]",
+        ExplicitError.NOT_A_FILE: "Object is not a file: {} {} [{}]",
+        ExplicitError.NOT_A_DIRECTORY: "Object is not a directory: {} {} [{}]",
+        ExplicitError.NOT_A_FILE_OR_DIRECTORY: "Object is not a file or directory: {} {} [{}]",
+        ExplicitError.IO_ERROR: "An IO error occurred in the storage layer: {} {} [{}]",
+
+        ExplicitError.ACCESS_DENIED: "Access denied in storage layer: {} {} [{}]",
+
+        ExplicitError.UNKNOWN_ERROR: "An unexpected error occurred in the storage layer: {} {} [{}]",
+    }
+
+    _ERROR_TYPE_MAP = {
+
+        ExplicitError.STORAGE_PATH_NULL_OR_BLANK: _ex.EStorageValidation,
+        ExplicitError.STORAGE_PATH_NOT_RELATIVE: _ex.EStorageValidation,
+        ExplicitError.STORAGE_PATH_OUTSIDE_ROOT: _ex.EStorageValidation,
+        ExplicitError.STORAGE_PATH_IS_ROOT: _ex.EStorageValidation,
+        ExplicitError.STORAGE_PATH_INVALID: _ex.EStorageValidation,
+
+        ExplicitError.OBJECT_NOT_FOUND: _ex.EStorageRequest,
+        ExplicitError.OBJECT_ALREADY_EXISTS: _ex.EStorageRequest,
+        ExplicitError.NOT_A_FILE: _ex.EStorageRequest,
+        ExplicitError.NOT_A_DIRECTORY: _ex.EStorageRequest,
+        ExplicitError.NOT_A_FILE_OR_DIRECTORY: _ex.EStorageRequest,
+        ExplicitError.IO_ERROR: _ex.EStorageRequest,
+
+        ExplicitError.ACCESS_DENIED: _ex.EStorageAccess,
+
+        ExplicitError.UNKNOWN_ERROR: _ex.ETracInternal
+    }
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+# COMMON DATA STORAGE IMPLEMENTATION
 # ----------------------------------------------------------------------------------------------------------------------
 
 
@@ -178,7 +751,7 @@ class CommonDataStorage(IDataStorage):
         dir_content = self.__file_storage.ls(storage_path)
 
         if len(dir_content) == 1:
-            storage_path =
+            storage_path = dir_content[0].storage_path
         else:
            raise NotImplementedError("Directory storage format not available yet")
 
@@ -219,13 +792,16 @@ class CommonDataStorage(IDataStorage):
            if not storage_path.endswith(extension):
                parent_dir_ = storage_path
                storage_path_ = storage_path.rstrip("/\\") + f"/chunk-0.{extension}"
-               self.__file_storage.mkdir(parent_dir_, True
+               self.__file_storage.mkdir(parent_dir_, True)
            else:
                parent_dir_ = str(pathlib.PurePath(storage_path).parent)
                storage_path_ = storage_path
-               self.__file_storage.mkdir(parent_dir_, True
+               self.__file_storage.mkdir(parent_dir_, True)
+
+           if not overwrite and self.__file_storage.exists(storage_path_):
+               raise _ex.EStorageRequest(f"File already exists: [{storage_path_}]")
 
-           with self.__file_storage.write_byte_stream(storage_path_
+           with self.__file_storage.write_byte_stream(storage_path_) as byte_stream:
                codec.write_table(byte_stream, table)
 
        except (_ex.EStorage, _ex.EData) as e:
```