tracdap-runtime 0.5.30__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. tracdap/rt/_exec/dev_mode.py +2 -1
  2. tracdap/rt/_impl/data.py +1 -28
  3. tracdap/rt/_impl/static_api.py +5 -1
  4. tracdap/rt/_impl/storage.py +586 -10
  5. tracdap/rt/_impl/util.py +24 -3
  6. tracdap/rt/_plugins/_helpers.py +26 -25
  7. tracdap/rt/_plugins/storage_aws.py +162 -76
  8. tracdap/rt/_plugins/storage_azure.py +155 -0
  9. tracdap/rt/_plugins/storage_gcp.py +183 -0
  10. tracdap/rt/_plugins/storage_local.py +249 -98
  11. tracdap/rt/_version.py +1 -1
  12. tracdap/rt/api/static_api.py +2 -1
  13. tracdap/rt/config/__init__.py +8 -13
  14. tracdap/rt/config/common.py +10 -0
  15. tracdap/rt/config/common_pb2.py +38 -31
  16. tracdap/rt/config/job_pb2.py +21 -20
  17. tracdap/rt/config/platform.py +60 -25
  18. tracdap/rt/config/platform_pb2.py +52 -45
  19. tracdap/rt/config/result_pb2.py +15 -14
  20. tracdap/rt/config/runtime.py +0 -1
  21. tracdap/rt/config/runtime_pb2.py +24 -24
  22. tracdap/rt/exceptions.py +9 -0
  23. tracdap/rt/ext/plugins.py +0 -12
  24. tracdap/rt/ext/storage.py +47 -29
  25. tracdap/rt/metadata/common_pb2.py +15 -14
  26. tracdap/rt/metadata/custom_pb2.py +9 -8
  27. tracdap/rt/metadata/data_pb2.py +31 -30
  28. tracdap/rt/metadata/file_pb2.py +9 -8
  29. tracdap/rt/metadata/flow_pb2.py +33 -32
  30. tracdap/rt/metadata/job_pb2.py +55 -54
  31. tracdap/rt/metadata/model_pb2.py +31 -30
  32. tracdap/rt/metadata/object_id_pb2.py +13 -12
  33. tracdap/rt/metadata/object_pb2.py +9 -8
  34. tracdap/rt/metadata/search_pb2.py +19 -18
  35. tracdap/rt/metadata/stoarge_pb2.py +31 -30
  36. tracdap/rt/metadata/tag_pb2.py +13 -12
  37. tracdap/rt/metadata/tag_update_pb2.py +11 -10
  38. tracdap/rt/metadata/type_pb2.py +29 -28
  39. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dist-info}/METADATA +26 -15
  40. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dist-info}/RECORD +43 -43
  41. tracdap/rt/config/gateway.py +0 -104
  42. tracdap/rt/config/gateway_pb2.py +0 -45
  43. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dist-info}/LICENSE +0 -0
  44. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dist-info}/WHEEL +0 -0
  45. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dist-info}/top_level.txt +0 -0
@@ -12,11 +12,17 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import datetime as dt
16
+ import enum
15
17
  import pathlib
16
18
  import re
19
+ import sys
17
20
  import typing as tp
21
+ import traceback as tb
18
22
 
19
23
  import pyarrow as pa
24
+ import pyarrow.fs as pa_fs
25
+ import pyarrow.lib as pa_lib
20
26
 
21
27
  import tracdap.rt.metadata as _meta
22
28
  import tracdap.rt.config as _cfg
@@ -26,7 +32,7 @@ import tracdap.rt._impl.data as _data
26
32
  import tracdap.rt._impl.util as _util
27
33
 
28
34
  # Import storage interfaces
29
- from tracdap.rt.ext.storage import IDataFormat, IDataStorage, IFileStorage, FileType
35
+ from tracdap.rt.ext.storage import *
30
36
 
31
37
 
32
38
  class FormatManager:
@@ -69,10 +75,8 @@ class StorageManager:
69
75
  self.__data_storage: tp.Dict[str, IDataStorage] = dict()
70
76
  self.__settings = sys_config.storage
71
77
 
72
- storage_options = dict()
73
-
74
78
  for storage_key, storage_config in sys_config.storage.buckets.items():
75
- self.create_storage(storage_key, storage_config, storage_options)
79
+ self.create_storage(storage_key, storage_config)
76
80
 
77
81
  def default_storage_key(self):
78
82
  return self.__settings.defaultBucket
@@ -80,7 +84,44 @@ class StorageManager:
80
84
  def default_storage_format(self):
81
85
  return self.__settings.defaultFormat
82
86
 
83
- def create_storage(self, storage_key: str, storage_config: _cfg.PluginConfig, storage_options: dict = None):
87
+ def create_storage(self, storage_key: str, storage_config: _cfg.PluginConfig):
88
+
89
+ if plugins.PluginManager.is_plugin_available(IStorageProvider, storage_config.protocol):
90
+ self._create_storage_from_provider(storage_key, storage_config)
91
+ else:
92
+ self._create_storage_from_impl(storage_key, storage_config)
93
+
94
+ def _create_storage_from_provider(self, storage_key: str, storage_config: _cfg.PluginConfig):
95
+
96
+ provider = plugins.PluginManager.load_plugin(IStorageProvider, storage_config)
97
+
98
+ if provider.has_file_storage():
99
+ file_storage = provider.get_file_storage()
100
+ elif provider.has_arrow_native():
101
+ fs = provider.get_arrow_native()
102
+ file_storage = CommonFileStorage(storage_key, storage_config, fs)
103
+ else:
104
+ file_storage = None
105
+
106
+ if provider.has_data_storage():
107
+ data_storage = provider.get_data_storage()
108
+ elif file_storage is not None:
109
+ data_storage = CommonDataStorage(storage_config, file_storage)
110
+ else:
111
+ data_storage = None
112
+
113
+ if file_storage is None and data_storage is None:
114
+ err = f"Storage type [{storage_config.protocol}] is not available"
115
+ self.__log.error(err)
116
+ raise _ex.EStorageConfig(err)
117
+
118
+ if file_storage is not None:
119
+ self.__file_storage[storage_key] = file_storage
120
+
121
+ if data_storage is not None:
122
+ self.__data_storage[storage_key] = data_storage
123
+
124
+ def _create_storage_from_impl(self, storage_key: str, storage_config: _cfg.PluginConfig):
84
125
 
85
126
  if storage_config is None:
86
127
  err = f"Missing config for storage key [{storage_key}]"
@@ -97,6 +138,9 @@ class StorageManager:
97
138
  self.__log.error(err)
98
139
  raise _ex.EStorageConfig(err)
99
140
 
141
+ # Unused
142
+ storage_options = dict()
143
+
100
144
  file_storage = file_impl(storage_config, storage_options)
101
145
  data_storage = data_impl(storage_config, file_storage)
102
146
 
@@ -131,7 +175,536 @@ class StorageManager:
131
175
 
132
176
 
133
177
  # ----------------------------------------------------------------------------------------------------------------------
134
- # COMMON STORAGE IMPLEMENTATION
178
+ # COMMON FILE STORAGE IMPLEMENTATION
179
+ # ----------------------------------------------------------------------------------------------------------------------
180
+
181
+
182
+ class _NativeFileContext(tp.ContextManager[tp.BinaryIO]):
183
+
184
+ def __init__(self, nf: pa_lib.NativeFile, close_func: tp.Callable):
185
+ super().__init__()
186
+ self.__nf = nf
187
+ self.__close_func = close_func
188
+
189
+ def __enter__(self):
190
+ return self.__nf
191
+
192
+ def __exit__(self, exc_type, exc_val, exc_tb):
193
+ try:
194
+ self.__close_func()
195
+ finally:
196
+ self.__nf.close()
197
+
198
+
199
+ class CommonFileStorage(IFileStorage):
200
+
201
+ _TRAC_DIR_MARKER = "/.trac_dir"
202
+
203
+ FILE_SEMANTICS_FS_TYPES = ["local"]
204
+ BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs", "abfs"]
205
+
206
+ def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, fs: pa_fs.SubTreeFileSystem):
207
+
208
+ self._log = _util.logger_for_object(self)
209
+ self._key = storage_key
210
+ self._config = storage_config
211
+ self._fs = fs
212
+
213
+ fs_type = fs.base_fs.type_name
214
+ fs_impl = "arrow"
215
+ fs_root = fs.base_path
216
+
217
+ # If this is an FSSpec implementation, take the protocol from FSSpec as the FS type
218
+ base_fs = fs.base_fs
219
+ if isinstance(base_fs, pa_fs.PyFileSystem):
220
+ handler = base_fs.handler
221
+ if isinstance(handler, pa_fs.FSSpecHandler):
222
+ fs_type = handler.fs.protocol[0] if isinstance(handler.fs.protocol, tuple) else handler.fs.protocol
223
+ fs_impl = "fsspec"
224
+
225
+ # Some optimization is possible if the underlying storage semantics are known
226
+ self._file_semantics = True if fs_type in self.FILE_SEMANTICS_FS_TYPES else False
227
+ self._bucket_semantics = True if fs_type in self.BUCKET_SEMANTICS_FS_TYPES else False
228
+ self._explicit_dir_semantics = True if self._bucket_semantics and fs_impl == "fsspec" else False
229
+
230
+ self._log.info(
231
+ f"INIT [{self._key}]: Common file storage, " +
232
+ f"fs = [{fs_type}], " +
233
+ f"impl = [{fs_impl}], " +
234
+ f"root = [{fs_root}]")
235
+
236
+ def exists(self, storage_path: str) -> bool:
237
+
238
+ return self._wrap_operation(self._exists, "EXISTS", storage_path)
239
+
240
+ def _exists(self, operation_name: str, storage_path: str) -> bool:
241
+
242
+ resolved_path = self._resolve_path(operation_name, storage_path, True)
243
+
244
+ file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
245
+ return file_info.type != pa_fs.FileType.NotFound
246
+
247
+ def size(self, storage_path: str) -> int:
248
+
249
+ return self._wrap_operation(self._size, "SIZE", storage_path)
250
+
251
+ def _size(self, operation_name: str, storage_path: str) -> int:
252
+
253
+ resolved_path = self._resolve_path(operation_name, storage_path, True)
254
+ file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
255
+
256
+ if file_info.type == pa_fs.FileType.NotFound:
257
+ raise self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
258
+
259
+ if not file_info.is_file:
260
+ raise self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
261
+
262
+ return file_info.size
263
+
264
+ def stat(self, storage_path: str) -> FileStat:
265
+
266
+ return self._wrap_operation(self._stat, "STAT", storage_path)
267
+
268
+ def _stat(self, operation_name: str, storage_path: str) -> FileStat:
269
+
270
+ resolved_path = self._resolve_path(operation_name, storage_path, True)
271
+
272
+ file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
273
+
274
+ if file_info.type == pa_fs.FileType.NotFound:
275
+ raise self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
276
+
277
+ if file_info.type != pa_fs.FileType.File and file_info.type != pa_fs.FileType.Directory:
278
+ raise self._explicit_error(self.ExplicitError.NOT_A_FILE_OR_DIRECTORY, operation_name, storage_path)
279
+
280
+ return self._info_to_stat(file_info)
281
+
282
+ @staticmethod
283
+ def _info_to_stat(file_info: pa_fs.FileInfo):
284
+
285
+ if file_info.path == "":
286
+ file_name = "."
287
+ storage_path = "."
288
+ elif file_info.path.startswith("./"):
289
+ file_name = file_info.base_name
290
+ storage_path = file_info.path[2:]
291
+ else:
292
+ file_name = file_info.base_name
293
+ storage_path = file_info.path
294
+
295
+ file_type = FileType.FILE if file_info.is_file else FileType.DIRECTORY
296
+ file_size = file_info.size if file_info.is_file else 0
297
+
298
+ # Normalization in case the impl gives back directory entries with a trailing slash
299
+ if file_type == FileType.DIRECTORY and storage_path.endswith("/"):
300
+ storage_path = storage_path[:-1]
301
+ separator = storage_path.rfind("/")
302
+ file_name = storage_path[separator+1:]
303
+
304
+ mtime = file_info.mtime.astimezone(dt.timezone.utc) if file_info.mtime is not None else None
305
+
306
+ return FileStat(
307
+ file_name,
308
+ file_type,
309
+ storage_path,
310
+ file_size,
311
+ mtime=mtime,
312
+ atime=None)
313
+
314
+ def ls(self, storage_path: str, recursive: bool = False) -> tp.List[FileStat]:
315
+
316
+ return self._wrap_operation(self._ls, "LS", storage_path, recursive)
317
+
318
+ def _ls(self, operation_name: str, storage_path: str, recursive: bool) -> tp.List[FileStat]:
319
+
320
+ resolved_path = self._resolve_path(operation_name, storage_path, True)
321
+
322
+ # _stat() will fail for file not found, or if the path is not a file/directory
323
+ stat = self._stat(operation_name, storage_path)
324
+
325
+ # Calling LS on a file should return a list with one entry for just that file
326
+ if stat.file_type == FileType.FILE:
327
+ return [stat]
328
+
329
+ # Otherwise do a normal directory listing
330
+ else:
331
+ # A trailing slash prevents some implementations including the directory in its own listing
332
+ selector = pa_fs.FileSelector(resolved_path + "/", recursive=recursive) # noqa
333
+ file_infos = self._fs.get_file_info(selector)
334
+ file_infos = filter(lambda fi: not fi.path.endswith(self._TRAC_DIR_MARKER), file_infos)
335
+ return list(map(self._info_to_stat, file_infos))
336
+
337
+ def mkdir(self, storage_path: str, recursive: bool = False):
338
+
339
+ return self._wrap_operation(self._mkdir, "MKDIR", storage_path, recursive)
340
+
341
+ def _mkdir(self, operation_name: str, storage_path: str, recursive: bool):
342
+
343
+ resolved_path = self._resolve_path(operation_name, storage_path, False)
344
+
345
+ # Try to prevent MKDIR if a file or file-like object already exists
346
+ # In cloud bucket semantics a file and dir can both exist with the same name - very confusing!
347
+ # There is a race condition here because a file could be created by another process
348
+ # But, given the very structured way TRAC uses file storage, this is extremely unlikely
349
+
350
+ prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
351
+ if prior_stat.type == pa_fs.FileType.File or prior_stat.type == pa_fs.FileType.Unknown:
352
+ raise self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
353
+
354
+ # For most FS types, it is fine to use the Arrow create_dir() method
355
+ # For bucket-like storage, this will normally create an empty blob with a name like "my_dir/"
356
+
357
+ if not self._explicit_dir_semantics:
358
+ self._fs.create_dir(resolved_path, recursive=recursive)
359
+ return
360
+
361
+ # Some FS backends for bucket-like storage do not allow empty blobs as directories
362
+ # For these backends, we have to create an explicit marker file inside the directory
363
+ # In this case it is also necessary to check parents explicitly for non-recursive requests
364
+
365
+ if not recursive and prior_stat.type == pa_fs.FileType.NotFound:
366
+ parent_path = self._resolve_parent(resolved_path)
367
+ if parent_path is not None:
368
+ parent_stat: pa_fs.FileInfo = self._fs.get_file_info(parent_path)
369
+ if parent_stat.type != pa_fs.FileType.Directory:
370
+ raise FileNotFoundError
371
+
372
+ dir_marker = resolved_path + self._TRAC_DIR_MARKER
373
+ with self._fs.open_output_stream(dir_marker) as stream:
374
+ stream.write(b"")
375
+
376
+ def rm(self, storage_path: str):
377
+
378
+ return self._wrap_operation(self._rm, "RM", storage_path)
379
+
380
+ def _rm(self, operation_name: str, storage_path: str):
381
+
382
+ resolved_path = self._resolve_path(operation_name, storage_path, False)
383
+
384
+ file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
385
+ if file_info.type == pa_fs.FileType.Directory:
386
+ raise self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
387
+
388
+ self._fs.delete_file(resolved_path)
389
+
390
+ def rmdir(self, storage_path: str):
391
+
392
+ return self._wrap_operation(self._rmdir, "RMDIR", storage_path)
393
+
394
+ def _rmdir(self, operation_name: str, storage_path: str):
395
+
396
+ resolved_path = self._resolve_path(operation_name, storage_path, False)
397
+
398
+ file_info: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
399
+ if file_info.type == pa_fs.FileType.File:
400
+ raise self._explicit_error(self.ExplicitError.NOT_A_DIRECTORY, operation_name, storage_path)
401
+
402
+ self._fs.delete_dir(resolved_path)
403
+
404
+ def read_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
405
+
406
+ return self._wrap_operation(self._read_byte_stream, "OPEN BYTE STREAM (READ)", storage_path)
407
+
408
+ def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
409
+
410
+ resolved_path = self._resolve_path(operation_name, storage_path, False)
411
+
412
+ # Check some information about the file before attempting the read
413
+ # There is a race condition here so open_input_file() can still fail
414
+ # Even so, prior_stat gives more meaningful error information in the common case
415
+ # If the file is changed before open_input_file, errors will be raised but might be less meaningful
416
+ prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
417
+ if prior_stat.type == pa_fs.FileType.NotFound:
418
+ raise self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
419
+ if prior_stat.type != pa_fs.FileType.File:
420
+ raise self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
421
+
422
+ # Since the size is known, log it now rather than calling stream.seek() and stream.tell()
423
+ self._log.info(f"File size [{self._key}]: {prior_stat.size} [{storage_path}]")
424
+
425
+ # Open the stream
426
+ stream = self._fs.open_input_file(resolved_path)
427
+
428
+ # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
429
+ return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, False)) # noqa
430
+
431
+ def write_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
432
+
433
+ return self._wrap_operation(self._write_byte_stream, "OPEN BYTE STREAM (WRITE)", storage_path)
434
+
435
+ def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
436
+
437
+ resolved_path = self._resolve_path(operation_name, storage_path, False)
438
+
439
+ # Make sure the parent directory exists
440
+ # In bucket semantics this is not needed and creating a 0-byte object for every real object is a bad idea
441
+ # For file semantics, or if semantics are not known, create the parent dir to avoid failures
442
+ if not self._bucket_semantics:
443
+ parent_path = self._resolve_parent(resolved_path)
444
+ if parent_path is not None:
445
+ self._mkdir(operation_name, parent_path, recursive=True)
446
+
447
+ # Try to prevent WRITE if the object is already defined as a directory or other non-file object
448
+ # In cloud bucket semantics a file and dir can both exist with the same name - very confusing!
449
+ # There is a race condition here because a directory could be created by another process
450
+ # But, given the very structured way TRAC uses file storage, this is extremely unlikely
451
+ prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
452
+ if prior_stat.type != pa_fs.FileType.NotFound and prior_stat.type != pa_fs.FileType.File:
453
+ raise self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
454
+
455
+ # If the file does not already exist and the write operation fails, try to clean it up
456
+ delete_on_error = prior_stat.type == pa_fs.FileType.NotFound
457
+
458
+ # Open the stream
459
+ stream = self._fs.open_output_stream(resolved_path)
460
+
461
+ # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
462
+ return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, True, delete_on_error)) # noqa
463
+
464
+ def _close_byte_stream(self, storage_path: str, stream: tp.BinaryIO, is_write: bool, delete_on_error: bool = False):
465
+
466
+ # If there has been an error, log it
467
+ exc_info = sys.exc_info()
468
+ error = exc_info[1] if exc_info is not None else None
469
+
470
+ if error is not None:
471
+ self._log.exception(str(error))
472
+
473
+ # For successful write streams, log the total size written
474
+ if is_write and not error:
475
+ file_size = _util.format_file_size(stream.tell())
476
+ self._log.info(f"File size [{self._key}]: {file_size} [{storage_path}]")
477
+
478
+ # Close the stream - this may take time for write streams that are not flushed
479
+ # Closing here gives better logs, because any pause is before the close message
480
+ # As a fail-safe, _NativeFileContext always calls close() in a "finally" block
481
+ if not stream.closed:
482
+ stream.close()
483
+
484
+ # Log closing of the stream
485
+ if is_write:
486
+ self._log.info(f"CLOSE BYTE STREAM (WRITE) [{self._key}]: [{storage_path}]")
487
+
488
+ else:
489
+ self._log.info(f"CLOSE BYTE STREAM (READ) [{self._key}]: [{storage_path}]")
490
+
491
+ # If there is an error and cleanup is requested, try to remove the partially written file
492
+ # This is best-efforts, don't blow up if the cleanup fails
493
+ if error is not None and delete_on_error:
494
+ try:
495
+ file_info = self._fs.get_file_info(storage_path)
496
+ if file_info.type != pa_fs.FileType.NotFound:
497
+ self._fs.delete_file(storage_path)
498
+ # different implementations can throw different errors here
499
+ except Exception: # noqa
500
+ pass
501
+
502
+ # Stream implementations can raise various types of error during stream operations
503
+ # Errors can have different causes (access, communication, missing / duplicate files etc.)
504
+ # Also, other errors can occur inside the stream context manager, unrelated to IO
505
+
506
+ # In the case of an IO error we want to raise EStorage, other errors should propagate as they are
507
+ # This handler tries to spot IO errors from inside the PyArrow library, it is probably not fail-safe
508
+ # If an IO error is not spotted, the original error will propagate and get reported as EUnexpected
509
+ # Anyway this handler is only for errors that happen after the stream is opened
510
+
511
+ # The alternative is to override every method in _NativeFileContext and try to catch there
512
+ # However, different implementations raise different error types, so we still need some kind of inspection
513
+
514
+ if error is not None:
515
+
516
+ if isinstance(error, OSError):
517
+ raise _ex.EStorage from error
518
+
519
+ stack = tb.extract_tb(exc_info[2])
520
+ stack = filter(lambda frame: frame.filename is not None, stack)
521
+
522
+ if any(filter(lambda frame: frame.filename.startswith("pyarrow/"), stack)):
523
+ raise _ex.EStorage from error
524
+
525
+ def _wrap_operation(self, func: tp.Callable, operation_name: str, storage_path: str, *args, **kwargs) -> tp.Any:
526
+
527
+ operation = f"{operation_name} {self._key} [{storage_path}]"
528
+
529
+ try:
530
+ self._log.info(operation)
531
+ return func(operation_name, storage_path, *args, **kwargs)
532
+
533
+ # ETrac means the error is already handled, log the message as-is
534
+
535
+ except _ex.ETrac as e:
536
+ self._log.exception(f"{operation}: {str(e)}")
537
+ raise
538
+
539
+ # Arrow maps filesystem errors into native Python OS errors
540
+
541
+ except FileNotFoundError as e:
542
+ error = self._explicit_error(self.ExplicitError.OBJECT_NOT_FOUND, operation_name, storage_path)
543
+ self._log.exception(f"{operation}: {str(error)}")
544
+ raise error from e
545
+
546
+ except FileExistsError as e:
547
+ error = self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
548
+ self._log.exception(f"{operation}: {str(error)}")
549
+ raise error from e
550
+
551
+ except IsADirectoryError as e:
552
+ error = self._explicit_error(self.ExplicitError.NOT_A_FILE, operation_name, storage_path)
553
+ self._log.exception(f"{operation}: {str(error)}")
554
+ raise error from e
555
+
556
+ except NotADirectoryError as e:
557
+ error = self._explicit_error(self.ExplicitError.NOT_A_DIRECTORY, operation_name, storage_path)
558
+ self._log.exception(f"{operation}: {str(error)}")
559
+ raise error from e
560
+
561
+ except PermissionError as e:
562
+ error = self._explicit_error(self.ExplicitError.ACCESS_DENIED, operation_name, storage_path)
563
+ self._log.exception(f"{operation}: {str(error)}")
564
+ raise error from e
565
+
566
+ # OSError is the top-level error for IO exceptions
567
+ # This is raised on some platforms if there is not a recognized errno from the low-level operation
568
+
569
+ except OSError as e:
570
+ error = self._explicit_error(self.ExplicitError.IO_ERROR, operation_name, storage_path)
571
+ self._log.error(f"{operation}: {str(e)}")
572
+ self._log.exception(f"{operation}: {str(error)}")
573
+ raise error from e
574
+
575
+ # Other types of exception are not expected - report these as internal errors
576
+
577
+ except Exception as e:
578
+ error = self._explicit_error(self.ExplicitError.UNKNOWN_ERROR, operation_name, storage_path)
579
+ self._log.exception(f"{operation}: {str(error)}")
580
+ raise error from e
581
+
582
+ def _resolve_path(self, operation_name: str, storage_path: str, allow_root_dir: bool) -> str:
583
+
584
+ try:
585
+
586
+ if storage_path is None or len(storage_path.strip()) == 0:
587
+ raise self._explicit_error(self.ExplicitError.STORAGE_PATH_NULL_OR_BLANK, operation_name, storage_path)
588
+
589
+ if self._ILLEGAL_PATH_CHARS.match(storage_path):
590
+ raise self._explicit_error(self.ExplicitError.STORAGE_PATH_INVALID, operation_name, storage_path)
591
+
592
+ relative_path = pathlib.Path(storage_path)
593
+
594
+ if relative_path.is_absolute():
595
+ raise self._explicit_error(self.ExplicitError.STORAGE_PATH_NOT_RELATIVE, operation_name, storage_path)
596
+
597
+ root_path = pathlib.Path("C:\\root") if _util.is_windows() else pathlib.Path("/root")
598
+ absolute_path = root_path.joinpath(relative_path).resolve(False)
599
+
600
+ if absolute_path == root_path:
601
+ if not allow_root_dir:
602
+ raise self._explicit_error(self.ExplicitError.STORAGE_PATH_IS_ROOT, operation_name, storage_path)
603
+ else:
604
+ return ""
605
+
606
+ # is_relative_to only supported in Python 3.9+, we need to support 3.7
607
+ if root_path not in absolute_path.parents:
608
+ raise self._explicit_error(self.ExplicitError.STORAGE_PATH_OUTSIDE_ROOT, operation_name, storage_path)
609
+ else:
610
+ return absolute_path.relative_to(root_path).as_posix()
611
+
612
+ except ValueError as e:
613
+
614
+ raise self._explicit_error(self.ExplicitError.STORAGE_PATH_INVALID, operation_name, storage_path) from e
615
+
616
+ @staticmethod
617
+ def _resolve_parent(storage_path: str) -> tp.Optional[str]:
618
+
619
+ root_path = pathlib.Path("C:\\root") if _util.is_windows() else pathlib.Path("/root")
620
+ absolute_path = root_path.joinpath(storage_path).resolve(False)
621
+
622
+ if absolute_path == root_path or absolute_path.parent == root_path:
623
+ return None
624
+
625
+ else:
626
+ return pathlib.Path(storage_path).parent.as_posix()
627
+
628
+ def _explicit_error(self, error, operation_name, storage_path):
629
+
630
+ message_template = self._ERROR_MESSAGE_MAP.get(error)
631
+ message = message_template.format(operation_name, self._key, storage_path)
632
+
633
+ err_type = self._ERROR_TYPE_MAP.get(error)
634
+ err = err_type(message)
635
+
636
+ return err
637
+
638
+ _ILLEGAL_PATH_CHARS_WINDOWS = re.compile(r".*[\x00<>:\"\'|?*].*")
639
+ _ILLEGAL_PATH_CHARS_POSIX = re.compile(r".*[\x00<>:\"\'|?*\\].*")
640
+ _ILLEGAL_PATH_CHARS = _ILLEGAL_PATH_CHARS_WINDOWS if _util.is_windows() else _ILLEGAL_PATH_CHARS_POSIX
641
+
642
+ class ExplicitError(enum.Enum):
643
+
644
+ # Validation failures
645
+ STORAGE_PATH_NULL_OR_BLANK = 1
646
+ STORAGE_PATH_NOT_RELATIVE = 2
647
+ STORAGE_PATH_OUTSIDE_ROOT = 3
648
+ STORAGE_PATH_IS_ROOT = 4
649
+ STORAGE_PATH_INVALID = 5
650
+
651
+ # Exceptions
652
+ OBJECT_NOT_FOUND = 10
653
+ OBJECT_ALREADY_EXISTS = 11
654
+ NOT_A_FILE = 12
655
+ NOT_A_DIRECTORY = 13
656
+ NOT_A_FILE_OR_DIRECTORY = 14
657
+ IO_ERROR = 15
658
+
659
+ # Permissions
660
+ ACCESS_DENIED = 20
661
+
662
+ # Unhandled / unexpected error
663
+ UNKNOWN_ERROR = 30
664
+
665
+ _ERROR_MESSAGE_MAP = {
666
+
667
+ ExplicitError.STORAGE_PATH_NULL_OR_BLANK: "Requested storage path is null or blank: {} {} [{}]",
668
+ ExplicitError.STORAGE_PATH_NOT_RELATIVE: "Requested storage path is not a relative path: {} {} [{}]",
669
+ ExplicitError.STORAGE_PATH_OUTSIDE_ROOT: "Requested storage path is outside the storage root directory: {} {} [{}]", # noqa
670
+ ExplicitError.STORAGE_PATH_IS_ROOT: "Requested operation not allowed on the storage root directory: {} {} [{}]",
671
+ ExplicitError.STORAGE_PATH_INVALID: "Requested storage path is invalid: {} {} [{}]",
672
+
673
+ ExplicitError.OBJECT_NOT_FOUND: "Object not found in storage layer: {} {} [{}]",
674
+ ExplicitError.OBJECT_ALREADY_EXISTS: "Object already exists in storage layer: {} {} [{}]",
675
+ ExplicitError.NOT_A_FILE: "Object is not a file: {} {} [{}]",
676
+ ExplicitError.NOT_A_DIRECTORY: "Object is not a directory: {} {} [{}]",
677
+ ExplicitError.NOT_A_FILE_OR_DIRECTORY: "Object is not a file or directory: {} {} [{}]",
678
+ ExplicitError.IO_ERROR: "An IO error occurred in the storage layer: {} {} [{}]",
679
+
680
+ ExplicitError.ACCESS_DENIED: "Access denied in storage layer: {} {} [{}]",
681
+
682
+ ExplicitError.UNKNOWN_ERROR: "An unexpected error occurred in the storage layer: {} {} [{}]",
683
+ }
684
+
685
+ _ERROR_TYPE_MAP = {
686
+
687
+ ExplicitError.STORAGE_PATH_NULL_OR_BLANK: _ex.EStorageValidation,
688
+ ExplicitError.STORAGE_PATH_NOT_RELATIVE: _ex.EStorageValidation,
689
+ ExplicitError.STORAGE_PATH_OUTSIDE_ROOT: _ex.EStorageValidation,
690
+ ExplicitError.STORAGE_PATH_IS_ROOT: _ex.EStorageValidation,
691
+ ExplicitError.STORAGE_PATH_INVALID: _ex.EStorageValidation,
692
+
693
+ ExplicitError.OBJECT_NOT_FOUND: _ex.EStorageRequest,
694
+ ExplicitError.OBJECT_ALREADY_EXISTS: _ex.EStorageRequest,
695
+ ExplicitError.NOT_A_FILE: _ex.EStorageRequest,
696
+ ExplicitError.NOT_A_DIRECTORY: _ex.EStorageRequest,
697
+ ExplicitError.NOT_A_FILE_OR_DIRECTORY: _ex.EStorageRequest,
698
+ ExplicitError.IO_ERROR: _ex.EStorageRequest,
699
+
700
+ ExplicitError.ACCESS_DENIED: _ex.EStorageAccess,
701
+
702
+ ExplicitError.UNKNOWN_ERROR: _ex.ETracInternal
703
+ }
704
+
705
+
706
+ # ----------------------------------------------------------------------------------------------------------------------
707
+ # COMMON DATA STORAGE IMPLEMENTATION
135
708
  # ----------------------------------------------------------------------------------------------------------------------
136
709
 
137
710
 
@@ -178,7 +751,7 @@ class CommonDataStorage(IDataStorage):
178
751
  dir_content = self.__file_storage.ls(storage_path)
179
752
 
180
753
  if len(dir_content) == 1:
181
- storage_path = storage_path.rstrip("/\\") + "/" + dir_content[0]
754
+ storage_path = dir_content[0].storage_path
182
755
  else:
183
756
  raise NotImplementedError("Directory storage format not available yet")
184
757
 
@@ -219,13 +792,16 @@ class CommonDataStorage(IDataStorage):
219
792
  if not storage_path.endswith(extension):
220
793
  parent_dir_ = storage_path
221
794
  storage_path_ = storage_path.rstrip("/\\") + f"/chunk-0.{extension}"
222
- self.__file_storage.mkdir(parent_dir_, True, exists_ok=overwrite)
795
+ self.__file_storage.mkdir(parent_dir_, True)
223
796
  else:
224
797
  parent_dir_ = str(pathlib.PurePath(storage_path).parent)
225
798
  storage_path_ = storage_path
226
- self.__file_storage.mkdir(parent_dir_, True, True)
799
+ self.__file_storage.mkdir(parent_dir_, True)
800
+
801
+ if not overwrite and self.__file_storage.exists(storage_path_):
802
+ raise _ex.EStorageRequest(f"File already exists: [{storage_path_}]")
227
803
 
228
- with self.__file_storage.write_byte_stream(storage_path_, overwrite=overwrite) as byte_stream:
804
+ with self.__file_storage.write_byte_stream(storage_path_) as byte_stream:
229
805
  codec.write_table(byte_stream, table)
230
806
 
231
807
  except (_ex.EStorage, _ex.EData) as e: