tracdap-runtime 0.6.0rc1__py3-none-any.whl → 0.6.0rc3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (39)
  1. tracdap/rt/_impl/data.py +53 -16
  2. tracdap/rt/_impl/storage.py +92 -27
  3. tracdap/rt/_plugins/storage_aws.py +158 -142
  4. tracdap/rt/_plugins/storage_azure.py +155 -0
  5. tracdap/rt/_plugins/storage_gcp.py +72 -15
  6. tracdap/rt/_plugins/storage_local.py +11 -6
  7. tracdap/rt/_version.py +1 -1
  8. tracdap/rt/config/__init__.py +12 -17
  9. tracdap/rt/config/common.py +10 -0
  10. tracdap/rt/config/common_pb2.py +38 -31
  11. tracdap/rt/config/job_pb2.py +21 -20
  12. tracdap/rt/config/platform.py +60 -25
  13. tracdap/rt/config/platform_pb2.py +52 -45
  14. tracdap/rt/config/result_pb2.py +15 -14
  15. tracdap/rt/config/runtime.py +0 -1
  16. tracdap/rt/config/runtime_pb2.py +24 -24
  17. tracdap/rt/ext/storage.py +2 -2
  18. tracdap/rt/metadata/__init__.py +20 -20
  19. tracdap/rt/metadata/common_pb2.py +15 -14
  20. tracdap/rt/metadata/custom_pb2.py +9 -8
  21. tracdap/rt/metadata/data_pb2.py +31 -30
  22. tracdap/rt/metadata/file_pb2.py +9 -8
  23. tracdap/rt/metadata/flow_pb2.py +33 -32
  24. tracdap/rt/metadata/job_pb2.py +55 -54
  25. tracdap/rt/metadata/model_pb2.py +31 -30
  26. tracdap/rt/metadata/object_id_pb2.py +13 -12
  27. tracdap/rt/metadata/object_pb2.py +9 -8
  28. tracdap/rt/metadata/search_pb2.py +19 -18
  29. tracdap/rt/metadata/stoarge_pb2.py +31 -30
  30. tracdap/rt/metadata/tag_pb2.py +13 -12
  31. tracdap/rt/metadata/tag_update_pb2.py +11 -10
  32. tracdap/rt/metadata/type_pb2.py +29 -28
  33. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/METADATA +27 -15
  34. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/RECORD +37 -38
  35. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/WHEEL +1 -1
  36. tracdap/rt/config/gateway.py +0 -104
  37. tracdap/rt/config/gateway_pb2.py +0 -45
  38. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/LICENSE +0 -0
  39. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/data.py CHANGED
@@ -74,11 +74,7 @@ class DataView:
 
 
 class _DataInternal:
-
-    @staticmethod
-    def float_dtype_check():
-        if "Float64Dtype" not in pd.__dict__:
-            raise _ex.EStartup("TRAC D.A.P. requires Pandas >= 1.2")
+    pass
 
 
 class DataMapping:
@@ -111,8 +107,40 @@ class DataMapping:
     }
 
     # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
-    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype
+    __PANDAS_VERSION_ELEMENTS = pd.__version__.split(".")
+    __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
+    __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
+
+    if __PANDAS_MAJOR_VERSION == 2:
+
+        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+
+        @classmethod
+        def __pandas_datetime_type(cls, tz, unit):
+            if tz is None and unit is None:
+                return cls.__PANDAS_DATETIME_TYPE
+            _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
+            if tz is None:
+                return pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+            else:
+                return pd.DatetimeTZDtype(tz=tz, unit=_unit)
+
+    # Minimum supported version for Pandas is 1.2, when pd.Float64Dtype was introduced
+    elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
+
+        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).dtype
+        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+
+        @classmethod
+        def __pandas_datetime_type(cls, tz, unit):  # noqa
+            if tz is None:
+                return cls.__PANDAS_DATETIME_TYPE
+            else:
+                return pd.DatetimeTZDtype(tz=tz)
+
+    else:
+        raise _ex.EStartup(f"Pandas version not supported: [{pd.__version__}]")
 
     # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
     __ARROW_TO_PANDAS_TYPE_MAPPING = {
@@ -224,8 +252,12 @@ class DataMapping:
             cls.__TRAC_DECIMAL_SCALE)
 
     @classmethod
-    def pandas_datetime_type(cls):
-        return cls.__PANDAS_DATETIME_TYPE
+    def pandas_date_type(cls):
+        return cls.__PANDAS_DATE_TYPE
+
+    @classmethod
+    def pandas_datetime_type(cls, tz=None, unit=None):
+        return cls.__pandas_datetime_type(tz, unit)
 
     @classmethod
     def view_to_pandas(
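
Note: the reworked pandas_datetime_type() now takes optional tz and unit arguments, dispatching to the version-specific implementation selected above. A minimal usage sketch, assuming Pandas 2.x and calling the internal DataMapping class directly (for illustration only; the default unit comes from __TRAC_TIMESTAMP_UNIT, which is not shown in this diff):

```python
import pandas as pd
from tracdap.rt._impl.data import DataMapping  # internal module, imported for illustration only

# Default dtype for TRAC timestamps (unit depends on __TRAC_TIMESTAMP_UNIT, not shown in this diff)
default_dtype = DataMapping.pandas_datetime_type()

# Timezone-aware dtype at microsecond precision (Pandas 2.x supports non-nanosecond units)
utc_us_dtype = DataMapping.pandas_datetime_type(tz="UTC", unit="us")

series = pd.Series(pd.to_datetime(["2000-01-01T12:00:00"]).tz_localize("UTC"))
series = series.astype(utc_us_dtype)
print(series.dtype)  # datetime64[us, UTC]
```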
@@ -297,6 +329,7 @@ class DataMapping:
         else:
             DataConformance.check_duplicate_fields(table.schema.names, False)
 
+        # Use Arrow's built-in function to convert to Pandas
         return table.to_pandas(
 
             # Mapping for arrow -> pandas types for core types
@@ -463,7 +496,7 @@ class DataConformance:
 
             table_column: pa.Array = table.column(table_index)
 
-            pandas_type = pandas_types[table_index] \
+            pandas_type = pandas_types.iloc[table_index] \
                 if pandas_types is not None \
                 else None
 
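
Note: the change from pandas_types[table_index] to pandas_types.iloc[table_index] suggests pandas_types is a pandas Series (for example DataFrame.dtypes), where plain [] with an integer relies on positional fallback that is deprecated in Pandas 2.x. A standalone illustration (plain pandas, not TRAC code):

```python
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2.0]})
dtypes = df.dtypes            # a Series indexed by column name

# Positional access always means "the n-th element", regardless of index labels
print(dtypes.iloc[1])         # float64

# dtypes[1] would rely on integer-label fallback, which raises a FutureWarning
# in Pandas 2.x and is slated for removal, hence the change in this diff
```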
@@ -691,16 +724,20 @@ class DataConformance:
     @classmethod
     def _coerce_date(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
 
-        # Allow casting date32 -> date64, both range and precision are greater so there is no data loss
+        # The bit-width restriction could be removed here
+        # For date types there is never loss of precision and pa.cast will raise an error on overflow
+        # Impact to client code is unlikely, still this change should happen with a TRAC minor version update
         if pa.types.is_date(vector.type):
             if field.type.bit_width >= vector.type.bit_width:
                 return pc.cast(vector, field.type)
 
-        # Special handling for Pandas/NumPy date values
-        # These are encoded as np.datetime64[ns] in Pandas -> pa.timestamp64[ns] in Arrow
-        # Only allow this conversion if the vector is coming from Pandas with datetime type
-        if pandas_type == DataMapping.pandas_datetime_type():
-            if pa.types.is_timestamp(vector.type) and vector.type.unit == "ns":
+        # Special handling for date values coming from Pandas/NumPy
+        # Only allow these conversions if the vector is supplied with Pandas type info
+        # For Pandas 1.x, dates are always encoded as np.datetime64[ns]
+        # For Pandas 2.x dates are still np.datetime64 but can be in s, ms, us or ns
+        # This conversion will not apply to dates held in Pandas using the Python date object types
+        if pandas_type is not None:
+            if pa.types.is_timestamp(vector.type) and pd.api.types.is_datetime64_any_dtype(pandas_type):
                 return pc.cast(vector, field.type)
 
         error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
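
Note: the new check relies on pd.api.types.is_datetime64_any_dtype(), which matches any datetime64 resolution as well as timezone-aware dtypes, so Pandas 2.x frames using non-nanosecond units still coerce. A quick illustration of what the predicate accepts (plain pandas, not TRAC code):

```python
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

print(is_datetime64_any_dtype(pd.to_datetime(["2000-01-01"]).dtype))  # True  - datetime64[ns]
print(is_datetime64_any_dtype(pd.DatetimeTZDtype(tz="UTC")))          # True  - timezone-aware
print(is_datetime64_any_dtype(pd.Series([1, 2, 3]).dtype))            # False - int64

# On Pandas 2.x, non-nanosecond resolutions also match, which is what the new check relies on
print(is_datetime64_any_dtype(pd.to_datetime(["2000-01-01"]).as_unit("ms").dtype))  # True - datetime64[ms]
```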
tracdap/rt/_impl/storage.py CHANGED
@@ -18,6 +18,7 @@ import pathlib
 import re
 import sys
 import typing as tp
+import traceback as tb
 
 import pyarrow as pa
 import pyarrow.fs as pa_fs
@@ -178,20 +179,17 @@ class StorageManager:
 # ----------------------------------------------------------------------------------------------------------------------
 
 
-class _NativeFileResource(pa_lib.NativeFile):
+class _NativeFileContext(tp.ContextManager[tp.BinaryIO]):
 
     def __init__(self, nf: pa_lib.NativeFile, close_func: tp.Callable):
         super().__init__()
         self.__nf = nf
         self.__close_func = close_func
 
-    def __getattribute__(self, item):
-        if item == "close" or item == "_NativeFileResource__nf" or item == "_NativeFileResource__close_func":
-            return object.__getattribute__(self, item)
-        else:
-            return object.__getattribute__(self.__nf, item)
+    def __enter__(self):
+        return self.__nf
 
-    def close(self):
+    def __exit__(self, exc_type, exc_val, exc_tb):
         try:
             self.__close_func()
         finally:
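
Note: with the proxy class replaced by a context manager, byte streams are now meant to be consumed in a with block, so the close callback runs on exit even after an error. A hedged sketch of the calling pattern, assuming IFileStorage is the interface defined in tracdap/rt/ext/storage.py (listed above) and that a storage instance is obtained from TRAC's storage manager:

```python
from tracdap.rt.ext.storage import IFileStorage  # assumed location of the interface

def copy_file(storage: IFileStorage, src: str, dst: str) -> None:
    # Both streams are context managers after this change, so the close callback
    # (logging, delete-on-error for writes) is driven by the with blocks
    with storage.read_byte_stream(src) as in_stream:
        data = in_stream.read()
    with storage.write_byte_stream(dst) as out_stream:
        out_stream.write(data)
```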
@@ -200,24 +198,40 @@ class _NativeFileResource(pa_lib.NativeFile):
 
 class CommonFileStorage(IFileStorage):
 
+    _TRAC_DIR_MARKER = "/.trac_dir"
+
     FILE_SEMANTICS_FS_TYPES = ["local"]
-    BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs"]
+    BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs", "abfs"]
 
-    def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, fs_impl: pa_fs.SubTreeFileSystem):
+    def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, fs: pa_fs.SubTreeFileSystem):
 
         self._log = _util.logger_for_object(self)
         self._key = storage_key
         self._config = storage_config
-        self._fs = fs_impl
+        self._fs = fs
+
+        fs_type = fs.base_fs.type_name
+        fs_impl = "arrow"
+        fs_root = fs.base_path
 
-        fs_type = fs_impl.base_fs.type_name
-        fs_root = fs_impl.base_path
+        # If this is an FSSpec implementation, take the protocol from FSSpec as the FS type
+        base_fs = fs.base_fs
+        if isinstance(base_fs, pa_fs.PyFileSystem):
+            handler = base_fs.handler
+            if isinstance(handler, pa_fs.FSSpecHandler):
+                fs_type = handler.fs.protocol[0] if isinstance(handler.fs.protocol, tuple) else handler.fs.protocol
+                fs_impl = "fsspec"
 
         # Some optimization is possible if the underlying storage semantics are known
         self._file_semantics = True if fs_type in self.FILE_SEMANTICS_FS_TYPES else False
         self._bucket_semantics = True if fs_type in self.BUCKET_SEMANTICS_FS_TYPES else False
+        self._explicit_dir_semantics = True if self._bucket_semantics and fs_impl == "fsspec" else False
 
-        self._log.info(f"INIT [{self._key}]: Common file storage, fs = [{fs_type}], root = [{fs_root}]")
+        self._log.info(
+            f"INIT [{self._key}]: Common file storage, " +
+            f"fs = [{fs_type}], " +
+            f"impl = [{fs_impl}], " +
+            f"root = [{fs_root}]")
 
     def exists(self, storage_path: str) -> bool:
 
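
Note: the protocol sniffing above uses PyArrow's FSSpec bridge, where a wrapped fsspec filesystem is reachable via PyFileSystem.handler and FSSpecHandler.fs, and fsspec reports its scheme through the protocol attribute (a string or tuple). A standalone sketch of the same check using fsspec's in-memory filesystem (illustrative, not TRAC code):

```python
import fsspec
import pyarrow.fs as pa_fs

inner_fs = fsspec.filesystem("memory")                         # any fsspec filesystem
arrow_fs = pa_fs.PyFileSystem(pa_fs.FSSpecHandler(inner_fs))   # wrapped for use with PyArrow

handler = arrow_fs.handler
if isinstance(handler, pa_fs.FSSpecHandler):
    protocol = handler.fs.protocol
    fs_type = protocol[0] if isinstance(protocol, tuple) else protocol
    print(fs_type)                                             # "memory"
```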
@@ -280,6 +294,13 @@ class CommonFileStorage(IFileStorage):
 
         file_type = FileType.FILE if file_info.is_file else FileType.DIRECTORY
         file_size = file_info.size if file_info.is_file else 0
+
+        # Normalization in case the impl gives back directory entries with a trailing slash
+        if file_type == FileType.DIRECTORY and storage_path.endswith("/"):
+            storage_path = storage_path[:-1]
+            separator = storage_path.rfind("/")
+            file_name = storage_path[separator+1:]
+
         mtime = file_info.mtime.astimezone(dt.timezone.utc) if file_info.mtime is not None else None
 
         return FileStat(
@@ -307,8 +328,10 @@ class CommonFileStorage(IFileStorage):
 
         # Otherwise do a normal directory listing
         else:
-            selector = pa_fs.FileSelector(resolved_path, recursive=recursive) # noqa
+            # A trailing slash prevents some implementations including the directory in its own listing
+            selector = pa_fs.FileSelector(resolved_path + "/", recursive=recursive) # noqa
             file_infos = self._fs.get_file_info(selector)
+            file_infos = filter(lambda fi: not fi.path.endswith(self._TRAC_DIR_MARKER), file_infos)
             return list(map(self._info_to_stat, file_infos))
 
     def mkdir(self, storage_path: str, recursive: bool = False):
  def mkdir(self, storage_path: str, recursive: bool = False):
@@ -323,11 +346,32 @@ class CommonFileStorage(IFileStorage):
323
346
  # In cloud bucket semantics a file and dir can both exist with the same name - very confusing!
324
347
  # There is a race condition here because a file could be created by another process
325
348
  # But, given the very structured way TRAC uses file storage, this is extremely unlikely
349
+
326
350
  prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
327
351
  if prior_stat.type == pa_fs.FileType.File or prior_stat.type == pa_fs.FileType.Unknown:
328
352
  raise self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
329
353
 
330
- self._fs.create_dir(resolved_path, recursive=recursive)
354
+ # For most FS types, it is fine to use the Arrow create_dir() method
355
+ # For bucket-like storage, this will normally create an empty blob with a name like "my_dir/"
356
+
357
+ if not self._explicit_dir_semantics:
358
+ self._fs.create_dir(resolved_path, recursive=recursive)
359
+ return
360
+
361
+ # Some FS backends for bucket-like storage do not allow empty blobs as directories
362
+ # For these backends, we have to create an explicit marker file inside the directory
363
+ # In this case it is also necessary to check parents explicitly for non-recursive requests
364
+
365
+ if not recursive and prior_stat.type == pa_fs.FileType.NotFound:
366
+ parent_path = self._resolve_parent(resolved_path)
367
+ if parent_path is not None:
368
+ parent_stat: pa_fs.FileInfo = self._fs.get_file_info(parent_path)
369
+ if parent_stat.type != pa_fs.FileType.Directory:
370
+ raise FileNotFoundError
371
+
372
+ dir_marker = resolved_path + self._TRAC_DIR_MARKER
373
+ with self._fs.open_output_stream(dir_marker) as stream:
374
+ stream.write(b"")
331
375
 
332
376
  def rm(self, storage_path: str):
333
377
 
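
Note: on FSSpec-backed bucket storage a directory is represented by an empty marker object named <dir>/.trac_dir, and the ls() change above filters those markers back out of listings. A rough sketch of the idea against fsspec's in-memory filesystem (illustrative only; the real logic lives in CommonFileStorage):

```python
import fsspec

TRAC_DIR_MARKER = "/.trac_dir"   # marker name taken from this diff

fs = fsspec.filesystem("memory")

def mkdir_with_marker(path: str) -> None:
    # Object stores have no real directories, so write an empty marker object instead
    with fs.open(path + TRAC_DIR_MARKER, "wb") as stream:
        stream.write(b"")

def ls_without_markers(path: str) -> list:
    # Hide the marker objects when listing, as the ls() change above does
    return [entry for entry in fs.ls(path, detail=False) if not entry.endswith(TRAC_DIR_MARKER)]

mkdir_with_marker("/data/outputs")
with fs.open("/data/outputs/result.csv", "wb") as stream:
    stream.write(b"a,b\n1,2\n")

print(ls_without_markers("/data/outputs"))   # ['/data/outputs/result.csv']
```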
@@ -357,11 +401,11 @@ class CommonFileStorage(IFileStorage):
 
         self._fs.delete_dir(resolved_path)
 
-    def read_byte_stream(self, storage_path: str) -> tp.BinaryIO:
+    def read_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         return self._wrap_operation(self._read_byte_stream, "OPEN BYTE STREAM (READ)", storage_path)
 
-    def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.BinaryIO:
+    def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         resolved_path = self._resolve_path(operation_name, storage_path, False)
 
@@ -382,13 +426,13 @@ class CommonFileStorage(IFileStorage):
         stream = self._fs.open_input_file(resolved_path)
 
         # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
-        return _NativeFileResource(stream, lambda: self._close_byte_stream(storage_path, stream, False)) # noqa
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, False)) # noqa
 
-    def write_byte_stream(self, storage_path: str) -> tp.BinaryIO:
+    def write_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         return self._wrap_operation(self._write_byte_stream, "OPEN BYTE STREAM (WRITE)", storage_path)
 
-    def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.BinaryIO:
+    def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         resolved_path = self._resolve_path(operation_name, storage_path, False)
 
@@ -415,14 +459,10 @@ class CommonFileStorage(IFileStorage):
         stream = self._fs.open_output_stream(resolved_path)
 
         # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
-        return _NativeFileResource(stream, lambda: self._close_byte_stream(storage_path, stream, True, delete_on_error)) # noqa
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, True, delete_on_error)) # noqa
 
     def _close_byte_stream(self, storage_path: str, stream: tp.BinaryIO, is_write: bool, delete_on_error: bool = False):
 
-        # Do not try to close the stream twice
-        if stream.closed:
-            return
-
         # If there has been an error, log it
         exc_info = sys.exc_info()
         error = exc_info[1] if exc_info is not None else None
438
478
  # Close the stream - this may take time for write streams that are not flushed
439
479
  # Closing here gives better logs, because any pause is before the close message
440
480
  # As a fail-safe, _NativeFileResource always calls close() in a "finally" block
441
- stream.close()
481
+ if not stream.closed:
482
+ stream.close()
442
483
 
443
484
  # Log closing of the stream
444
485
  if is_write:
@@ -454,9 +495,33 @@ class CommonFileStorage(IFileStorage):
454
495
  file_info = self._fs.get_file_info(storage_path)
455
496
  if file_info.type != pa_fs.FileType.NotFound:
456
497
  self._fs.delete_file(storage_path)
457
- except OSError:
498
+ # different implementations can throw different errors here
499
+ except Exception: # noqa
458
500
  pass
459
501
 
502
+ # Stream implementations can raise various types of error during stream operations
503
+ # Errors can have different causes (access, communication, missing / duplicate files etc.)
504
+ # Also, other errors can occur inside the stream context manager, unrelated to IO
505
+
506
+ # In the case of an IO error we want to raise EStorage, other errors should propagate as they are
507
+ # This handler tries to spot IO errors from inside the PyArrow library, it is probably not fail-safe
508
+ # If an IO error is not spotted, the original error will propagate and get reported as EUnexpected
509
+ # Anyway this handler is only for errors that happen after the stream is opened
510
+
511
+ # The alternative is to override every method in _NativeFileResource and try to catch there
512
+ # However, different implementations raise different error types, so we still need some kind of inspection
513
+
514
+ if error is not None:
515
+
516
+ if isinstance(error, OSError):
517
+ raise _ex.EStorage from error
518
+
519
+ stack = tb.extract_tb(exc_info[2])
520
+ stack = filter(lambda frame: frame.filename is not None, stack)
521
+
522
+ if any(filter(lambda frame: frame.filename.startswith("pyarrow/"), stack)):
523
+ raise _ex.EStorage from error
524
+
460
525
  def _wrap_operation(self, func: tp.Callable, operation_name: str, storage_path: str, *args, **kwargs) -> tp.Any:
461
526
 
462
527
  operation = f"{operation_name} {self._key} [{storage_path}]"
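
Note: the traceback inspection at the end of _close_byte_stream() is a heuristic: an OSError, or any exception raised from PyArrow's own frames (Cython sources report filenames like "pyarrow/io.pxi"), is re-raised as a storage error, while anything else propagates unchanged. A standalone sketch of the same idea, substituting a generic exception class for TRAC's _ex.EStorage:

```python
import sys
import traceback as tb

class StorageError(RuntimeError):
    """Stand-in for TRAC's _ex.EStorage, used here for illustration only."""

def classify_stream_error() -> None:
    # Call from an except block: re-raise IO-looking errors as StorageError, let others propagate
    _exc_type, error, trace = sys.exc_info()
    if error is None:
        return

    if isinstance(error, OSError):
        raise StorageError from error

    frames = [frame for frame in tb.extract_tb(trace) if frame.filename is not None]
    if any(frame.filename.startswith("pyarrow/") for frame in frames):
        raise StorageError from error

    raise error  # not an IO error, propagate unchanged
```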