tracdap-runtime 0.6.0rc1__py3-none-any.whl → 0.6.0rc3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (39)
  1. tracdap/rt/_impl/data.py +53 -16
  2. tracdap/rt/_impl/storage.py +92 -27
  3. tracdap/rt/_plugins/storage_aws.py +158 -142
  4. tracdap/rt/_plugins/storage_azure.py +155 -0
  5. tracdap/rt/_plugins/storage_gcp.py +72 -15
  6. tracdap/rt/_plugins/storage_local.py +11 -6
  7. tracdap/rt/_version.py +1 -1
  8. tracdap/rt/config/__init__.py +12 -17
  9. tracdap/rt/config/common.py +10 -0
  10. tracdap/rt/config/common_pb2.py +38 -31
  11. tracdap/rt/config/job_pb2.py +21 -20
  12. tracdap/rt/config/platform.py +60 -25
  13. tracdap/rt/config/platform_pb2.py +52 -45
  14. tracdap/rt/config/result_pb2.py +15 -14
  15. tracdap/rt/config/runtime.py +0 -1
  16. tracdap/rt/config/runtime_pb2.py +24 -24
  17. tracdap/rt/ext/storage.py +2 -2
  18. tracdap/rt/metadata/__init__.py +20 -20
  19. tracdap/rt/metadata/common_pb2.py +15 -14
  20. tracdap/rt/metadata/custom_pb2.py +9 -8
  21. tracdap/rt/metadata/data_pb2.py +31 -30
  22. tracdap/rt/metadata/file_pb2.py +9 -8
  23. tracdap/rt/metadata/flow_pb2.py +33 -32
  24. tracdap/rt/metadata/job_pb2.py +55 -54
  25. tracdap/rt/metadata/model_pb2.py +31 -30
  26. tracdap/rt/metadata/object_id_pb2.py +13 -12
  27. tracdap/rt/metadata/object_pb2.py +9 -8
  28. tracdap/rt/metadata/search_pb2.py +19 -18
  29. tracdap/rt/metadata/stoarge_pb2.py +31 -30
  30. tracdap/rt/metadata/tag_pb2.py +13 -12
  31. tracdap/rt/metadata/tag_update_pb2.py +11 -10
  32. tracdap/rt/metadata/type_pb2.py +29 -28
  33. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/METADATA +27 -15
  34. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/RECORD +37 -38
  35. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/WHEEL +1 -1
  36. tracdap/rt/config/gateway.py +0 -104
  37. tracdap/rt/config/gateway_pb2.py +0 -45
  38. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/LICENSE +0 -0
  39. {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/data.py CHANGED
@@ -74,11 +74,7 @@ class DataView:
 
 
 class _DataInternal:
-
-    @staticmethod
-    def float_dtype_check():
-        if "Float64Dtype" not in pd.__dict__:
-            raise _ex.EStartup("TRAC D.A.P. requires Pandas >= 1.2")
+    pass
 
 
 class DataMapping:
@@ -111,8 +107,40 @@ class DataMapping:
     }
 
     # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
-    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype
+    __PANDAS_VERSION_ELEMENTS = pd.__version__.split(".")
+    __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
+    __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
+
+    if __PANDAS_MAJOR_VERSION == 2:
+
+        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+
+        @classmethod
+        def __pandas_datetime_type(cls, tz, unit):
+            if tz is None and unit is None:
+                return cls.__PANDAS_DATETIME_TYPE
+            _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
+            if tz is None:
+                return pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+            else:
+                return pd.DatetimeTZDtype(tz=tz, unit=_unit)
+
+    # Minimum supported version for Pandas is 1.2, when pd.Float64Dtype was introduced
+    elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
+
+        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).dtype
+        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+
+        @classmethod
+        def __pandas_datetime_type(cls, tz, unit):  # noqa
+            if tz is None:
+                return cls.__PANDAS_DATETIME_TYPE
+            else:
+                return pd.DatetimeTZDtype(tz=tz)
+
+    else:
+        raise _ex.EStartup(f"Pandas version not supported: [{pd.__version__}]")
 
     # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
     __ARROW_TO_PANDAS_TYPE_MAPPING = {
@@ -224,8 +252,12 @@ class DataMapping:
             cls.__TRAC_DECIMAL_SCALE)
 
     @classmethod
-    def pandas_datetime_type(cls):
-        return cls.__PANDAS_DATETIME_TYPE
+    def pandas_date_type(cls):
+        return cls.__PANDAS_DATE_TYPE
+
+    @classmethod
+    def pandas_datetime_type(cls, tz=None, unit=None):
+        return cls.__pandas_datetime_type(tz, unit)
 
     @classmethod
     def view_to_pandas(
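
Note: the reworked pandas_datetime_type() now takes optional tz and unit arguments, dispatching to the version-specific implementation selected above. A minimal usage sketch, assuming Pandas 2.x and calling the internal DataMapping class directly (for illustration only; the default unit comes from __TRAC_TIMESTAMP_UNIT, which is not shown in this diff):

```python
import pandas as pd
from tracdap.rt._impl.data import DataMapping  # internal module, imported for illustration only

# Default dtype for TRAC timestamps (unit depends on __TRAC_TIMESTAMP_UNIT, not shown in this diff)
default_dtype = DataMapping.pandas_datetime_type()

# Timezone-aware dtype at microsecond precision (Pandas 2.x supports non-nanosecond units)
utc_us_dtype = DataMapping.pandas_datetime_type(tz="UTC", unit="us")

series = pd.Series(pd.to_datetime(["2000-01-01T12:00:00"]).tz_localize("UTC"))
series = series.astype(utc_us_dtype)
print(series.dtype)  # datetime64[us, UTC]
```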
@@ -297,6 +329,7 @@ class DataMapping:
         else:
             DataConformance.check_duplicate_fields(table.schema.names, False)
 
+        # Use Arrow's built-in function to convert to Pandas
         return table.to_pandas(
 
             # Mapping for arrow -> pandas types for core types
@@ -463,7 +496,7 @@ class DataConformance:
 
             table_column: pa.Array = table.column(table_index)
 
-            pandas_type = pandas_types[table_index] \
+            pandas_type = pandas_types.iloc[table_index] \
                 if pandas_types is not None \
                 else None
 
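
Note: the change from pandas_types[table_index] to pandas_types.iloc[table_index] suggests pandas_types is a pandas Series (for example DataFrame.dtypes), where plain [] with an integer relies on positional fallback that is deprecated in Pandas 2.x. A standalone illustration (plain pandas, not TRAC code):

```python
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2.0]})
dtypes = df.dtypes            # a Series indexed by column name

# Positional access always means "the n-th element", regardless of index labels
print(dtypes.iloc[1])         # float64

# dtypes[1] would rely on integer-label fallback, which raises a FutureWarning
# in Pandas 2.x and is slated for removal, hence the change in this diff
```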
@@ -691,16 +724,20 @@ class DataConformance:
     @classmethod
     def _coerce_date(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
 
-        # Allow casting date32 -> date64, both range and precision are greater so there is no data loss
+        # The bit-width restriction could be removed here
+        # For date types there is never loss of precision and pa.cast will raise an error on overflow
+        # Impact to client code is unlikely, still this change should happen with a TRAC minor version update
         if pa.types.is_date(vector.type):
             if field.type.bit_width >= vector.type.bit_width:
                 return pc.cast(vector, field.type)
 
-        # Special handling for Pandas/NumPy date values
-        # These are encoded as np.datetime64[ns] in Pandas -> pa.timestamp64[ns] in Arrow
-        # Only allow this conversion if the vector is coming from Pandas with datetime type
-        if pandas_type == DataMapping.pandas_datetime_type():
-            if pa.types.is_timestamp(vector.type) and vector.type.unit == "ns":
+        # Special handling for date values coming from Pandas/NumPy
+        # Only allow these conversions if the vector is supplied with Pandas type info
+        # For Pandas 1.x, dates are always encoded as np.datetime64[ns]
+        # For Pandas 2.x dates are still np.datetime64 but can be in s, ms, us or ns
+        # This conversion will not apply to dates held in Pandas using the Python date object types
+        if pandas_type is not None:
+            if pa.types.is_timestamp(vector.type) and pd.api.types.is_datetime64_any_dtype(pandas_type):
                 return pc.cast(vector, field.type)
 
         error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
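
Note: the new check relies on pd.api.types.is_datetime64_any_dtype(), which matches any datetime64 resolution as well as timezone-aware dtypes, so Pandas 2.x frames using non-nanosecond units still coerce. A quick illustration of what the predicate accepts (plain pandas, not TRAC code):

```python
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

print(is_datetime64_any_dtype(pd.to_datetime(["2000-01-01"]).dtype))  # True  - datetime64[ns]
print(is_datetime64_any_dtype(pd.DatetimeTZDtype(tz="UTC")))          # True  - timezone-aware
print(is_datetime64_any_dtype(pd.Series([1, 2, 3]).dtype))            # False - int64

# On Pandas 2.x, non-nanosecond resolutions also match, which is what the new check relies on
print(is_datetime64_any_dtype(pd.to_datetime(["2000-01-01"]).as_unit("ms").dtype))  # True - datetime64[ms]
```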
tracdap/rt/_impl/storage.py CHANGED
@@ -18,6 +18,7 @@ import pathlib
 import re
 import sys
 import typing as tp
+import traceback as tb
 
 import pyarrow as pa
 import pyarrow.fs as pa_fs
@@ -178,20 +179,17 @@ class StorageManager:
 # ----------------------------------------------------------------------------------------------------------------------
 
 
-class _NativeFileResource(pa_lib.NativeFile):
+class _NativeFileContext(tp.ContextManager[tp.BinaryIO]):
 
     def __init__(self, nf: pa_lib.NativeFile, close_func: tp.Callable):
         super().__init__()
         self.__nf = nf
         self.__close_func = close_func
 
-    def __getattribute__(self, item):
-        if item == "close" or item == "_NativeFileResource__nf" or item == "_NativeFileResource__close_func":
-            return object.__getattribute__(self, item)
-        else:
-            return object.__getattribute__(self.__nf, item)
+    def __enter__(self):
+        return self.__nf
 
-    def close(self):
+    def __exit__(self, exc_type, exc_val, exc_tb):
         try:
             self.__close_func()
         finally:
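
Note: with the proxy class replaced by a context manager, byte streams are now meant to be consumed in a with block, so the close callback runs on exit even after an error. A hedged sketch of the calling pattern, assuming IFileStorage is the interface defined in tracdap/rt/ext/storage.py (listed above) and that a storage instance is obtained from TRAC's storage manager:

```python
from tracdap.rt.ext.storage import IFileStorage  # assumed location of the interface

def copy_file(storage: IFileStorage, src: str, dst: str) -> None:
    # Both streams are context managers after this change, so the close callback
    # (logging, delete-on-error for writes) is driven by the with blocks
    with storage.read_byte_stream(src) as in_stream:
        data = in_stream.read()
    with storage.write_byte_stream(dst) as out_stream:
        out_stream.write(data)
```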
@@ -200,24 +198,40 @@ class _NativeFileResource(pa_lib.NativeFile):
 
 class CommonFileStorage(IFileStorage):
 
+    _TRAC_DIR_MARKER = "/.trac_dir"
+
     FILE_SEMANTICS_FS_TYPES = ["local"]
-    BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs"]
+    BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs", "abfs"]
 
-    def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, fs_impl: pa_fs.SubTreeFileSystem):
+    def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, fs: pa_fs.SubTreeFileSystem):
 
         self._log = _util.logger_for_object(self)
         self._key = storage_key
         self._config = storage_config
-        self._fs = fs_impl
+        self._fs = fs
+
+        fs_type = fs.base_fs.type_name
+        fs_impl = "arrow"
+        fs_root = fs.base_path
 
-        fs_type = fs_impl.base_fs.type_name
-        fs_root = fs_impl.base_path
+        # If this is an FSSpec implementation, take the protocol from FSSpec as the FS type
+        base_fs = fs.base_fs
+        if isinstance(base_fs, pa_fs.PyFileSystem):
+            handler = base_fs.handler
+            if isinstance(handler, pa_fs.FSSpecHandler):
+                fs_type = handler.fs.protocol[0] if isinstance(handler.fs.protocol, tuple) else handler.fs.protocol
+                fs_impl = "fsspec"
 
         # Some optimization is possible if the underlying storage semantics are known
         self._file_semantics = True if fs_type in self.FILE_SEMANTICS_FS_TYPES else False
         self._bucket_semantics = True if fs_type in self.BUCKET_SEMANTICS_FS_TYPES else False
+        self._explicit_dir_semantics = True if self._bucket_semantics and fs_impl == "fsspec" else False
 
-        self._log.info(f"INIT [{self._key}]: Common file storage, fs = [{fs_type}], root = [{fs_root}]")
+        self._log.info(
+            f"INIT [{self._key}]: Common file storage, " +
+            f"fs = [{fs_type}], " +
+            f"impl = [{fs_impl}], " +
+            f"root = [{fs_root}]")
 
     def exists(self, storage_path: str) -> bool:
 
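
Note: the protocol sniffing above uses PyArrow's FSSpec bridge, where a wrapped fsspec filesystem is reachable via PyFileSystem.handler and FSSpecHandler.fs, and fsspec reports its scheme through the protocol attribute (a string or tuple). A standalone sketch of the same check using fsspec's in-memory filesystem (illustrative, not TRAC code):

```python
import fsspec
import pyarrow.fs as pa_fs

inner_fs = fsspec.filesystem("memory")                         # any fsspec filesystem
arrow_fs = pa_fs.PyFileSystem(pa_fs.FSSpecHandler(inner_fs))   # wrapped for use with PyArrow

handler = arrow_fs.handler
if isinstance(handler, pa_fs.FSSpecHandler):
    protocol = handler.fs.protocol
    fs_type = protocol[0] if isinstance(protocol, tuple) else protocol
    print(fs_type)                                             # "memory"
```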
@@ -280,6 +294,13 @@ class CommonFileStorage(IFileStorage):
 
         file_type = FileType.FILE if file_info.is_file else FileType.DIRECTORY
         file_size = file_info.size if file_info.is_file else 0
+
+        # Normalization in case the impl gives back directory entries with a trailing slash
+        if file_type == FileType.DIRECTORY and storage_path.endswith("/"):
+            storage_path = storage_path[:-1]
+            separator = storage_path.rfind("/")
+            file_name = storage_path[separator+1:]
+
         mtime = file_info.mtime.astimezone(dt.timezone.utc) if file_info.mtime is not None else None
 
         return FileStat(
@@ -307,8 +328,10 @@ class CommonFileStorage(IFileStorage):
 
         # Otherwise do a normal directory listing
         else:
-            selector = pa_fs.FileSelector(resolved_path, recursive=recursive) # noqa
+            # A trailing slash prevents some implementations including the directory in its own listing
+            selector = pa_fs.FileSelector(resolved_path + "/", recursive=recursive) # noqa
             file_infos = self._fs.get_file_info(selector)
+            file_infos = filter(lambda fi: not fi.path.endswith(self._TRAC_DIR_MARKER), file_infos)
             return list(map(self._info_to_stat, file_infos))
 
     def mkdir(self, storage_path: str, recursive: bool = False):
  def mkdir(self, storage_path: str, recursive: bool = False):
@@ -323,11 +346,32 @@ class CommonFileStorage(IFileStorage):
323
346
  # In cloud bucket semantics a file and dir can both exist with the same name - very confusing!
324
347
  # There is a race condition here because a file could be created by another process
325
348
  # But, given the very structured way TRAC uses file storage, this is extremely unlikely
349
+
326
350
  prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
327
351
  if prior_stat.type == pa_fs.FileType.File or prior_stat.type == pa_fs.FileType.Unknown:
328
352
  raise self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
329
353
 
330
- self._fs.create_dir(resolved_path, recursive=recursive)
354
+ # For most FS types, it is fine to use the Arrow create_dir() method
355
+ # For bucket-like storage, this will normally create an empty blob with a name like "my_dir/"
356
+
357
+ if not self._explicit_dir_semantics:
358
+ self._fs.create_dir(resolved_path, recursive=recursive)
359
+ return
360
+
361
+ # Some FS backends for bucket-like storage do not allow empty blobs as directories
362
+ # For these backends, we have to create an explicit marker file inside the directory
363
+ # In this case it is also necessary to check parents explicitly for non-recursive requests
364
+
365
+ if not recursive and prior_stat.type == pa_fs.FileType.NotFound:
366
+ parent_path = self._resolve_parent(resolved_path)
367
+ if parent_path is not None:
368
+ parent_stat: pa_fs.FileInfo = self._fs.get_file_info(parent_path)
369
+ if parent_stat.type != pa_fs.FileType.Directory:
370
+ raise FileNotFoundError
371
+
372
+ dir_marker = resolved_path + self._TRAC_DIR_MARKER
373
+ with self._fs.open_output_stream(dir_marker) as stream:
374
+ stream.write(b"")
331
375
 
332
376
  def rm(self, storage_path: str):
333
377
 
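
Note: on FSSpec-backed bucket storage a directory is represented by an empty marker object named <dir>/.trac_dir, and the ls() change above filters those markers back out of listings. A rough sketch of the idea against fsspec's in-memory filesystem (illustrative only; the real logic lives in CommonFileStorage):

```python
import fsspec

TRAC_DIR_MARKER = "/.trac_dir"   # marker name taken from this diff

fs = fsspec.filesystem("memory")

def mkdir_with_marker(path: str) -> None:
    # Object stores have no real directories, so write an empty marker object instead
    with fs.open(path + TRAC_DIR_MARKER, "wb") as stream:
        stream.write(b"")

def ls_without_markers(path: str) -> list:
    # Hide the marker objects when listing, as the ls() change above does
    return [entry for entry in fs.ls(path, detail=False) if not entry.endswith(TRAC_DIR_MARKER)]

mkdir_with_marker("/data/outputs")
with fs.open("/data/outputs/result.csv", "wb") as stream:
    stream.write(b"a,b\n1,2\n")

print(ls_without_markers("/data/outputs"))   # ['/data/outputs/result.csv']
```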
@@ -357,11 +401,11 @@ class CommonFileStorage(IFileStorage):
 
         self._fs.delete_dir(resolved_path)
 
-    def read_byte_stream(self, storage_path: str) -> tp.BinaryIO:
+    def read_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         return self._wrap_operation(self._read_byte_stream, "OPEN BYTE STREAM (READ)", storage_path)
 
-    def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.BinaryIO:
+    def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         resolved_path = self._resolve_path(operation_name, storage_path, False)
 
@@ -382,13 +426,13 @@ class CommonFileStorage(IFileStorage):
         stream = self._fs.open_input_file(resolved_path)
 
         # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
-        return _NativeFileResource(stream, lambda: self._close_byte_stream(storage_path, stream, False)) # noqa
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, False)) # noqa
 
-    def write_byte_stream(self, storage_path: str) -> tp.BinaryIO:
+    def write_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         return self._wrap_operation(self._write_byte_stream, "OPEN BYTE STREAM (WRITE)", storage_path)
 
-    def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.BinaryIO:
+    def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         resolved_path = self._resolve_path(operation_name, storage_path, False)
 
@@ -415,14 +459,10 @@ class CommonFileStorage(IFileStorage):
         stream = self._fs.open_output_stream(resolved_path)
 
         # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
-        return _NativeFileResource(stream, lambda: self._close_byte_stream(storage_path, stream, True, delete_on_error)) # noqa
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, True, delete_on_error)) # noqa
 
     def _close_byte_stream(self, storage_path: str, stream: tp.BinaryIO, is_write: bool, delete_on_error: bool = False):
 
-        # Do not try to close the stream twice
-        if stream.closed:
-            return
-
         # If there has been an error, log it
         exc_info = sys.exc_info()
         error = exc_info[1] if exc_info is not None else None
438
478
  # Close the stream - this may take time for write streams that are not flushed
439
479
  # Closing here gives better logs, because any pause is before the close message
440
480
  # As a fail-safe, _NativeFileResource always calls close() in a "finally" block
441
- stream.close()
481
+ if not stream.closed:
482
+ stream.close()
442
483
 
443
484
  # Log closing of the stream
444
485
  if is_write:
@@ -454,9 +495,33 @@ class CommonFileStorage(IFileStorage):
454
495
  file_info = self._fs.get_file_info(storage_path)
455
496
  if file_info.type != pa_fs.FileType.NotFound:
456
497
  self._fs.delete_file(storage_path)
457
- except OSError:
498
+ # different implementations can throw different errors here
499
+ except Exception: # noqa
458
500
  pass
459
501
 
502
+ # Stream implementations can raise various types of error during stream operations
503
+ # Errors can have different causes (access, communication, missing / duplicate files etc.)
504
+ # Also, other errors can occur inside the stream context manager, unrelated to IO
505
+
506
+ # In the case of an IO error we want to raise EStorage, other errors should propagate as they are
507
+ # This handler tries to spot IO errors from inside the PyArrow library, it is probably not fail-safe
508
+ # If an IO error is not spotted, the original error will propagate and get reported as EUnexpected
509
+ # Anyway this handler is only for errors that happen after the stream is opened
510
+
511
+ # The alternative is to override every method in _NativeFileResource and try to catch there
512
+ # However, different implementations raise different error types, so we still need some kind of inspection
513
+
514
+ if error is not None:
515
+
516
+ if isinstance(error, OSError):
517
+ raise _ex.EStorage from error
518
+
519
+ stack = tb.extract_tb(exc_info[2])
520
+ stack = filter(lambda frame: frame.filename is not None, stack)
521
+
522
+ if any(filter(lambda frame: frame.filename.startswith("pyarrow/"), stack)):
523
+ raise _ex.EStorage from error
524
+
460
525
  def _wrap_operation(self, func: tp.Callable, operation_name: str, storage_path: str, *args, **kwargs) -> tp.Any:
461
526
 
462
527
  operation = f"{operation_name} {self._key} [{storage_path}]"
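
Note: the traceback inspection at the end of _close_byte_stream() is a heuristic: an OSError, or any exception raised from PyArrow's own frames (Cython sources report filenames like "pyarrow/io.pxi"), is re-raised as a storage error, while anything else propagates unchanged. A standalone sketch of the same idea, substituting a generic exception class for TRAC's _ex.EStorage:

```python
import sys
import traceback as tb

class StorageError(RuntimeError):
    """Stand-in for TRAC's _ex.EStorage, used here for illustration only."""

def classify_stream_error() -> None:
    # Call from an except block: re-raise IO-looking errors as StorageError, let others propagate
    _exc_type, error, trace = sys.exc_info()
    if error is None:
        return

    if isinstance(error, OSError):
        raise StorageError from error

    frames = [frame for frame in tb.extract_tb(trace) if frame.filename is not None]
    if any(frame.filename.startswith("pyarrow/") for frame in frames):
        raise StorageError from error

    raise error  # not an IO error, propagate unchanged
```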