tracdap-runtime 0.6.0rc1__py3-none-any.whl → 0.6.0rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracdap/rt/_impl/data.py +53 -16
- tracdap/rt/_impl/storage.py +92 -27
- tracdap/rt/_plugins/storage_aws.py +158 -142
- tracdap/rt/_plugins/storage_azure.py +155 -0
- tracdap/rt/_plugins/storage_gcp.py +72 -15
- tracdap/rt/_plugins/storage_local.py +11 -6
- tracdap/rt/_version.py +1 -1
- tracdap/rt/config/__init__.py +12 -17
- tracdap/rt/config/common.py +10 -0
- tracdap/rt/config/common_pb2.py +38 -31
- tracdap/rt/config/job_pb2.py +21 -20
- tracdap/rt/config/platform.py +60 -25
- tracdap/rt/config/platform_pb2.py +52 -45
- tracdap/rt/config/result_pb2.py +15 -14
- tracdap/rt/config/runtime.py +0 -1
- tracdap/rt/config/runtime_pb2.py +24 -24
- tracdap/rt/ext/storage.py +2 -2
- tracdap/rt/metadata/__init__.py +20 -20
- tracdap/rt/metadata/common_pb2.py +15 -14
- tracdap/rt/metadata/custom_pb2.py +9 -8
- tracdap/rt/metadata/data_pb2.py +31 -30
- tracdap/rt/metadata/file_pb2.py +9 -8
- tracdap/rt/metadata/flow_pb2.py +33 -32
- tracdap/rt/metadata/job_pb2.py +55 -54
- tracdap/rt/metadata/model_pb2.py +31 -30
- tracdap/rt/metadata/object_id_pb2.py +13 -12
- tracdap/rt/metadata/object_pb2.py +9 -8
- tracdap/rt/metadata/search_pb2.py +19 -18
- tracdap/rt/metadata/stoarge_pb2.py +31 -30
- tracdap/rt/metadata/tag_pb2.py +13 -12
- tracdap/rt/metadata/tag_update_pb2.py +11 -10
- tracdap/rt/metadata/type_pb2.py +29 -28
- {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/METADATA +27 -15
- {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/RECORD +37 -38
- {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/WHEEL +1 -1
- tracdap/rt/config/gateway.py +0 -104
- tracdap/rt/config/gateway_pb2.py +0 -45
- {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/LICENSE +0 -0
- {tracdap_runtime-0.6.0rc1.dist-info → tracdap_runtime-0.6.0rc3.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/data.py
CHANGED

@@ -74,11 +74,7 @@ class DataView:
 
 
 class _DataInternal:
-
-    @staticmethod
-    def float_dtype_check():
-        if "Float64Dtype" not in pd.__dict__:
-            raise _ex.EStartup("TRAC D.A.P. requires Pandas >= 1.2")
+    pass
 
 
 class DataMapping:
@@ -111,8 +107,40 @@ class DataMapping:
     }
 
     # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    …
-    …
+    __PANDAS_VERSION_ELEMENTS = pd.__version__.split(".")
+    __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
+    __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
+
+    if __PANDAS_MAJOR_VERSION == 2:
+
+        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+
+        @classmethod
+        def __pandas_datetime_type(cls, tz, unit):
+            if tz is None and unit is None:
+                return cls.__PANDAS_DATETIME_TYPE
+            _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
+            if tz is None:
+                return pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+            else:
+                return pd.DatetimeTZDtype(tz=tz, unit=_unit)
+
+    # Minimum supported version for Pandas is 1.2, when pd.Float64Dtype was introduced
+    elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
+
+        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).dtype
+        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+
+        @classmethod
+        def __pandas_datetime_type(cls, tz, unit): # noqa
+            if tz is None:
+                return cls.__PANDAS_DATETIME_TYPE
+            else:
+                return pd.DatetimeTZDtype(tz=tz)
+
+    else:
+        raise _ex.EStartup(f"Pandas version not supported: [{pd.__version__}]")
 
     # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
     __ARROW_TO_PANDAS_TYPE_MAPPING = {
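As a side note on the hunk above, here is a minimal standalone sketch (not TRAC code, the "ms" unit is chosen arbitrarily) of why the dtype setup has to branch on the Pandas version: Pandas 2.x supports non-nanosecond datetime64 units via as_unit(), while Pandas 1.2+ only offers nanosecond precision.

```python
import datetime as dt
import pandas as pd

# Take the major / minor version, the same way the diff above does
major, minor = (int(v) for v in pd.__version__.split(".")[:2])

if major == 2:
    # Pandas 2.x: datetime64 can carry s / ms / us / ns units
    datetime_dtype = pd.to_datetime([dt.datetime(2000, 1, 1)]).as_unit("ms").dtype
elif major == 1 and minor >= 2:
    # Pandas 1.2+: always nanosecond precision
    datetime_dtype = pd.to_datetime([dt.datetime(2000, 1, 1)]).dtype
else:
    raise RuntimeError(f"Pandas version not supported: [{pd.__version__}]")

print(datetime_dtype)  # datetime64[ms] on Pandas 2.x, datetime64[ns] on 1.x
```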
@@ -224,8 +252,12 @@ class DataMapping:
             cls.__TRAC_DECIMAL_SCALE)
 
     @classmethod
-    def …
-        return cls.…
+    def pandas_date_type(cls):
+        return cls.__PANDAS_DATE_TYPE
+
+    @classmethod
+    def pandas_datetime_type(cls, tz=None, unit=None):
+        return cls.__pandas_datetime_type(tz, unit)
 
     @classmethod
     def view_to_pandas(
@@ -297,6 +329,7 @@ class DataMapping:
         else:
             DataConformance.check_duplicate_fields(table.schema.names, False)
 
+        # Use Arrow's built-in function to convert to Pandas
         return table.to_pandas(
 
             # Mapping for arrow -> pandas types for core types
@@ -463,7 +496,7 @@ class DataConformance:
 
         table_column: pa.Array = table.column(table_index)
 
-        pandas_type = pandas_types[table_index] \
+        pandas_type = pandas_types.iloc[table_index] \
             if pandas_types is not None \
             else None
 
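The change from pandas_types[table_index] to pandas_types.iloc[table_index] switches from label-based to strictly positional lookup. A small illustration (not TRAC code), assuming pandas_types is a Series of dtypes keyed by column name, such as DataFrame.dtypes:

```python
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2.0]})
dtypes = df.dtypes        # a Series indexed by column name, not by position

print(dtypes.iloc[1])     # positional lookup -> float64
# dtypes[1] would be a label lookup; the integer fallback is deprecated in
# Pandas 2.x and fails once the fallback is removed
```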
@@ -691,16 +724,20 @@ class DataConformance:
     @classmethod
     def _coerce_date(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
 
-        # …
+        # The bit-width restriction could be removed here
+        # For date types there is never loss of precision and pa.cast will raise an error on overflow
+        # Impact to client code is unlikely, still this change should happen with a TRAC minor version update
         if pa.types.is_date(vector.type):
             if field.type.bit_width >= vector.type.bit_width:
                 return pc.cast(vector, field.type)
 
-        # Special handling for Pandas/NumPy
-        # …
-        # …
-        …
-        …
+        # Special handling for date values coming from Pandas/NumPy
+        # Only allow these conversions if the vector is supplied with Pandas type info
+        # For Pandas 1.x, dates are always encoded as np.datetime64[ns]
+        # For Pandas 2.x dates are still np.datetime64 but can be in s, ms, us or ns
+        # This conversion will not apply to dates held in Pandas using the Python date object types
+        if pandas_type is not None:
+            if pa.types.is_timestamp(vector.type) and pd.api.types.is_datetime64_any_dtype(pandas_type):
                 return pc.cast(vector, field.type)
 
         error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
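The new guard in _coerce_date keys off Pandas dtype inspection rather than the Arrow vector alone. A quick sketch (not TRAC code) of how the check used above behaves:

```python
import numpy as np
import pandas as pd

# True for any datetime64 dtype, whatever the unit (s, ms, us, ns), tz-aware or not
print(pd.api.types.is_datetime64_any_dtype(np.dtype("datetime64[ms]")))    # True
print(pd.api.types.is_datetime64_any_dtype(pd.DatetimeTZDtype(tz="UTC")))  # True

# False for non-datetime dtypes, so date columns held as Python date objects are not coerced
print(pd.api.types.is_datetime64_any_dtype(np.dtype("object")))            # False
```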
tracdap/rt/_impl/storage.py
CHANGED

@@ -18,6 +18,7 @@ import pathlib
 import re
 import sys
 import typing as tp
+import traceback as tb
 
 import pyarrow as pa
 import pyarrow.fs as pa_fs
@@ -178,20 +179,17 @@ class StorageManager:
 # ----------------------------------------------------------------------------------------------------------------------
 
 
-class …
+class _NativeFileContext(tp.ContextManager[tp.BinaryIO]):
 
     def __init__(self, nf: pa_lib.NativeFile, close_func: tp.Callable):
         super().__init__()
         self.__nf = nf
         self.__close_func = close_func
 
-    def …
-        …
-            return object.__getattribute__(self, item)
-        else:
-            return object.__getattribute__(self.__nf, item)
+    def __enter__(self):
+        return self.__nf
 
-    def …
+    def __exit__(self, exc_type, exc_val, exc_tb):
         try:
             self.__close_func()
         finally:
@@ -200,24 +198,40 @@ class _NativeFileResource(pa_lib.NativeFile):
 
 class CommonFileStorage(IFileStorage):
 
+    _TRAC_DIR_MARKER = "/.trac_dir"
+
     FILE_SEMANTICS_FS_TYPES = ["local"]
-    BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs"]
+    BUCKET_SEMANTICS_FS_TYPES = ["s3", "gcs", "abfs"]
 
-    def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, …
+    def __init__(self, storage_key: str, storage_config: _cfg.PluginConfig, fs: pa_fs.SubTreeFileSystem):
 
         self._log = _util.logger_for_object(self)
         self._key = storage_key
         self._config = storage_config
-        self._fs = …
+        self._fs = fs
+
+        fs_type = fs.base_fs.type_name
+        fs_impl = "arrow"
+        fs_root = fs.base_path
 
-        …
-        …
+        # If this is an FSSpec implementation, take the protocol from FSSpec as the FS type
+        base_fs = fs.base_fs
+        if isinstance(base_fs, pa_fs.PyFileSystem):
+            handler = base_fs.handler
+            if isinstance(handler, pa_fs.FSSpecHandler):
+                fs_type = handler.fs.protocol[0] if isinstance(handler.fs.protocol, tuple) else handler.fs.protocol
+                fs_impl = "fsspec"
 
         # Some optimization is possible if the underlying storage semantics are known
         self._file_semantics = True if fs_type in self.FILE_SEMANTICS_FS_TYPES else False
         self._bucket_semantics = True if fs_type in self.BUCKET_SEMANTICS_FS_TYPES else False
+        self._explicit_dir_semantics = True if self._bucket_semantics and fs_impl == "fsspec" else False
 
-        self._log.info( …
+        self._log.info(
+            f"INIT [{self._key}]: Common file storage, " +
+            f"fs = [{fs_type}], " +
+            f"impl = [{fs_impl}], " +
+            f"root = [{fs_root}]")
 
     def exists(self, storage_path: str) -> bool:
 
@@ -280,6 +294,13 @@ class CommonFileStorage(IFileStorage):
 
         file_type = FileType.FILE if file_info.is_file else FileType.DIRECTORY
         file_size = file_info.size if file_info.is_file else 0
+
+        # Normalization in case the impl gives back directory entries with a trailing slash
+        if file_type == FileType.DIRECTORY and storage_path.endswith("/"):
+            storage_path = storage_path[:-1]
+            separator = storage_path.rfind("/")
+            file_name = storage_path[separator+1:]
+
         mtime = file_info.mtime.astimezone(dt.timezone.utc) if file_info.mtime is not None else None
 
         return FileStat(
@@ -307,8 +328,10 @@ class CommonFileStorage(IFileStorage):
 
         # Otherwise do a normal directory listing
         else:
-            …
+            # A trailing slash prevents some implementations including the directory in its own listing
+            selector = pa_fs.FileSelector(resolved_path + "/", recursive=recursive) # noqa
             file_infos = self._fs.get_file_info(selector)
+            file_infos = filter(lambda fi: not fi.path.endswith(self._TRAC_DIR_MARKER), file_infos)
             return list(map(self._info_to_stat, file_infos))
 
     def mkdir(self, storage_path: str, recursive: bool = False):
@@ -323,11 +346,32 @@ class CommonFileStorage(IFileStorage):
         # In cloud bucket semantics a file and dir can both exist with the same name - very confusing!
         # There is a race condition here because a file could be created by another process
        # But, given the very structured way TRAC uses file storage, this is extremely unlikely
+
         prior_stat: pa_fs.FileInfo = self._fs.get_file_info(resolved_path)
         if prior_stat.type == pa_fs.FileType.File or prior_stat.type == pa_fs.FileType.Unknown:
             raise self._explicit_error(self.ExplicitError.OBJECT_ALREADY_EXISTS, operation_name, storage_path)
 
-        …
+        # For most FS types, it is fine to use the Arrow create_dir() method
+        # For bucket-like storage, this will normally create an empty blob with a name like "my_dir/"
+
+        if not self._explicit_dir_semantics:
+            self._fs.create_dir(resolved_path, recursive=recursive)
+            return
+
+        # Some FS backends for bucket-like storage do not allow empty blobs as directories
+        # For these backends, we have to create an explicit marker file inside the directory
+        # In this case it is also necessary to check parents explicitly for non-recursive requests
+
+        if not recursive and prior_stat.type == pa_fs.FileType.NotFound:
+            parent_path = self._resolve_parent(resolved_path)
+            if parent_path is not None:
+                parent_stat: pa_fs.FileInfo = self._fs.get_file_info(parent_path)
+                if parent_stat.type != pa_fs.FileType.Directory:
+                    raise FileNotFoundError
+
+        dir_marker = resolved_path + self._TRAC_DIR_MARKER
+        with self._fs.open_output_stream(dir_marker) as stream:
+            stream.write(b"")
 
     def rm(self, storage_path: str):
 
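For orientation, a hedged wiring sketch (assumed names and an assumed s3fs dependency, not TRAC code) of the filesystem shape the new constructor expects: an FSSpec filesystem wrapped in PyArrow's FSSpecHandler and mounted as a SubTreeFileSystem, which is what the fs_type / fs_impl detection above resolves to "s3" / "fsspec" and what switches on the explicit .trac_dir marker behaviour in mkdir():

```python
import pyarrow.fs as pa_fs
import s3fs  # assumed dependency for this sketch

# Wrap an fsspec S3 filesystem in PyArrow's generic FSSpec handler
fsspec_fs = s3fs.S3FileSystem()
arrow_fs = pa_fs.PyFileSystem(pa_fs.FSSpecHandler(fsspec_fs))

# Mount the storage root as a sub-tree, the type CommonFileStorage.__init__ now takes
sub_fs = pa_fs.SubTreeFileSystem("my-bucket/data-root", arrow_fs)

# fsspec reports protocol "s3" (possibly as a tuple), so fs_type = "s3", fs_impl = "fsspec"
# storage = CommonFileStorage("STORAGE1", plugin_config, sub_fs)  # hypothetical wiring
```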
@@ -357,11 +401,11 @@ class CommonFileStorage(IFileStorage):
 
         self._fs.delete_dir(resolved_path)
 
-    def read_byte_stream(self, storage_path: str) -> tp.BinaryIO:
+    def read_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         return self._wrap_operation(self._read_byte_stream, "OPEN BYTE STREAM (READ)", storage_path)
 
-    def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.BinaryIO:
+    def _read_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         resolved_path = self._resolve_path(operation_name, storage_path, False)
 
@@ -382,13 +426,13 @@ class CommonFileStorage(IFileStorage):
         stream = self._fs.open_input_file(resolved_path)
 
         # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
-        return …
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, False)) # noqa
 
-    def write_byte_stream(self, storage_path: str) -> tp.BinaryIO:
+    def write_byte_stream(self, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         return self._wrap_operation(self._write_byte_stream, "OPEN BYTE STREAM (WRITE)", storage_path)
 
-    def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.BinaryIO:
+    def _write_byte_stream(self, operation_name: str, storage_path: str) -> tp.ContextManager[tp.BinaryIO]:
 
         resolved_path = self._resolve_path(operation_name, storage_path, False)
 
@@ -415,14 +459,10 @@ class CommonFileStorage(IFileStorage):
         stream = self._fs.open_output_stream(resolved_path)
 
         # Return impl of PyArrow NativeFile instead of BinaryIO - this is the same thing PyArrow does
-        return …
+        return _NativeFileContext(stream, lambda: self._close_byte_stream(storage_path, stream, True, delete_on_error)) # noqa
 
     def _close_byte_stream(self, storage_path: str, stream: tp.BinaryIO, is_write: bool, delete_on_error: bool = False):
 
-        # Do not try to close the stream twice
-        if stream.closed:
-            return
-
         # If there has been an error, log it
         exc_info = sys.exc_info()
         error = exc_info[1] if exc_info is not None else None
@@ -438,7 +478,8 @@ class CommonFileStorage(IFileStorage):
         # Close the stream - this may take time for write streams that are not flushed
         # Closing here gives better logs, because any pause is before the close message
         # As a fail-safe, _NativeFileResource always calls close() in a "finally" block
-        stream.…
+        if not stream.closed:
+            stream.close()
 
         # Log closing of the stream
         if is_write:
@@ -454,9 +495,33 @@ class CommonFileStorage(IFileStorage):
                 file_info = self._fs.get_file_info(storage_path)
                 if file_info.type != pa_fs.FileType.NotFound:
                     self._fs.delete_file(storage_path)
-            …
+            # different implementations can throw different errors here
+            except Exception: # noqa
                 pass
 
+        # Stream implementations can raise various types of error during stream operations
+        # Errors can have different causes (access, communication, missing / duplicate files etc.)
+        # Also, other errors can occur inside the stream context manager, unrelated to IO
+
+        # In the case of an IO error we want to raise EStorage, other errors should propagate as they are
+        # This handler tries to spot IO errors from inside the PyArrow library, it is probably not fail-safe
+        # If an IO error is not spotted, the original error will propagate and get reported as EUnexpected
+        # Anyway this handler is only for errors that happen after the stream is opened
+
+        # The alternative is to override every method in _NativeFileResource and try to catch there
+        # However, different implementations raise different error types, so we still need some kind of inspection
+
+        if error is not None:
+
+            if isinstance(error, OSError):
+                raise _ex.EStorage from error
+
+            stack = tb.extract_tb(exc_info[2])
+            stack = filter(lambda frame: frame.filename is not None, stack)
+
+            if any(filter(lambda frame: frame.filename.startswith("pyarrow/"), stack)):
+                raise _ex.EStorage from error
+
     def _wrap_operation(self, func: tp.Callable, operation_name: str, storage_path: str, *args, **kwargs) -> tp.Any:
 
         operation = f"{operation_name} {self._key} [{storage_path}]"
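Since read_byte_stream and write_byte_stream now return tp.ContextManager[tp.BinaryIO] rather than a NativeFile proxy, callers consume them in with blocks; a hedged usage sketch (the storage instance and paths are assumptions, not TRAC code):

```python
# Hypothetical caller, assuming "storage" is a CommonFileStorage instance

with storage.write_byte_stream("outputs/report.bin") as out_stream:
    out_stream.write(b"some binary content")
# __exit__ runs _close_byte_stream: closes the stream, logs the result,
# and maps IO errors raised inside the block to EStorage

with storage.read_byte_stream("outputs/report.bin") as in_stream:
    content = in_stream.read()
```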