tracdap-runtime 0.5.30__py3-none-any.whl → 0.6.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. tracdap/rt/_exec/dev_mode.py +2 -1
  2. tracdap/rt/_impl/data.py +1 -28
  3. tracdap/rt/_impl/static_api.py +5 -1
  4. tracdap/rt/_impl/storage.py +586 -10
  5. tracdap/rt/_impl/util.py +24 -3
  6. tracdap/rt/_plugins/_helpers.py +26 -25
  7. tracdap/rt/_plugins/storage_aws.py +162 -76
  8. tracdap/rt/_plugins/storage_azure.py +155 -0
  9. tracdap/rt/_plugins/storage_gcp.py +183 -0
  10. tracdap/rt/_plugins/storage_local.py +249 -98
  11. tracdap/rt/_version.py +1 -1
  12. tracdap/rt/api/static_api.py +2 -1
  13. tracdap/rt/config/__init__.py +8 -13
  14. tracdap/rt/config/common.py +10 -0
  15. tracdap/rt/config/common_pb2.py +38 -31
  16. tracdap/rt/config/job_pb2.py +21 -20
  17. tracdap/rt/config/platform.py +60 -25
  18. tracdap/rt/config/platform_pb2.py +52 -45
  19. tracdap/rt/config/result_pb2.py +15 -14
  20. tracdap/rt/config/runtime.py +0 -1
  21. tracdap/rt/config/runtime_pb2.py +24 -24
  22. tracdap/rt/exceptions.py +9 -0
  23. tracdap/rt/ext/plugins.py +0 -12
  24. tracdap/rt/ext/storage.py +47 -29
  25. tracdap/rt/metadata/common_pb2.py +15 -14
  26. tracdap/rt/metadata/custom_pb2.py +9 -8
  27. tracdap/rt/metadata/data_pb2.py +31 -30
  28. tracdap/rt/metadata/file_pb2.py +9 -8
  29. tracdap/rt/metadata/flow_pb2.py +33 -32
  30. tracdap/rt/metadata/job_pb2.py +55 -54
  31. tracdap/rt/metadata/model_pb2.py +31 -30
  32. tracdap/rt/metadata/object_id_pb2.py +13 -12
  33. tracdap/rt/metadata/object_pb2.py +9 -8
  34. tracdap/rt/metadata/search_pb2.py +19 -18
  35. tracdap/rt/metadata/stoarge_pb2.py +31 -30
  36. tracdap/rt/metadata/tag_pb2.py +13 -12
  37. tracdap/rt/metadata/tag_update_pb2.py +11 -10
  38. tracdap/rt/metadata/type_pb2.py +29 -28
  39. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dev1.dist-info}/METADATA +26 -15
  40. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dev1.dist-info}/RECORD +43 -43
  41. tracdap/rt/config/gateway.py +0 -104
  42. tracdap/rt/config/gateway_pb2.py +0 -45
  43. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dev1.dist-info}/LICENSE +0 -0
  44. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dev1.dist-info}/WHEEL +0 -0
  45. {tracdap_runtime-0.5.30.dist-info → tracdap_runtime-0.6.0.dev1.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/util.py CHANGED
@@ -145,6 +145,28 @@ def logger_for_namespace(namespace: str) -> logging.Logger:
     return logging.getLogger(namespace)
 
 
+def format_file_size(size: int) -> str:
+
+    if size < 1024:
+        if size == 0:
+            return "0 bytes"
+        elif size == 1:
+            return "1 byte"
+        else:
+            return f"{size} bytes"
+
+    if size < 1024 ** 2:
+        kb = size / 1024
+        return f"{kb:.1f} KB"
+
+    if size < 1024 ** 3:
+        mb = size / (1024 ** 2)
+        return f"{mb:.1f} MB"
+
+    gb = size / (1024 ** 3)
+    return f"{gb:.1f} GB"
+
+
 def new_object_id(object_type: meta.ObjectType) -> meta.TagHeader:
 
     timestamp = dt.datetime.utcnow()
@@ -296,7 +318,7 @@ def error_details_from_exception(error: Exception):
 
 def filter_model_stack_trace(full_stack: tb.StackSummary, checkout_directory: pathlib.Path):
 
-    frame_names = list(map(lambda frame: frame.name, full_stack))
+    frame_names = list(map(lambda frame_: frame_.name, full_stack))
 
     if __FIRST_MODEL_FRAME_NAME in frame_names:
         first_model_frame = frame_names.index(__FIRST_MODEL_FRAME_NAME)
@@ -309,7 +331,7 @@ def filter_model_stack_trace(full_stack: tb.StackSummary, checkout_directory: pa
 
     for frame_index, frame in enumerate(full_stack[first_model_frame:]):
         module_path = pathlib.Path(frame.filename)
-        if ("tracdap" in module_path.parts):
+        if "tracdap" in module_path.parts:
            tracdap_index = len(module_path.parts) - 1 - list(reversed(module_path.parts)).index("tracdap")
            if tracdap_index < len(module_path.parts)-1:
                if module_path.parts[tracdap_index+1] == "rt":
@@ -322,4 +344,3 @@ def filter_model_stack_trace(full_stack: tb.StackSummary, checkout_directory: pa
        last_model_frame = first_model_frame + frame_index
 
     return full_stack[first_model_frame:last_model_frame+1]
-
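For reference, a quick sketch of how the new format_file_size helper behaves, based on the thresholds in the hunk above (the sample values are illustrative):

    from tracdap.rt._impl.util import format_file_size

    # Sizes below 1024 are reported in bytes, then KB / MB / GB at each 1024 boundary
    format_file_size(0)              # "0 bytes"
    format_file_size(1)              # "1 byte"
    format_file_size(4096)           # "4.0 KB"
    format_file_size(3 * 1024 ** 2)  # "3.0 MB"
    format_file_size(5 * 1024 ** 3)  # "5.0 GB"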
tracdap/rt/_plugins/_helpers.py CHANGED
@@ -22,6 +22,8 @@ import platform
 import urllib.parse
 import typing as tp
 
+import tracdap.rt.exceptions as _ex
+
 
 def get_plugin_property(properties: tp.Dict[str, str], property_name: str):
 
@@ -40,6 +42,30 @@ def get_plugin_property(properties: tp.Dict[str, str], property_name: str):
     return None
 
 
+def get_plugin_property_boolean(properties: tp.Dict[str, str], property_name: str, property_default: bool = False):
+
+    property_value = get_plugin_property(properties, property_name)
+
+    if property_value is None:
+        return property_default
+
+    if isinstance(property_value, bool):
+        return property_value
+
+    if isinstance(property_value, str):
+
+        if len(property_value.strip()) == 0:
+            return property_default
+
+        if property_value.strip().lower() == "true":
+            return True
+
+        if property_value.strip().lower() == "false":
+            return False
+
+    raise _ex.EConfigParse(f"Invalid value for [{property_name}]: Expected a boolean value, got [{property_value}]")
+
+
 # Handling for credentials supplied via HTTP(S) URLs
 
 __HTTP_TOKEN_KEY = "token"
@@ -98,31 +124,6 @@ def apply_http_credentials(url: urllib.parse.ParseResult, credentials: str) -> u
 
 # Logging helpers
 
-_T = tp.TypeVar("_T")
-
-
-class _LogClose(tp.Generic[_T]):
-
-    def __init__(self, ctx_mgr: _T, log, msg):
-        self.__ctx_mgr = ctx_mgr
-        self.__log = log
-        self.__msg = msg
-
-    def __getitem__(self, item):
-        return self.__ctx_mgr.__getitem__(item)
-
-    def __enter__(self):
-        return self.__ctx_mgr.__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.__ctx_mgr.__exit__(exc_type, exc_val, exc_tb)
-        self.__log.info(self.__msg)
-
-
-def log_close(ctx_mgg: _T, log: logging.Logger, msg: str) -> _T:
-
-    return _LogClose(ctx_mgg, log, msg)
-
 
 def log_safe(param: tp.Any):
 
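As a usage sketch, the new get_plugin_property_boolean helper accepts string or boolean values, falls back to a default for missing or blank properties, and raises EConfigParse for anything else (the property names below are made up for illustration):

    from tracdap.rt._plugins import _helpers

    properties = {"useSsl": "true", "verbose": "  ", "retries": "3"}

    _helpers.get_plugin_property_boolean(properties, "useSsl")          # True
    _helpers.get_plugin_property_boolean(properties, "missing", True)   # missing -> default (True)
    _helpers.get_plugin_property_boolean(properties, "verbose")         # blank -> default (False)
    _helpers.get_plugin_property_boolean(properties, "retries")         # raises EConfigParse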
tracdap/rt/_plugins/storage_aws.py CHANGED
@@ -23,31 +23,26 @@ import tracdap.rt.exceptions as ex
 import tracdap.rt.ext.plugins as plugins
 from tracdap.rt.ext.storage import *
 
+from pyarrow import fs as afs
+
 # Set of common helpers across the core plugins (do not reference rt._impl)
 from . import _helpers
 
-# TODO: Remove dependencies on internal implementation details
-import tracdap.rt._impl.storage as _storage
-
-# AWS SDK
-import boto3
-import botocore.response  # noqa
-import botocore.exceptions as aws_ex  # noqa
-
-
-class S3ObjectStorage(IFileStorage):
-
-    # This is a quick implementation of IFileStorage on S3 using the boto3 AWS SDK
 
-    # TODO: Migrate IFileStorage interface to use object storage as the primary concept
-    # It is much easier to express objects on a file system than vice versa
-    # This change must also be made in the Java code
+try:
+    # AWS SDK
+    import boto3
+    import botocore.response
+    import botocore.exceptions as aws_ex
+    __aws_available = True
+except ImportError:
+    boto3 = None
+    botocore = None
+    aws_ex = None
+    __aws_available = False
 
-    # TODO: Switch to using Apache Arrow file system interface
-    # Arrow already has implementations for AWS, GCP, HDFS and local files
-    # The arrow interface also allows extension with fsspec, to support Azure blob storage or custom implementations
 
-    # https://arrow.apache.org/docs/python/filesystems.html
+class AwsStorageProvider(IStorageProvider):
 
     BUCKET_PROPERTY = "bucket"
     PREFIX_PROPERTY = "prefix"
@@ -61,38 +56,106 @@ class S3ObjectStorage(IFileStorage):
     ACCESS_KEY_ID_PROPERTY = "accessKeyId"
     SECRET_ACCESS_KEY_PROPERTY = "secretAccessKey"
 
-    def __init__(self, config: cfg.PluginConfig, options: dict = None):
+    RUNTIME_FS_PROPERTY = "runtimeFs"
+    RUNTIME_FS_AUTO = "auto"
+    RUNTIME_FS_ARROW = "arrow"
+    RUNTIME_FS_BOTO3 = "boto3"
+    RUNTIME_FS_DEFAULT = RUNTIME_FS_AUTO
+
+    ARROW_CLIENT_ARGS = {
+        REGION_PROPERTY: "region",
+        ENDPOINT_PROPERTY: "endpoint_override",
+        ACCESS_KEY_ID_PROPERTY: "access_key",
+        SECRET_ACCESS_KEY_PROPERTY: "secret_key"
+    }
+
+    BOTO_CLIENT_ARGS = {
+        REGION_PROPERTY: "region_name",
+        ENDPOINT_PROPERTY: "endpoint_url",
+        ACCESS_KEY_ID_PROPERTY: "aws_access_key_id",
+        SECRET_ACCESS_KEY_PROPERTY: "aws_secret_access_key"
+    }
+
+    def __init__(self, properties: tp.Dict[str, str]):
 
         self._log = _helpers.logger_for_object(self)
+        self._properties = properties
 
-        self._properties = config.properties
+        self._runtime_fs = _helpers.get_plugin_property(
+            properties, self.RUNTIME_FS_PROPERTY) \
+            or self.RUNTIME_FS_DEFAULT
+
+    def has_arrow_native(self) -> bool:
+        if self._runtime_fs == self.RUNTIME_FS_ARROW:
+            return True
+        elif self._runtime_fs == self.RUNTIME_FS_AUTO:
+            return afs.S3FileSystem is not None
+        else:
+            return False
+
+    def has_file_storage(self) -> bool:
+        if self._runtime_fs == self.RUNTIME_FS_BOTO3:
+            return True
+        elif self._runtime_fs == self.RUNTIME_FS_AUTO:
+            return afs.S3FileSystem is None
+        else:
+            return False
 
-        self._bucket = _helpers.get_plugin_property(self._properties, self.BUCKET_PROPERTY)
-        self._prefix = _helpers.get_plugin_property(self._properties, self.PREFIX_PROPERTY) or ""
-        self._region = _helpers.get_plugin_property(self._properties, self.REGION_PROPERTY)
-        self._endpoint = _helpers.get_plugin_property(self._properties, self.ENDPOINT_PROPERTY)
+    def get_arrow_native(self) -> afs.SubTreeFileSystem:
 
-        credentials_params = self.setup_credentials()
+        s3fs_args = self.setup_client_args(self.ARROW_CLIENT_ARGS)
+        s3fs = afs.S3FileSystem(**s3fs_args)
 
-        client_args = {
-            "service_name": "s3",
-            **credentials_params}
+        bucket = _helpers.get_plugin_property(self._properties, self.BUCKET_PROPERTY)
+        prefix = _helpers.get_plugin_property(self._properties, self.PREFIX_PROPERTY)
 
-        if self._region is not None:
-            client_args["region_name"] = self._region
+        if bucket is None or len(bucket.strip()) == 0:
+            message = f"Missing required config property [{self.BUCKET_PROPERTY}] for S3 storage"
+            self._log.error(message)
+            raise ex.EConfigParse(message)
 
-        if self._endpoint is not None:
-            client_args["endpoint_url"] = self._endpoint
+        root_path = f"{bucket}/{prefix}" if prefix else bucket
 
-        self._client = boto3.client(**client_args)
+        return afs.SubTreeFileSystem(root_path, s3fs)
+
+    def get_file_storage(self) -> IFileStorage:
+
+        client_args = self.setup_client_args(self.BOTO_CLIENT_ARGS)
+        client_args["service_name"] = "s3"
+
+        config = cfg.PluginConfig()
+        config.protocol = "S3"
+        config.properties = self._properties
+
+        return S3ObjectStorage(config, client_args)
+
+    def setup_client_args(self, key_mapping: tp.Dict[str, str]) -> tp.Dict[str, tp.Any]:
 
-    def setup_credentials(self):
+        client_args = dict()
 
-        mechanism = _helpers.get_plugin_property(self._properties, self.CREDENTIALS_PROPERTY) or self.CREDENTIALS_DEFAULT
+        region = _helpers.get_plugin_property(self._properties, self.REGION_PROPERTY)
+        endpoint = _helpers.get_plugin_property(self._properties, self.ENDPOINT_PROPERTY)
 
-        if mechanism.lower() == self.CREDENTIALS_DEFAULT:
+        if region is not None:
+            region_key = key_mapping[self.REGION_PROPERTY]
+            client_args[region_key] = region
+
+        if endpoint is not None:
+            endpoint_key = key_mapping[self.ENDPOINT_PROPERTY]
+            client_args[endpoint_key] = endpoint
+
+        credentials = self.setup_credentials(key_mapping)
+        client_args.update(credentials)
+
+        return client_args
+
+    def setup_credentials(self, key_mapping: tp.Dict[str, str]):
+
+        mechanism = _helpers.get_plugin_property(self._properties, self.CREDENTIALS_PROPERTY)
+
+        if mechanism is None or len(mechanism) == 0 or mechanism.lower() == self.CREDENTIALS_DEFAULT:
             self._log.info(f"Using [{self.CREDENTIALS_DEFAULT}] credentials mechanism")
-            return {}
+            return dict()
 
         if mechanism.lower() == self.CREDENTIALS_STATIC:
 
@@ -103,12 +166,49 @@ class S3ObjectStorage(IFileStorage):
                 f"Using [{self.CREDENTIALS_STATIC}] credentials mechanism, " +
                 f"access key id = [{access_key_id}]")
 
-            return {"aws_access_key_id": access_key_id, "aws_secret_access_key": secret_access_key}
+            access_key_id_arg = key_mapping[self.ACCESS_KEY_ID_PROPERTY]
+            secret_access_key_arg = key_mapping[self.SECRET_ACCESS_KEY_PROPERTY]
+
+            return {
+                access_key_id_arg: access_key_id,
+                secret_access_key_arg: secret_access_key}
 
         message = f"Unrecognised credentials mechanism: [{mechanism}]"
         self._log.error(message)
         raise ex.EStartup(message)
 
+
+if __aws_available:
+    plugins.PluginManager.register_plugin(IStorageProvider, AwsStorageProvider, ["S3"])
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+# CUSTOM IMPLEMENTATION FOR S3 STORAGE
+# ----------------------------------------------------------------------------------------------------------------------
+
+# This is the old implementation that was used before Arrow native was made available
+# It is likely to be removed in a future release
+
+
+class S3ObjectStorage(IFileStorage):
+
+    # This is a quick implementation of IFileStorage on S3 using the boto3 AWS SDK
+
+    def __init__(self, config: cfg.PluginConfig, client_args: dict):
+
+        self._log = _helpers.logger_for_object(self)
+
+        self._properties = config.properties
+        self._bucket = _helpers.get_plugin_property(self._properties, AwsStorageProvider.BUCKET_PROPERTY)
+        self._prefix = _helpers.get_plugin_property(self._properties, AwsStorageProvider.PREFIX_PROPERTY) or ""
+
+        if self._bucket is None or len(self._bucket.strip()) == 0:
+            message = f"Missing required config property [{AwsStorageProvider.BUCKET_PROPERTY}] for S3 storage"
+            self._log.error(message)
+            raise ex.EConfigParse(message)
+
+        self._client = boto3.client(**client_args)
+
     def exists(self, storage_path: str) -> bool:
 
         try:
@@ -140,19 +240,21 @@ class S3ObjectStorage(IFileStorage):
 
         self._log.info(f"STAT [{storage_path}]")
 
+        name = storage_path.split("/")[-1]
+
         if self.exists(storage_path):
 
             # Only OBJECTS can support stat atm
             # Handling for directories needs to be changed, as part of refactor onto object storage
             size = self.size(storage_path)
-            return FileStat(FileType.FILE, size)
+            return FileStat(name, FileType.FILE, storage_path, size)
 
         else:
 
             self.ls(storage_path)
-            return FileStat(FileType.DIRECTORY, 0)
+            return FileStat(name, FileType.DIRECTORY, storage_path, 0)
 
-    def ls(self, storage_path: str) -> tp.List[str]:
+    def ls(self, storage_path: str, recursive: bool = False) -> tp.List[FileStat]:
 
         self._log.info(f"LS [{storage_path}]")
 
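The old string-based stat and ls results are replaced with FileStat records; a rough sketch of the constructor shape implied by the calls in this diff (only the positional order and the mtime keyword are taken from the diff, any other details are assumptions):

    # Implied shape: FileStat(name, file_type, storage_path, size, mtime=...)
    file_entry = FileStat("part-0.csv", FileType.FILE, "data/part-0.csv", 1024)
    dir_entry = FileStat("data", FileType.DIRECTORY, "data", 0)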
@@ -174,36 +276,41 @@ class S3ObjectStorage(IFileStorage):
                     if raw_key == prefix:
                         continue
                     key = raw_key.replace(prefix, "")
-                    keys.append(key)
+                    size = entry["Size"]
+                    mtime = entry["LastModified"]
+                    stat = FileStat(key, FileType.FILE, raw_key, size, mtime=mtime)
+                    keys.append(stat)
 
             if "CommonPrefixes" in response:
                 for raw_prefix in response["CommonPrefixes"]:
                     common_prefix = raw_prefix.replace(prefix, "")
-                    keys.append(common_prefix)
+                    stat = FileStat(common_prefix, FileType.DIRECTORY, raw_prefix, 0)
+                    keys.append(stat)
 
             return keys
 
-    def mkdir(self, storage_path: str, recursive: bool = False, exists_ok: bool = False):
+    def mkdir(self, storage_path: str, recursive: bool = False):
 
         self._log.info(f"MKDIR [{storage_path}]")
 
         # No-op in object storage
         pass
 
-    def rm(self, storage_path: str, recursive: bool = False):
+    def rm(self, storage_path: str):
 
         try:
             self._log.info(f"RM [{storage_path}]")
 
-            if recursive:
-                raise RuntimeError("RM (recursive) not available for S3 storage")
-
             object_key = self._resolve_path(storage_path)
             self._client.delete_object(Bucket=self._bucket, Key=object_key)
 
         except aws_ex.ClientError as error:
             raise ex.EStorageRequest(f"Storage error: {str(error)}") from error
 
+    def rmdir(self, storage_path: str):
+
+        raise RuntimeError("RMDIR (recursive) not available for S3 storage")
+
     def read_bytes(self, storage_path: str) -> bytes:
 
         self._log.info(f"READ BYTES [{storage_path}]")
@@ -218,7 +325,7 @@ class S3ObjectStorage(IFileStorage):
         data = self.read_bytes(storage_path)
         return io.BytesIO(data)
 
-    def _read_impl(self, storage_path: str) -> botocore.response.StreamingBody:
+    def _read_impl(self, storage_path: str):
 
         try:
 
@@ -229,7 +336,7 @@ class S3ObjectStorage(IFileStorage):
         except aws_ex.ClientError as error:
             raise ex.EStorageRequest(f"Storage error: {str(error)}") from error
 
-    def write_bytes(self, storage_path: str, data: bytes, overwrite: bool = False):
+    def write_bytes(self, storage_path: str, data: bytes):
 
         try:
             self._log.info(f"WRITE BYTES [{storage_path}]")
@@ -244,43 +351,27 @@ class S3ObjectStorage(IFileStorage):
         except aws_ex.ClientError as error:
             raise ex.EStorageRequest(f"Storage error: {str(error)}") from error
 
-    def write_byte_stream(self, storage_path: str, overwrite: bool = False) -> tp.BinaryIO:
+    def write_byte_stream(self, storage_path: str) -> tp.BinaryIO:
 
         self._log.info(f"WRITE BYTE STREAM [{storage_path}]")
 
-        return self._AwsWriteBuf(self, storage_path, overwrite)
+        return self._AwsWriteBuf(self, storage_path)
 
     class _AwsWriteBuf(io.BytesIO):
 
-        def __init__(self, storage, storage_path, overwrite: bool):
+        def __init__(self, storage, storage_path):
             super().__init__()
             self._storage = storage
             self._storage_path = storage_path
-            self._overwrite = overwrite
             self._written = False
 
         def close(self):
             if not self._written:
                 self.seek(0)
                 data = self.read()
-                self._storage.write_bytes(self._storage_path, data, self._overwrite)
+                self._storage.write_bytes(self._storage_path, data)
                 self._written = True
 
-    # TODO: These methods can be removed from the interface, they are not needed
-    # (storage layer only needs to work in binary mode)
-
-    def read_text(self, storage_path: str, encoding: str = 'utf-8') -> str:
-        raise RuntimeError("READ (text mode) not available for S3 storage")
-
-    def read_text_stream(self, storage_path: str, encoding: str = 'utf-8') -> tp.TextIO:
-        raise RuntimeError("READ (text mode) not available for S3 storage")
-
-    def write_text(self, storage_path: str, data: str, encoding: str = 'utf-8', overwrite: bool = False):
-        raise RuntimeError("WRITE (text mode) not available for S3 storage")
-
-    def write_text_stream(self, storage_path: str, encoding: str = 'utf-8', overwrite: bool = False) -> tp.TextIO:
-        raise RuntimeError("WRITE (text mode) not available for S3 storage")
-
     def _resolve_path(self, storage_path: str) -> str:
 
         if self._prefix is None or self._prefix.strip() == "":
@@ -290,8 +381,3 @@ class S3ObjectStorage(IFileStorage):
             full_path = self._prefix + separator + storage_path
 
         return full_path[1:] if full_path.startswith("/") else full_path
-
-
-# Register the S3 storage plugin
-
-_storage.StorageManager.register_storage_type("S3", S3ObjectStorage, _storage.CommonDataStorage)
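Putting the new provider properties together, an S3 storage configuration might look something like the sketch below (bucket, region and key values are placeholders; runtimeFs chooses between the Arrow-native and boto3 back ends, defaulting to auto):

    s3_properties = {
        "bucket": "example-trac-data",        # required
        "prefix": "tenant-a",                 # optional prefix inside the bucket
        "region": "eu-west-2",
        "credentials": "static",              # or "default" for the ambient AWS credential chain
        "accessKeyId": "<access key id>",
        "secretAccessKey": "<secret access key>",
        "runtimeFs": "arrow"                  # "auto" (default), "arrow" or "boto3"
    }

    provider = AwsStorageProvider(s3_properties)
    fs = provider.get_arrow_native()          # pyarrow SubTreeFileSystem rooted at bucket/prefix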
tracdap/rt/_plugins/storage_azure.py ADDED
@@ -0,0 +1,155 @@
+# Copyright 2023 Accenture Global Solutions Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import typing as tp
+
+# TRAC interfaces
+import tracdap.rt.exceptions as ex
+import tracdap.rt.ext.plugins as plugins
+from tracdap.rt.ext.storage import *
+
+import pyarrow.fs as afs
+
+try:
+    # These dependencies are provided by the optional [azure] feature
+    # For local development, pip install -r requirements_plugins.txt
+    import azure.storage.blob as az_blob  # noqa
+    import adlfs  # noqa
+    __azure_available = True
+except ImportError:
+    adlfs = None
+    __azure_available = False
+
+# Set of common helpers across the core plugins (do not reference rt._impl)
+from . import _helpers
+
+
+class AzureBlobStorageProvider(IStorageProvider):
+
+    # This client depends on the Azure fsspec implementation, since there is no native implementation from Arrow
+    # To enable it, the tracdap package must be installed with the optional [azure] feature
+
+    # Current supported authentication mechanisms are "default" and "access_key"
+    # Client always uses location mode = primary, version aware = False
+
+    STORAGE_ACCOUNT_PROPERTY = "storageAccount"
+    CONTAINER_PROPERTY = "container"
+    PREFIX_PROPERTY = "prefix"
+
+    CREDENTIALS_PROPERTY = "credentials"
+    CREDENTIALS_DEFAULT = "default"
+    CREDENTIALS_ACCESS_KEY = "access_key"
+
+    ACCESS_KEY_PROPERTY = "accessKey"
+
+    RUNTIME_FS_PROPERTY = "runtimeFs"
+    RUNTIME_FS_AUTO = "auto"
+    RUNTIME_FS_FSSPEC = "fsspec"
+    RUNTIME_FS_DEFAULT = RUNTIME_FS_AUTO
+
+    def __init__(self, properties: tp.Dict[str, str]):
+
+        self._log = _helpers.logger_for_object(self)
+        self._properties = properties
+
+        self._runtime_fs = _helpers.get_plugin_property(
+            properties, self.RUNTIME_FS_PROPERTY) \
+            or self.RUNTIME_FS_DEFAULT
+
+        # The Azure SDK is very verbose with logging
+        # Avoid log noise by raising the log level for the Azure namespace
+        azure_log = _helpers.logger_for_namespace("azure.core")
+        azure_log.level = logging.WARNING
+
+    def has_arrow_native(self) -> bool:
+        return True
+
+    def get_arrow_native(self) -> afs.SubTreeFileSystem:
+
+        if self._runtime_fs == self.RUNTIME_FS_AUTO or self._runtime_fs == self.RUNTIME_FS_FSSPEC:
+            azure_fs = self.create_fsspec()
+        else:
+            message = f"Requested runtime FS [{self._runtime_fs}] is not available for Azure storage"
+            self._log.error(message)
+            raise ex.EStartup(message)
+
+        container = _helpers.get_plugin_property(self._properties, self.CONTAINER_PROPERTY)
+        prefix = _helpers.get_plugin_property(self._properties, self.PREFIX_PROPERTY)
+
+        if container is None or container.strip() == "":
+            message = f"Missing required config property [{self.CONTAINER_PROPERTY}] for Azure blob storage"
+            self._log.error(message)
+            raise ex.EConfigParse(message)
+
+        root_path = f"{container}/{prefix}" if prefix else container
+
+        return afs.SubTreeFileSystem(root_path, azure_fs)
+
+    def create_fsspec(self) -> afs.FileSystem:
+
+        azure_fsspec_args = self.setup_client_args()
+        azure_fsspec = adlfs.AzureBlobFileSystem(**azure_fsspec_args)
+
+        return afs.PyFileSystem(afs.FSSpecHandler(azure_fsspec))
+
+    def setup_client_args(self) -> tp.Dict[str, tp.Any]:
+
+        client_args = dict()
+
+        storage_account = _helpers.get_plugin_property(self._properties, self.STORAGE_ACCOUNT_PROPERTY)
+
+        if storage_account is None or len(storage_account.strip()) == 0:
+            message = f"Missing required config property [{self.STORAGE_ACCOUNT_PROPERTY}] for Azure blob storage"
+            self._log.error(message)
+            raise ex.EConfigParse(message)
+
+        client_args["account_name"] = storage_account
+
+        credentials = self.setup_credentials()
+        client_args.update(credentials)
+
+        return client_args
+
+    def setup_credentials(self):
+
+        # Only default (Google ADC) mechanism is supported
+        # Arrow GCP FS does also support access tokens, but ADC is probably all we ever need
+
+        mechanism = _helpers.get_plugin_property(self._properties, self.CREDENTIALS_PROPERTY)
+
+        if mechanism is None or len(mechanism) == 0 or mechanism.lower() == self.CREDENTIALS_DEFAULT:
+            self._log.info(f"Using [{self.CREDENTIALS_DEFAULT}] credentials mechanism")
+            return {"anon": False}
+
+        if mechanism == self.CREDENTIALS_ACCESS_KEY:
+
+            self._log.info(f"Using [{self.CREDENTIALS_ACCESS_KEY}] credentials mechanism")
+
+            access_key = _helpers.get_plugin_property(self._properties, self.ACCESS_KEY_PROPERTY)
+
+            if access_key is None or len(access_key.strip()) == 0:
+                message = f"Missing required config property [{self.ACCESS_KEY_PROPERTY}] for Azure blob storage"
+                raise ex.EConfigParse(message)
+
+            return {"account_key": access_key}
+
+        message = f"Unrecognised credentials mechanism: [{mechanism}]"
+        self._log.error(message)
+        raise ex.EStartup(message)
+
+
+# Only register the plugin if the [azure] feature is available
+if __azure_available:
+    plugins.PluginManager.register_plugin(IStorageProvider, AzureBlobStorageProvider, ["BLOB"])
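By way of illustration, an equivalent configuration for the new Azure provider could look like this (account, container and key values are placeholders; the optional [azure] feature must be installed for the plugin to register):

    azure_properties = {
        "storageAccount": "exampleaccount",   # required
        "container": "trac-data",             # required
        "prefix": "tenant-a",                 # optional
        "credentials": "access_key",          # or "default"
        "accessKey": "<storage account key>",
        "runtimeFs": "fsspec"                 # "auto" (default) or "fsspec"
    }

    provider = AzureBlobStorageProvider(azure_properties)
    fs = provider.get_arrow_native()          # PyFileSystem over adlfs, rooted at container/prefix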