anemoi-utils: anemoi_utils-0.4.34-py3-none-any.whl → anemoi_utils-0.4.36-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of anemoi-utils might be problematic. Click here for more details.

anemoi/utils/remote/s3.py CHANGED
@@ -29,7 +29,7 @@ import logging
29
29
  import os
30
30
  import threading
31
31
  from collections.abc import Iterable
32
- from copy import deepcopy
32
+ from contextlib import closing
33
33
  from typing import Any
34
34
 
35
35
  import tqdm
@@ -38,752 +38,698 @@ from ..config import load_config
38
38
  from ..humanize import bytes_to_human
39
39
  from . import BaseDownload
40
40
  from . import BaseUpload
41
+ from . import transfer
41
42
 
42
43
  LOG = logging.getLogger(__name__)
43
- SECRETS = ["aws_access_key_id", "aws_secret_access_key"]
44
+ SECRETS = ["aws_access_key_id", "aws_secret_access_key", "access_key_id", "secret_access_key"]
44
45
 
45
- # s3_clients are not thread-safe, so we need to create a new client for each thread
46
46
 
47
- thread_local = threading.local()
47
+ MIGRATE = {
48
+ "aws_access_key_id": "access_key_id",
49
+ "aws_secret_access_key": "secret_access_key",
50
+ }
48
51
 
52
+ CACHE = {}
53
+ LOCK = threading.Lock()
49
54
 
50
- def _s3_config(bucket: str, *, region: str = None) -> Any:
51
- """Get an S3 client config for the specified bucket and region.
52
55
 
53
- Parameters
54
- ----------
55
- bucket : str
56
- The name of the S3 bucket.
57
- region : str, optional
58
- The AWS region of the S3 bucket.
59
-
60
- Returns
61
- -------
62
- Any
63
- The S3 client.
64
- """
65
- from botocore import UNSIGNED
66
-
67
- boto3_config = {}
68
-
69
- if region:
70
- # This is using AWS
71
-
72
- options = {"region_name": region}
73
-
74
- # Anonymous access
75
- if not (
76
- os.path.exists(os.path.expanduser("~/.aws/credentials"))
77
- or ("AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ)
78
- ):
79
- boto3_config["signature_version"] = UNSIGNED
80
-
81
- else:
82
-
83
- # We may be accessing a different S3 compatible service
84
- # Use anemoi.config to get the configuration
85
-
86
- region = "unknown-region"
87
-
88
- options = {"region_name": region}
89
- config = load_config(secrets=SECRETS)
90
-
91
- cfg = config.get("object-storage", {})
92
- candidate = None
93
- for k, v in cfg.items():
94
- if isinstance(v, (str, int, float, bool)):
95
- options[k] = v
96
-
97
- if isinstance(v, dict):
98
- if fnmatch.fnmatch(bucket, k):
99
- if candidate is not None:
100
- raise ValueError(f"Multiple object storage configurations match {bucket}: {candidate} and {k}")
101
- candidate = k
102
-
103
- if candidate is not None:
104
- for k, v in cfg.get(candidate, {}).items():
105
- if isinstance(v, (str, int, float, bool)):
106
- options[k] = v
107
-
108
- type = options.pop("type", "s3")
109
- if type != "s3":
110
- raise ValueError(f"Unsupported object storage type {type}")
56
+ class S3Object:
57
+ def __init__(self, url: str) -> None:
58
+ """Initialise an S3Object from a URL.
111
59
 
112
- if "config" in options:
113
- boto3_config.update(options["config"])
114
- del options["config"]
115
-
116
- def _(options):
117
-
118
- def __(k, v):
119
- if k in SECRETS:
120
- return "***"
121
- return v
122
-
123
- if isinstance(options, dict):
124
- return {k: __(k, v) for k, v in options.items()}
125
-
126
- if isinstance(options, list):
127
- return [_(o) for o in options]
128
-
129
- return options
130
-
131
- LOG.debug(f"Using S3 options: {_(options)}")
132
-
133
- return boto3_config, options
60
+ Parameters
61
+ ----------
62
+ url : str
63
+ S3 URL (e.g., 's3://bucket/key').
64
+ """
65
+ self.url = url
66
+ try:
67
+ s3, empty, self.bucket, self.key = url.split("/", 3)
68
+ except ValueError:
69
+ raise ValueError(f"Invalid S3 URL: {url}")
70
+ assert s3 == "s3:"
71
+ assert empty == ""
72
+ self.dirname = f"s3://{self.bucket}"
134
73
 
135
74
 
136
- def s3_options(bucket: str, *, region: str = None, service: str = "s3") -> dict:
137
- """Get the S3 configuration for the specified bucket and region.
75
+ def _s3_object(url_or_object: str | S3Object) -> S3Object:
76
+ """Convert a string or S3Object to S3Object.
138
77
 
139
78
  Parameters
140
79
  ----------
141
- bucket : str
142
- The name of the S3 bucket.
143
- region : str, optional
144
- The AWS region of the S3 bucket.
145
- service : str, optional
146
- The AWS service to use, default is "s3".
80
+ url_or_object : str or S3Object
81
+ S3 URL or S3Object instance.
147
82
 
148
83
  Returns
149
84
  -------
150
- dict
151
- The S3 configuration.
85
+ S3Object
86
+ S3Object instance.
152
87
  """
153
- _, options = _s3_config(bucket, region=region)
154
- return options
88
+ if isinstance(url_or_object, S3Object):
89
+ return url_or_object
90
+
91
+ if isinstance(url_or_object, str):
92
+ return S3Object(url_or_object)
155
93
 
94
+ raise TypeError(f"Invalid type for S3 object: {type(url_or_object)}")
156
95
 
157
- def s3_client(bucket: str, *, region: str = None, service: str = "s3") -> Any:
158
- """Get an S3 client for the specified bucket and region.
96
+
97
+ def _hide_secrets(options: dict | list) -> dict | list:
98
+ """Hide secret values in options.
159
99
 
160
100
  Parameters
161
101
  ----------
162
- bucket : str
163
- The name of the S3 bucket.
164
- region : str, optional
165
- The AWS region of the S3 bucket.
166
- service : str, optional
167
- The AWS service to use, default is "s3".
102
+ options : dict or list
103
+ Options possibly containing secrets.
168
104
 
169
105
  Returns
170
106
  -------
171
- Any
172
- The S3 client.
107
+ dict or list
108
+ Options with secrets hidden.
173
109
  """
174
- import boto3
175
- from botocore.client import Config
176
110
 
177
- if not hasattr(thread_local, "s3_clients"):
178
- thread_local.s3_clients = {}
111
+ def __(k, v):
112
+ if k in SECRETS:
113
+ return "***"
114
+ return v
179
115
 
180
- key = f"{bucket}-{region}-{service}"
116
+ if isinstance(options, dict):
117
+ return {k: __(k, v) for k, v in options.items()}
181
118
 
182
- if key in thread_local.s3_clients:
183
- return thread_local.s3_clients[key]
119
+ if isinstance(options, list):
120
+ return [_hide_secrets(o) for o in options]
184
121
 
185
- boto3_config, options = _s3_config(bucket, region=region)
186
-
187
- boto3_config.update(
188
- dict(
189
- max_pool_connections=25,
190
- request_checksum_calculation="when_required",
191
- response_checksum_validation="when_required",
192
- )
193
- )
194
-
195
- options["config"] = Config(**boto3_config)
196
-
197
- def _(options):
198
-
199
- def __(k, v):
200
- if k in SECRETS:
201
- return "***"
202
- return v
122
+ return options
203
123
 
204
- if isinstance(options, dict):
205
- return {k: __(k, v) for k, v in options.items()}
206
124
 
207
- if isinstance(options, list):
208
- return [_(o) for o in options]
125
+ def _s3_options(obj: str | S3Object) -> dict:
126
+ """Get S3 options for a given object.
209
127
 
210
- return options
128
+ Parameters
129
+ ----------
130
+ obj : str or S3Object
131
+ S3 URL or S3Object instance.
211
132
 
212
- LOG.debug(f"Using S3 options: {_(options)}")
133
+ Returns
134
+ -------
135
+ dict
136
+ S3 connection options.
137
+ """
213
138
 
214
- thread_local.s3_clients[key] = boto3.client(service, **options)
139
+ obj = _s3_object(obj)
215
140
 
216
- return thread_local.s3_clients[key]
141
+ with LOCK:
142
+ if obj.dirname in CACHE:
143
+ return CACHE[obj.dirname]
217
144
 
145
+ options = {}
218
146
 
219
- class S3Upload(BaseUpload):
147
+ # We may be accessing a different S3 compatible service
148
+ # Use anemoi.config to get the configuration
220
149
 
221
- def get_temporary_target(self, target: str, pattern: str) -> str:
222
- """Get a temporary target path based on the given pattern.
150
+ config = load_config(secrets=SECRETS)
223
151
 
224
- Parameters
225
- ----------
226
- target : str
227
- The original target path.
228
- pattern : str
229
- The pattern to format the temporary path.
152
+ cfg = config.get("object-storage", {})
153
+ candidate = None
154
+ for k, v in cfg.items():
155
+ if isinstance(v, (str, int, float, bool)):
156
+ options[k] = v
230
157
 
231
- Returns
232
- -------
233
- str
234
- The temporary target path.
235
- """
236
- return target
158
+ if isinstance(v, dict):
159
+ if fnmatch.fnmatch(obj.bucket, k):
160
+ if candidate is not None:
161
+ raise ValueError(f"Multiple object storage configurations match {obj.bucket}: {candidate} and {k}")
162
+ candidate = k
237
163
 
238
- def rename_target(self, target: str, temporary_target: str) -> None:
239
- """Rename the target to a new target path.
164
+ if candidate is not None:
165
+ for k, v in cfg.get(candidate, {}).items():
166
+ if isinstance(v, (str, int, float, bool)):
167
+ options[k] = v
240
168
 
241
- Parameters
242
- ----------
243
- target : str
244
- The original target path.
245
- temporary_target : str
246
- The new target path.
247
- """
248
- pass
169
+ type = options.pop("type", "s3")
170
+ if type != "s3":
171
+ raise ValueError(f"Unsupported object storage type {type}")
249
172
 
250
- def delete_target(self, target: str) -> None:
251
- """Delete the target path.
173
+ for k, v in MIGRATE.items():
174
+ if k in options:
175
+ LOG.warning(f"Option '{k}' is deprecated, use '{v}' instead")
176
+ options[v] = options.pop(k)
252
177
 
253
- Parameters
254
- ----------
255
- target : str
256
- The target path to delete.
257
- """
258
- pass
259
- # delete(target)
260
-
261
- def _transfer_file(
262
- self,
263
- source: str,
264
- target: str,
265
- overwrite: bool,
266
- resume: bool,
267
- verbosity: int,
268
- threads: int,
269
- config: dict = None,
270
- ) -> int:
271
- """Transfer a file to S3.
178
+ LOG.info(f"Using S3 options: {_hide_secrets(options)}")
272
179
 
273
- Parameters
274
- ----------
275
- source : str
276
- The source file path.
277
- target : str
278
- The target S3 path.
279
- overwrite : bool
280
- Whether to overwrite the target if it exists.
281
- resume : bool
282
- Whether to resume the transfer if possible.
283
- verbosity : int
284
- The verbosity level.
285
- threads : int
286
- The number of threads to use.
287
- config : dict, optional
288
- Additional configuration options.
180
+ with LOCK:
181
+ CACHE[obj.dirname] = options
289
182
 
290
- Returns
291
- -------
292
- int
293
- The size of the transferred file.
294
-
295
- Raises
296
- ------
297
- ValueError
298
- If the target already exists and overwrite or resume is not specified.
299
- """
300
- from botocore.exceptions import ClientError
301
-
302
- assert target.startswith("s3://")
183
+ return options
303
184
 
304
- _, _, bucket, key = target.split("/", 3)
305
- s3 = s3_client(bucket)
306
185
 
307
- size = os.path.getsize(source)
186
+ def s3_client(obj: str | S3Object) -> Any:
187
+ """Create an S3 client for the given URL.
308
188
 
309
- if verbosity > 0:
310
- LOG.info(f"{self.action} {source} to {target} ({bytes_to_human(size)})")
189
+ Parameters
190
+ ----------
191
+ obj : str or S3Object
192
+ S3 URL or S3Object instance.
311
193
 
312
- try:
313
- results = s3.head_object(Bucket=bucket, Key=key)
314
- remote_size = int(results["ContentLength"])
315
- except ClientError as e:
316
- if e.response["Error"]["Code"] != "404":
317
- raise
318
- remote_size = None
319
-
320
- if remote_size is not None:
321
- if remote_size != size:
322
- LOG.warning(
323
- f"{target} already exists, but with different size, re-uploading (remote={remote_size}, local={size})"
324
- )
325
- elif resume:
326
- # LOGGER.info(f"{target} already exists, skipping")
327
- return size
194
+ Returns
195
+ -------
196
+ Any
197
+ S3 client instance.
198
+ """
328
199
 
329
- if remote_size is not None and not overwrite and not resume:
330
- raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
331
-
332
- if verbosity > 0:
333
- with tqdm.tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024, leave=False) as pbar:
334
- s3.upload_file(
335
- source,
336
- bucket,
337
- key,
338
- Callback=lambda x: pbar.update(x),
339
- Config=config,
340
- )
341
- else:
342
- s3.upload_file(source, bucket, key, Config=config)
200
+ import obstore
343
201
 
344
- return size
202
+ obj = _s3_object(obj)
203
+ options = _s3_options(obj)
204
+ LOG.debug(f"Using S3 options: {_hide_secrets(options)}")
205
+ return obstore.store.from_url(obj.dirname, **options)
345
206
 
346
207
 
347
- class S3Download(BaseDownload):
208
+ def upload_file(source: str, target: str, overwrite: bool, resume: bool, verbosity: int) -> int:
209
+ """Upload a file to S3.
348
210
 
349
- def copy(self, source: str, target: str, **kwargs) -> None:
350
- """Copy a file or folder from S3 to the local filesystem.
211
+ Parameters
212
+ ----------
213
+ source : str
214
+ Local file path to upload.
215
+ target : str
216
+ S3 target URL.
217
+ overwrite : bool
218
+ Overwrite existing file if True.
219
+ resume : bool
220
+ Resume upload if True.
221
+ verbosity : int
222
+ Verbosity level.
351
223
 
352
- Parameters
353
- ----------
354
- source : str
355
- The source S3 path.
356
- target : str
357
- The target local path.
358
- kwargs : dict
359
- Additional arguments for the transfer.
360
- """
361
- assert source.startswith("s3://")
224
+ Returns
225
+ -------
226
+ int
227
+ Number of bytes uploaded.
228
+ """
362
229
 
363
- if source.endswith("/"):
364
- self.transfer_folder(source=source, target=target, **kwargs)
365
- else:
366
- self.transfer_file(source=source, target=target, **kwargs)
230
+ import obstore
367
231
 
368
- def list_source(self, source: str) -> Iterable:
369
- """List the objects in the source S3 path.
232
+ obj = _s3_object(target)
370
233
 
371
- Parameters
372
- ----------
373
- source : str
374
- The source S3 path.
234
+ s3 = s3_client(obj)
235
+ size = os.path.getsize(source)
375
236
 
376
- Returns
377
- -------
378
- Iterable
379
- An iterable of S3 objects.
380
- """
381
- yield from _list_objects(source)
237
+ if verbosity > 0:
238
+ LOG.info(f"Upload {source} to {target} ({bytes_to_human(size)})")
382
239
 
383
- def source_path(self, s3_object: dict, source: str) -> str:
384
- """Get the S3 path of the object.
240
+ try:
241
+ remote_size = object_info(obj)["size"]
242
+ except FileNotFoundError:
243
+ remote_size = None
244
+
245
+ if remote_size is not None:
246
+ if remote_size != size:
247
+ LOG.warning(
248
+ f"{target} already exists, but with different size, re-uploading (remote={remote_size}, local={size})"
249
+ )
250
+ elif resume:
251
+ return size
252
+
253
+ if remote_size is not None and not overwrite and not resume:
254
+ raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
255
+
256
+ with tqdm.tqdm(
257
+ desc=obj.key,
258
+ total=size,
259
+ unit="B",
260
+ unit_scale=True,
261
+ unit_divisor=1024,
262
+ leave=verbosity >= 2,
263
+ delay=0 if verbosity > 0 else 10,
264
+ ) as pbar:
265
+ chunk_size = 1024 * 1024
266
+ total = size
267
+ with open(source, "rb") as f:
268
+ with closing(obstore.open_writer(s3, obj.key, buffer_size=chunk_size)) as g:
269
+ while total > 0:
270
+ chunk = f.read(min(chunk_size, total))
271
+ g.write(chunk)
272
+ pbar.update(len(chunk))
273
+ total -= len(chunk)
274
+
275
+ return size
276
+
277
+
278
+ def download_file(source: str, target: str, overwrite: bool, resume: bool, verbosity: int) -> int:
279
+ """Download a file from S3.
385
280
 
386
- Parameters
387
- ----------
388
- s3_object : dict
389
- The S3 object.
390
- source : str
391
- The source S3 path.
281
+ Parameters
282
+ ----------
283
+ source : str
284
+ S3 source URL.
285
+ target : str
286
+ Local file path to save.
287
+ overwrite : bool
288
+ Overwrite existing file if True.
289
+ resume : bool
290
+ Resume download if True.
291
+ verbosity : int
292
+ Verbosity level.
392
293
 
393
- Returns
394
- -------
395
- str
396
- The S3 path of the object.
397
- """
398
- _, _, bucket, _ = source.split("/", 3)
399
- return f"s3://{bucket}/{s3_object['Key']}"
294
+ Returns
295
+ -------
296
+ int
297
+ Number of bytes downloaded.
298
+ """
400
299
 
401
- def target_path(self, s3_object: dict, source: str, target: str) -> str:
402
- """Get the local path for the S3 object.
300
+ import obstore
403
301
 
404
- Parameters
405
- ----------
406
- s3_object : dict
407
- The S3 object.
408
- source : str
409
- The source S3 path.
410
- target : str
411
- The target local path.
302
+ obj = _s3_object(source)
412
303
 
413
- Returns
414
- -------
415
- str
416
- The local path for the S3 object.
417
- """
418
- _, _, _, folder = source.split("/", 3)
419
- local_path = os.path.join(target, os.path.relpath(s3_object["Key"], folder))
420
- os.makedirs(os.path.dirname(local_path), exist_ok=True)
421
- return local_path
304
+ s3 = s3_client(obj)
422
305
 
423
- def source_size(self, s3_object: dict) -> int:
424
- """Get the size of the S3 object.
306
+ size = object_info(source)["size"]
425
307
 
426
- Parameters
427
- ----------
428
- s3_object : dict
429
- The S3 object.
308
+ if verbosity > 0:
309
+ LOG.info(f"Download {source} to {target} ({bytes_to_human(size)})")
430
310
 
431
- Returns
432
- -------
433
- int
434
- The size of the S3 object.
435
- """
436
- return s3_object["Size"]
437
-
438
- def _transfer_file(
439
- self,
440
- source: str,
441
- target: str,
442
- overwrite: bool,
443
- resume: bool,
444
- verbosity: int,
445
- threads: int,
446
- config: dict = None,
447
- ) -> int:
448
- """Transfer a file from S3 to the local filesystem.
311
+ if overwrite:
312
+ resume = False
449
313
 
450
- Parameters
451
- ----------
452
- source : str
453
- The source S3 path.
454
- target : str
455
- The target local path.
456
- overwrite : bool
457
- Whether to overwrite the target if it exists.
458
- resume : bool
459
- Whether to resume the transfer if possible.
460
- verbosity : int
461
- The verbosity level.
462
- threads : int
463
- The number of threads to use.
464
- config : dict, optional
465
- Additional configuration options.
314
+ if resume:
315
+ if os.path.exists(target):
316
+ local_size = os.path.getsize(target)
317
+ if local_size != size:
318
+ LOG.warning(f"{target} already with different size, re-downloading (remote={size}, local={local_size})")
319
+ else:
320
+ return size
466
321
 
467
- Returns
468
- -------
469
- int
470
- The size of the transferred file.
322
+ if os.path.exists(target) and not overwrite:
323
+ raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
471
324
 
472
- Raises
473
- ------
474
- ValueError
475
- If the target does not exist on S3.
476
- """
477
- # from boto3.s3.transfer import TransferConfig
325
+ with tqdm.tqdm(
326
+ desc=obj.key,
327
+ total=size,
328
+ unit="B",
329
+ unit_scale=True,
330
+ unit_divisor=1024,
331
+ leave=verbosity >= 2,
332
+ delay=0 if verbosity > 0 else 10,
333
+ ) as pbar:
334
+ chunk_size = 1024 * 1024
335
+ total = size
336
+ with closing(obstore.open_reader(s3, obj.key, buffer_size=chunk_size)) as f:
337
+ with open(target, "wb") as g:
338
+ while total > 0:
339
+ chunk = f.read(min(chunk_size, total))
340
+ g.write(chunk)
341
+ pbar.update(len(chunk))
342
+ total -= len(chunk)
478
343
 
479
- _, _, bucket, key = source.split("/", 3)
480
- s3 = s3_client(bucket)
344
+ return size
481
345
 
482
- if key.endswith("/"):
483
- return 0
484
346
 
485
- try:
486
- response = s3.head_object(Bucket=bucket, Key=key)
487
- except s3.exceptions.ClientError as e:
488
- if e.response["Error"]["Code"] == "404":
489
- raise ValueError(f"{source} does not exist ({bucket}, {key})")
490
- raise
491
-
492
- size = int(response["ContentLength"])
493
-
494
- if verbosity > 0:
495
- LOG.info(f"{self.action} {source} to {target} ({bytes_to_human(size)})")
496
-
497
- if overwrite:
498
- resume = False
499
-
500
- if resume:
501
- if os.path.exists(target):
502
- local_size = os.path.getsize(target)
503
- if local_size != size:
504
- LOG.warning(
505
- f"{target} already with different size, re-downloading (remote={size}, local={local_size})"
506
- )
507
- else:
508
- # if verbosity > 0:
509
- # LOGGER.info(f"{target} already exists, skipping")
510
- return size
511
-
512
- if os.path.exists(target) and not overwrite:
513
- raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
514
-
515
- if verbosity > 0:
516
- with tqdm.tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024, leave=False) as pbar:
517
- s3.download_file(
518
- bucket,
519
- key,
520
- target,
521
- Callback=lambda x: pbar.update(x),
522
- Config=config,
523
- )
524
- else:
525
- s3.download_file(bucket, key, target, Config=config)
526
-
527
- return size
347
+ def _list_objects(target: str, batch: bool = False) -> Iterable[list[dict]] | Iterable[dict]:
528
348
 
349
+ import obstore
529
350
 
530
- def _list_objects(target: str, batch: bool = False) -> Iterable:
531
- """List the objects in the target S3 path.
351
+ """
352
+ List objects in an S3 folder.
532
353
 
533
354
  Parameters
534
355
  ----------
535
356
  target : str
536
- The target S3 path.
357
+ S3 folder URL.
537
358
  batch : bool, optional
538
- Whether to return objects in batches, by default False.
359
+ Yield batches if True, else yield individual objects.
539
360
 
540
361
  Returns
541
362
  -------
542
363
  Iterable
543
- An iterable of S3 objects.
364
+ Iterable of objects or batches.
544
365
  """
545
- _, _, bucket, prefix = target.split("/", 3)
546
- s3 = s3_client(bucket)
366
+ obj = _s3_object(target)
547
367
 
548
- paginator = s3.get_paginator("list_objects_v2")
368
+ s3 = s3_client(obj)
549
369
 
550
- for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
551
- if "Contents" in page:
552
- objects = deepcopy(page["Contents"])
553
- if batch:
554
- yield objects
555
- else:
556
- yield from objects
370
+ for files in obstore.list(s3, obj.key + "/", chunk_size=1024):
371
+ if batch:
372
+ yield files
373
+ else:
374
+ yield from files
557
375
 
558
376
 
559
377
  def delete_folder(target: str) -> None:
560
- """Delete a folder from S3.
378
+
379
+ import obstore
380
+
381
+ """
382
+ Delete all objects in an S3 folder.
561
383
 
562
384
  Parameters
563
385
  ----------
564
386
  target : str
565
- The target S3 folder path.
387
+ S3 folder URL.
566
388
  """
567
- _, _, bucket, _ = target.split("/", 3)
568
- s3 = s3_client(bucket)
389
+ obj = _s3_object(target)
390
+ s3 = s3_client(obj)
569
391
 
570
392
  total = 0
571
- for batch in _list_objects(target, batch=True):
393
+ for batch in _list_objects(obj, batch=True):
394
+ paths = [o["path"] for o in batch]
572
395
  LOG.info(f"Deleting {len(batch):,} objects from {target}")
573
- s3.delete_objects(Bucket=bucket, Delete={"Objects": [{"Key": o["Key"]} for o in batch]})
396
+ obstore.delete(s3, paths)
574
397
  total += len(batch)
575
398
  LOG.info(f"Deleted {len(batch):,} objects (total={total:,})")
576
399
 
577
400
 
578
401
  def delete_file(target: str) -> None:
579
- """Delete a file from S3.
402
+ import obstore
580
403
 
581
- Parameters
582
- ----------
583
- target : str
584
- The target S3 file path.
585
- """
586
- from botocore.exceptions import ClientError
404
+ obj = _s3_object(target)
587
405
 
588
- _, _, bucket, key = target.split("/", 3)
589
- s3 = s3_client(bucket)
406
+ s3 = s3_client(obj)
590
407
 
591
- try:
592
- s3.head_object(Bucket=bucket, Key=key)
593
- exits = True
594
- except ClientError as e:
595
- if e.response["Error"]["Code"] != "404":
596
- raise
597
- exits = False
598
-
599
- if not exits:
408
+ if not object_exists(obj):
600
409
  LOG.warning(f"{target} does not exist. Did you mean to delete a folder? Then add a trailing '/'")
601
410
  return
602
411
 
603
412
  LOG.info(f"Deleting {target}")
604
- s3.delete_object(Bucket=bucket, Key=key)
413
+ obstore.delete(s3, obj.key)
605
414
  LOG.info(f"{target} is deleted")
606
415
 
607
416
 
608
417
  def delete(target: str) -> None:
609
- """Delete a file or a folder from S3.
418
+ """Delete a file or folder from S3.
610
419
 
611
420
  Parameters
612
421
  ----------
613
422
  target : str
614
- The URL of a file or a folder on S3. The URL should start with 's3://'.
423
+ S3 URL (file or folder).
615
424
  """
616
425
 
617
- assert target.startswith("s3://")
618
-
619
426
  if target.endswith("/"):
620
427
  delete_folder(target)
621
428
  else:
622
429
  delete_file(target)
623
430
 
624
431
 
625
- def list_folder(folder: str) -> Iterable:
626
- """List the subfolders in a folder on S3.
432
+ def list_folder(folder: str) -> Iterable[dict]:
433
+ """List objects in an S3 folder.
627
434
 
628
435
  Parameters
629
436
  ----------
630
437
  folder : str
631
- The URL of a folder on S3. The URL should start with 's3://'.
438
+ S3 folder URL.
632
439
 
633
440
  Returns
634
441
  -------
635
- list
636
- A list of the subfolder names in the folder.
442
+ Iterable
443
+ Iterable of objects.
637
444
  """
638
-
639
- assert folder.startswith("s3://")
640
- if not folder.endswith("/"):
641
- folder += "/"
642
-
643
- _, _, bucket, prefix = folder.split("/", 3)
644
-
645
- s3 = s3_client(bucket)
646
- paginator = s3.get_paginator("list_objects_v2")
647
-
648
- for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
649
- if "CommonPrefixes" in page:
650
- yield from [folder + _["Prefix"] for _ in page.get("CommonPrefixes") if _["Prefix"] != "/"]
651
- if "Contents" in page:
652
- yield from [folder + _["Key"] for _ in page.get("Contents")]
445
+ return _list_objects(folder)
653
446
 
654
447
 
655
448
  def object_info(target: str) -> dict:
656
- """Get information about an object on S3.
449
+ """Get information about an S3 object.
657
450
 
658
451
  Parameters
659
452
  ----------
660
453
  target : str
661
- The URL of a file or a folder on S3. The URL should start with 's3://'.
454
+ S3 object URL.
662
455
 
663
456
  Returns
664
457
  -------
665
458
  dict
666
- A dictionary with information about the object.
459
+ Object metadata.
667
460
  """
668
-
669
- _, _, bucket, key = target.split("/", 3)
670
- s3 = s3_client(bucket)
671
-
672
- try:
673
- return s3.head_object(Bucket=bucket, Key=key)
674
- except s3.exceptions.ClientError as e:
675
- if e.response["Error"]["Code"] == "404":
676
- raise FileNotFoundError(f"{target} does not exist")
677
- raise
461
+ obj = _s3_object(target)
462
+ s3 = s3_client(obj)
463
+ return s3.head(obj.key)
678
464
 
679
465
 
680
466
  def object_exists(target: str) -> bool:
681
- """Check if an object exists.
467
+ """Check if an S3 object exists.
682
468
 
683
469
  Parameters
684
470
  ----------
685
471
  target : str
686
- The URL of a file or a folder on S3. The URL should start with 's3://'.
472
+ S3 object URL.
687
473
 
688
474
  Returns
689
475
  -------
690
476
  bool
691
- True if the object exists, False otherwise.
477
+ True if object exists, False otherwise.
692
478
  """
693
-
694
- _, _, bucket, key = target.split("/", 3)
695
- s3 = s3_client(bucket)
479
+ obj = _s3_object(target)
480
+ s3 = s3_client(obj)
696
481
 
697
482
  try:
698
- s3.head_object(Bucket=bucket, Key=key)
483
+ s3.head(obj.key)
699
484
  return True
700
- except s3.exceptions.ClientError as e:
701
- if e.response["Error"]["Code"] == "404":
702
- return False
703
- raise
485
+ except FileNotFoundError:
486
+ return False
704
487
 
705
488
 
706
- def object_acl(target: str) -> dict:
707
- """Get information about an object's ACL on S3.
489
+ def get_object(target: str) -> bool:
490
+ """Check if an S3 object exists.
708
491
 
709
492
  Parameters
710
493
  ----------
711
494
  target : str
712
- The URL of a file or a folder on S3. The URL should start with 's3://'.
495
+ S3 object URL.
713
496
 
714
497
  Returns
715
498
  -------
716
- dict
717
- A dictionary with information about the object's ACL.
499
+ bool
500
+ True if object exists, False otherwise.
718
501
  """
502
+ obj = _s3_object(target)
503
+ s3 = s3_client(obj)
719
504
 
720
- _, _, bucket, key = target.split("/", 3)
721
- s3 = s3_client(bucket)
722
-
723
- return s3.get_object_acl(Bucket=bucket, Key=key)
505
+ return s3.get(obj.key).bytes()
724
506
 
725
507
 
726
508
  def download(source: str, target: str, *args, **kwargs) -> None:
727
- """Download a file or folder from S3 to the local filesystem.
509
+ """Download from S3 using transfer utility.
728
510
 
729
511
  Parameters
730
512
  ----------
731
513
  source : str
732
- The source S3 path.
514
+ S3 source URL.
733
515
  target : str
734
- The target local path.
735
- args : tuple
736
- Additional positional arguments.
737
- kwargs : dict
516
+ Local target path.
517
+ *args
518
+ Additional arguments.
519
+ **kwargs
738
520
  Additional keyword arguments.
739
521
  """
740
- from . import transfer
741
522
 
742
523
  assert source.startswith("s3://"), f"source {source} should start with 's3://'"
743
524
  return transfer(source, target, *args, **kwargs)
744
525
 
745
526
 
746
527
  def upload(source: str, target: str, *args, **kwargs) -> None:
747
- """Upload a file or folder to S3.
528
+ """Upload to S3 using transfer utility.
748
529
 
749
530
  Parameters
750
531
  ----------
751
532
  source : str
752
- The source file or folder path.
533
+ Local source path.
753
534
  target : str
754
- The target S3 path.
755
- args : tuple
756
- Additional positional arguments.
757
- kwargs : dict
535
+ S3 target URL.
536
+ *args
537
+ Additional arguments.
538
+ **kwargs
758
539
  Additional keyword arguments.
759
540
  """
760
- from . import transfer
761
541
 
762
542
  assert target.startswith("s3://"), f"target {target} should start with 's3://'"
763
543
  return transfer(source, target, *args, **kwargs)
764
544
 
765
545
 
766
- def quotas(target: str) -> dict:
767
- """Get the quotas for an S3 bucket.
546
+ ##########################
547
+ # Generic transfer classes
548
+ ##########################
549
+ class S3Upload(BaseUpload):
768
550
 
769
- Parameters
770
- ----------
771
- target : str
772
- The URL of a file or a folder on S3. The URL should start with 's3://'.
551
+ def get_temporary_target(self, target: str, pattern: str) -> str:
552
+ """Get temporary target path for upload.
773
553
 
774
- Returns
775
- -------
776
- dict
777
- A dictionary with the quotas for the bucket.
778
- """
779
- from botocore.exceptions import ClientError
554
+ Parameters
555
+ ----------
556
+ target : str
557
+ S3 target URL.
558
+ pattern : str
559
+ Pattern for temporary naming.
780
560
 
781
- _, _, bucket, _ = target.split("/", 3)
782
- s3 = s3_client(bucket, service="service-quotas")
561
+ Returns
562
+ -------
563
+ str
564
+ Temporary target path.
565
+ """
566
+ return target
783
567
 
784
- try:
785
- return s3.list_service_quotas(ServiceCode="ec2")
786
- except ClientError as e:
787
- if e.response["Error"]["Code"] == "404":
788
- raise ValueError(f"{target} does not exist")
789
- raise
568
+ def rename_target(self, target: str, temporary_target: str) -> None:
569
+ """Rename temporary target to final target.
570
+
571
+ Parameters
572
+ ----------
573
+ target : str
574
+ Final target path.
575
+ temporary_target : str
576
+ Temporary target path.
577
+ """
578
+ pass
579
+
580
+ def delete_target(self, target: str) -> None:
581
+ """Delete target from S3.
582
+
583
+ Parameters
584
+ ----------
585
+ target : str
586
+ S3 target URL.
587
+ """
588
+
589
+ pass
590
+
591
+ def _transfer_file(self, source: str, target: str, overwrite: bool, resume: bool, verbosity: int, **kwargs) -> int:
592
+ """Transfer a file to S3.
593
+
594
+ Parameters
595
+ ----------
596
+ source : str
597
+ Local source path.
598
+ target : str
599
+ S3 target URL.
600
+ overwrite : bool
601
+ Overwrite existing file if True.
602
+ resume : bool
603
+ Resume upload if True.
604
+ verbosity : int
605
+ Verbosity level.
606
+ kwargs : dict
607
+ Additional keyword arguments.
608
+
609
+ Returns
610
+ -------
611
+ int
612
+ Number of bytes uploaded.
613
+ """
614
+
615
+ return upload_file(source, target, overwrite, resume, verbosity)
616
+
617
+
618
+ class S3Download(BaseDownload):
619
+
620
+ def copy(self, source: str, target: str, **kwargs) -> None:
621
+ """Copy file or folder from S3.
622
+
623
+ Parameters
624
+ ----------
625
+ source : str
626
+ S3 source URL.
627
+ target : str
628
+ Local target path.
629
+ **kwargs
630
+ Additional keyword arguments.
631
+ """
632
+
633
+ assert source.startswith("s3://")
634
+
635
+ if source.endswith("/"):
636
+ self.transfer_folder(source=source, target=target, **kwargs)
637
+ else:
638
+ self.transfer_file(source=source, target=target, **kwargs)
639
+
640
+ def list_source(self, source: str) -> Iterable[dict]:
641
+ """List objects in S3 source folder.
642
+
643
+ Parameters
644
+ ----------
645
+ source : str
646
+ S3 source folder URL.
647
+
648
+ Returns
649
+ -------
650
+ Iterable
651
+ Iterable of objects.
652
+ """
653
+ yield from _list_objects(source)
654
+
655
+ def source_path(self, s3_object: dict, source: str) -> str:
656
+ """Get S3 path for a source object.
657
+
658
+ Parameters
659
+ ----------
660
+ s3_object : dict
661
+ S3 object metadata.
662
+ source : str
663
+ S3 source folder URL.
664
+
665
+ Returns
666
+ -------
667
+ str
668
+ S3 object path.
669
+ """
670
+ object = _s3_object(source)
671
+ return f"s3://{object.bucket}/{s3_object['path']}"
672
+
673
+ def target_path(self, s3_object: dict, source: str, target: str) -> str:
674
+ """Get local target path for an S3 object.
675
+
676
+ Parameters
677
+ ----------
678
+ s3_object : dict
679
+ S3 object metadata.
680
+ source : str
681
+ S3 source folder URL.
682
+ target : str
683
+ Local target folder.
684
+
685
+ Returns
686
+ -------
687
+ str
688
+ Local target path.
689
+ """
690
+
691
+ object = _s3_object(source)
692
+ local_path = os.path.join(target, os.path.relpath(s3_object["path"], object.key))
693
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
694
+ return local_path
695
+
696
+ def source_size(self, s3_object: dict) -> int:
697
+ """Get size of S3 object.
698
+
699
+ Parameters
700
+ ----------
701
+ s3_object : dict
702
+ S3 object metadata.
703
+
704
+ Returns
705
+ -------
706
+ int
707
+ Size in bytes.
708
+ """
709
+ return s3_object["size"]
710
+
711
+ def _transfer_file(self, source: str, target: str, overwrite: bool, resume: bool, verbosity: int, **kwargs) -> int:
712
+ """Transfer a file from S3.
713
+
714
+ Parameters
715
+ ----------
716
+ source : str
717
+ S3 source URL.
718
+ target : str
719
+ Local target path.
720
+ overwrite : bool
721
+ Overwrite existing file if True.
722
+ resume : bool
723
+ Resume download if True.
724
+ verbosity : int
725
+ Verbosity level.
726
+ kwargs : dict
727
+ Additional keyword arguments.
728
+
729
+ Returns
730
+ -------
731
+ int
732
+ Number of bytes downloaded.
733
+ """
734
+
735
+ return download_file(source, target, overwrite, resume, verbosity)