anemoi-utils 0.4.35__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anemoi/utils/_environment.py +3 -0
- anemoi/utils/_version.py +2 -2
- anemoi/utils/checkpoints.py +47 -32
- anemoi/utils/cli.py +38 -5
- anemoi/utils/commands/transfer.py +6 -2
- anemoi/utils/config.py +0 -6
- anemoi/utils/logs.py +34 -6
- anemoi/utils/mlflow/auth.py +39 -1
- anemoi/utils/registry.py +55 -1
- anemoi/utils/remote/__init__.py +1 -3
- anemoi/utils/remote/s3.py +489 -543
- {anemoi_utils-0.4.35.dist-info → anemoi_utils-0.4.36.dist-info}/METADATA +3 -2
- {anemoi_utils-0.4.35.dist-info → anemoi_utils-0.4.36.dist-info}/RECORD +17 -17
- {anemoi_utils-0.4.35.dist-info → anemoi_utils-0.4.36.dist-info}/WHEEL +0 -0
- {anemoi_utils-0.4.35.dist-info → anemoi_utils-0.4.36.dist-info}/entry_points.txt +0 -0
- {anemoi_utils-0.4.35.dist-info → anemoi_utils-0.4.36.dist-info}/licenses/LICENSE +0 -0
- {anemoi_utils-0.4.35.dist-info → anemoi_utils-0.4.36.dist-info}/top_level.txt +0 -0
anemoi/utils/remote/s3.py
CHANGED
@@ -29,7 +29,7 @@ import logging
 import os
 import threading
 from collections.abc import Iterable
-from […]
+from contextlib import closing
 from typing import Any
 
 import tqdm
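Editor's note: the `closing` import added above is used by the chunked transfer loops introduced later in this diff, where the obstore reader/writer handles are wrapped so they are flushed and closed even if the loop raises. A minimal sketch of the pattern, assuming a store created with `obstore.store.from_url` as this file does (the bucket, key, and file names are illustrative, not from the package):

    import obstore
    from contextlib import closing

    store = obstore.store.from_url("s3://my-bucket")  # hypothetical bucket

    # Stream a local file to the store in 1 MiB chunks, as upload_file below does;
    # closing() guarantees the writer is finalised when the block exits.
    with open("data.bin", "rb") as f:
        with closing(obstore.open_writer(store, "data.bin", buffer_size=1024 * 1024)) as g:
            while chunk := f.read(1024 * 1024):
                g.write(chunk)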
@@ -38,752 +38,698 @@ from ..config import load_config
 from ..humanize import bytes_to_human
 from . import BaseDownload
 from . import BaseUpload
+from . import transfer
 
 LOG = logging.getLogger(__name__)
-SECRETS = ["aws_access_key_id", "aws_secret_access_key"]
+SECRETS = ["aws_access_key_id", "aws_secret_access_key", "access_key_id", "secret_access_key"]
 
-# s3_clients are not thread-safe, so we need to create a new client for each thread
 
+MIGRATE = {
+    "aws_access_key_id": "access_key_id",
+    "aws_secret_access_key": "secret_access_key",
+}
 
+CACHE = {}
+LOCK = threading.Lock()
 
-def _s3_config(bucket: str, *, region: str = None) -> Any:
-    """Get an S3 client config for the specified bucket and region.
-
-    […]
-        The name of the S3 bucket.
-    region : str, optional
-        The AWS region of the S3 bucket.
-
-    Returns
-    -------
-    Any
-        The S3 client.
-    """
-    from botocore import UNSIGNED
-
-    boto3_config = {}
-
-    if region:
-        # This is using AWS
-
-        options = {"region_name": region}
-
-        # Anonymous access
-        if not (
-            os.path.exists(os.path.expanduser("~/.aws/credentials"))
-            or ("AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ)
-        ):
-            boto3_config["signature_version"] = UNSIGNED
-
-    else:
-
-        # We may be accessing a different S3 compatible service
-        # Use anemoi.config to get the configuration
-
-        region = "unknown-region"
-
-        options = {"region_name": region}
-        config = load_config(secrets=SECRETS)
-
-        cfg = config.get("object-storage", {})
-        candidate = None
-        for k, v in cfg.items():
-            if isinstance(v, (str, int, float, bool)):
-                options[k] = v
-
-            if isinstance(v, dict):
-                if fnmatch.fnmatch(bucket, k):
-                    if candidate is not None:
-                        raise ValueError(f"Multiple object storage configurations match {bucket}: {candidate} and {k}")
-                    candidate = k
-
-        if candidate is not None:
-            for k, v in cfg.get(candidate, {}).items():
-                if isinstance(v, (str, int, float, bool)):
-                    options[k] = v
-
-        type = options.pop("type", "s3")
-        if type != "s3":
-            raise ValueError(f"Unsupported object storage type {type}")
-
-    […]
-        if isinstance(options, list):
-            return [_(o) for o in options]
-
-        return options
-
-    LOG.debug(f"Using S3 options: {_(options)}")
-
-    return boto3_config, options
-
-
-def […]
-    """[…]
-    region : str, optional
-        The AWS region of the S3 bucket.
-    service : str, optional
-        The AWS service to use, default is "s3".
-    […]
-    region : str, optional
-        The AWS region of the S3 bucket.
-    service : str, optional
-        The AWS service to use, default is "s3".
-    […]
-    """
-    import boto3
-    from botocore.client import Config
-
-    […]
-    boto3_config.update(
-        dict(
-            max_pool_connections=25,
-            request_checksum_calculation="when_required",
-            response_checksum_validation="when_required",
-        )
-    )
-
-    options["config"] = Config(**boto3_config)
-
-    def _(options):
-
-        def __(k, v):
-            if k in SECRETS:
-                return "***"
-            return v
-
-        if isinstance(options, dict):
-            return {k: __(k, v) for k, v in options.items()}
-
-    […]
+
+class S3Object:
+    def __init__(self, url: str) -> None:
+        """Initialise an S3Object from a URL.
+
+        Parameters
+        ----------
+        url : str
+            S3 URL (e.g., 's3://bucket/key').
+        """
+        self.url = url
+        try:
+            s3, empty, self.bucket, self.key = url.split("/", 3)
+        except ValueError:
+            raise ValueError(f"Invalid S3 URL: {url}")
+        assert s3 == "s3:"
+        assert empty == ""
+        self.dirname = f"s3://{self.bucket}"
+
+
+def _s3_object(url_or_object: str | S3Object) -> S3Object:
+    """Convert a string or S3Object to S3Object.
+
+    Parameters
+    ----------
+    url_or_object : str or S3Object
+        S3 URL or S3Object instance.
+
+    Returns
+    -------
+    S3Object
+        S3Object instance.
+    """
+    if isinstance(url_or_object, S3Object):
+        return url_or_object
+
+    if isinstance(url_or_object, str):
+        return S3Object(url_or_object)
+
+    raise TypeError(f"Invalid type for S3 object: {type(url_or_object)}")
+
+
+def _hide_secrets(options: dict | list) -> dict | list:
+    """Hide secret values in options.
+
+    Parameters
+    ----------
+    options : dict or list
+        Options possibly containing secrets.
+
+    Returns
+    -------
+    dict or list
+        Options with secrets hidden.
+    """
+
+    def __(k, v):
+        if k in SECRETS:
+            return "***"
+        return v
+
+    if isinstance(options, dict):
+        return {k: __(k, v) for k, v in options.items()}
+
+    if isinstance(options, list):
+        return [_hide_secrets(o) for o in options]
+
+    return options
+
+
+def _s3_options(obj: str | S3Object) -> dict:
+    """Get S3 options for a given object.
+
+    Parameters
+    ----------
+    obj : str or S3Object
+        S3 URL or S3Object instance.
+
+    Returns
+    -------
+    dict
+        S3 connection options.
+    """
+    obj = _s3_object(obj)
+
+    with LOCK:
+        if obj.dirname in CACHE:
+            return CACHE[obj.dirname]
+
+    options = {}
+
+    # We may be accessing a different S3 compatible service
+    # Use anemoi.config to get the configuration
+
+    config = load_config(secrets=SECRETS)
+
+    cfg = config.get("object-storage", {})
+    candidate = None
+    for k, v in cfg.items():
+        if isinstance(v, (str, int, float, bool)):
+            options[k] = v
+
+        if isinstance(v, dict):
+            if fnmatch.fnmatch(obj.bucket, k):
+                if candidate is not None:
+                    raise ValueError(f"Multiple object storage configurations match {obj.bucket}: {candidate} and {k}")
+                candidate = k
+
+    if candidate is not None:
+        for k, v in cfg.get(candidate, {}).items():
+            if isinstance(v, (str, int, float, bool)):
+                options[k] = v
+
+    type = options.pop("type", "s3")
+    if type != "s3":
+        raise ValueError(f"Unsupported object storage type {type}")
+
+    for k, v in MIGRATE.items():
+        if k in options:
+            LOG.warning(f"Option '{k}' is deprecated, use '{v}' instead")
+            options[v] = options.pop(k)
+
+    LOG.info(f"Using S3 options: {_hide_secrets(options)}")
+
+    with LOCK:
+        CACHE[obj.dirname] = options
+
+    return options
+
+
+def s3_client(obj: str | S3Object) -> Any:
+    """Create an S3 client for the given URL.
+
+    Parameters
+    ----------
+    obj : str or S3Object
+        S3 URL or S3Object instance.
+
+    Returns
+    -------
+    Any
+        S3 client instance.
+    """
+    import obstore
+
+    obj = _s3_object(obj)
+    options = _s3_options(obj)
+    LOG.debug(f"Using S3 options: {_hide_secrets(options)}")
+    return obstore.store.from_url(obj.dirname, **options)
 
 
-    def […]
-        """Get a temporary target path based on the given pattern.
-
-        […]
-            The pattern to format the temporary path.
-
-        […]
-        return target
-
-    def […]
-        """[…]
-            The original target path.
-        temporary_target : str
-            The new target path.
-        """
-        pass
-
-    def […]
-        """[…]
-        ----------
-        target : str
-            The target path to delete.
-        """
-        pass
-        # delete(target)
-
-    def _transfer_file(
-        self,
-        source: str,
-        target: str,
-        overwrite: bool,
-        resume: bool,
-        verbosity: int,
-        threads: int,
-        config: dict = None,
-    ) -> int:
-        """Transfer a file to S3.
-
-        […]
-        source : str
-            The source file path.
-        target : str
-            The target S3 path.
-        overwrite : bool
-            Whether to overwrite the target if it exists.
-        resume : bool
-            Whether to resume the transfer if possible.
-        verbosity : int
-            The verbosity level.
-        threads : int
-            The number of threads to use.
-        config : dict, optional
-            Additional configuration options.
-
-        […]
-        -------
-        int
-            The size of the transferred file.
-
-        Raises
-        ------
-        ValueError
-            If the target already exists and overwrite or resume is not specified.
-        """
-        from botocore.exceptions import ClientError
-
-        assert target.startswith("s3://")
-
-        _, _, bucket, key = target.split("/", 3)
-        s3 = s3_client(bucket)
-
-        […]
-            raise
-        remote_size = None
-
-        if remote_size is not None:
-            if remote_size != size:
-                LOG.warning(
-                    f"{target} already exists, but with different size, re-uploading (remote={remote_size}, local={size})"
-                )
-            elif resume:
-                # LOGGER.info(f"{target} already exists, skipping")
-                return size
-
-        […]
-            raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
-
-        if verbosity > 0:
-            with tqdm.tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024, leave=False) as pbar:
-                s3.upload_file(
-                    source,
-                    bucket,
-                    key,
-                    Callback=lambda x: pbar.update(x),
-                    Config=config,
-                )
-        else:
-            s3.upload_file(source, bucket, key, Config=config)
-
-
-    def […]
-        """[…]
-            The target local path.
-        kwargs : dict
-            Additional arguments for the transfer.
-        """
-        assert source.startswith("s3://")
-
-        […]
-            self.transfer_folder(source=source, target=target, **kwargs)
-        else:
-            self.transfer_file(source=source, target=target, **kwargs)
-
-    def […]
-        """List the objects in the source S3 path.
-
-        […]
-        source : str
-            The source S3 path.
-
-        […]
-        Iterable
-            An iterable of S3 objects.
-        """
-        yield from _list_objects(source)
-
-    def […]
-        ""
-        […]
-        _, _, bucket, _ = source.split("/", 3)
-        return f"s3://{bucket}/{s3_object['Key']}"
-
-    def […]
-        """Get the local path for the S3 object.
-
-        […]
-        ----------
-        s3_object : dict
-            The S3 object.
-        source : str
-            The source S3 path.
-        target : str
-            The target local path.
-
-        […]
-        -------
-        str
-            The local path for the S3 object.
-        """
-        _, _, _, folder = source.split("/", 3)
-        local_path = os.path.join(target, os.path.relpath(s3_object["Key"], folder))
-        os.makedirs(os.path.dirname(local_path), exist_ok=True)
-        return local_path
-
-    def […]
-        """Get the size of the S3 object.
-
-        […]
-        s3_object : dict
-            The S3 object.
-
-        […]
-        int
-            The size of the S3 object.
-        """
-        return s3_object["Size"]
-
-    def _transfer_file(
-        self,
-        source: str,
-        target: str,
-        overwrite: bool,
-        resume: bool,
-        verbosity: int,
-        threads: int,
-        config: dict = None,
-    ) -> int:
-        """Transfer a file from S3 to the local filesystem.
-
-        […]
-            Whether to overwrite the target if it exists.
-        resume : bool
-            Whether to resume the transfer if possible.
-        verbosity : int
-            The verbosity level.
-        threads : int
-            The number of threads to use.
-        config : dict, optional
-            Additional configuration options.
-
-        […]
-        int
-            The size of the transferred file.
-        """
-        […]
-        s3 = s3_client(bucket)
-
-        if key.endswith("/"):
-            return 0
-
-        try:
-            response = s3.head_object(Bucket=bucket, Key=key)
-        except s3.exceptions.ClientError as e:
-            if e.response["Error"]["Code"] == "404":
-                raise ValueError(f"{source} does not exist ({bucket}, {key})")
-            raise
-
-        size = int(response["ContentLength"])
-
-        if verbosity > 0:
-            LOG.info(f"{self.action} {source} to {target} ({bytes_to_human(size)})")
-
-        if overwrite:
-            resume = False
-
-        if resume:
-            if os.path.exists(target):
-                local_size = os.path.getsize(target)
-                if local_size != size:
-                    LOG.warning(
-                        f"{target} already with different size, re-downloading (remote={size}, local={local_size})"
-                    )
-                else:
-                    # if verbosity > 0:
-                    #     LOGGER.info(f"{target} already exists, skipping")
-                    return size
-
-        if os.path.exists(target) and not overwrite:
-            raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
-
-        if verbosity > 0:
-            with tqdm.tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024, leave=False) as pbar:
-                s3.download_file(
-                    bucket,
-                    key,
-                    target,
-                    Callback=lambda x: pbar.update(x),
-                    Config=config,
-                )
-        else:
-            s3.download_file(bucket, key, target, Config=config)
-
-        return size
+def upload_file(source: str, target: str, overwrite: bool, resume: bool, verbosity: int) -> int:
+    """Upload a file to S3.
+
+    Parameters
+    ----------
+    source : str
+        Local file path to upload.
+    target : str
+        S3 target URL.
+    overwrite : bool
+        Overwrite existing file if True.
+    resume : bool
+        Resume upload if True.
+    verbosity : int
+        Verbosity level.
+
+    Returns
+    -------
+    int
+        Number of bytes uploaded.
+    """
+    import obstore
+
+    obj = _s3_object(target)
+
+    s3 = s3_client(obj)
+    size = os.path.getsize(source)
+
+    if verbosity > 0:
+        LOG.info(f"Upload {source} to {target} ({bytes_to_human(size)})")
+
+    try:
+        remote_size = object_info(obj)["size"]
+    except FileNotFoundError:
+        remote_size = None
+
+    if remote_size is not None:
+        if remote_size != size:
+            LOG.warning(
+                f"{target} already exists, but with different size, re-uploading (remote={remote_size}, local={size})"
+            )
+        elif resume:
+            return size
+
+    if remote_size is not None and not overwrite and not resume:
+        raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
+
+    with tqdm.tqdm(
+        desc=obj.key,
+        total=size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+        leave=verbosity >= 2,
+        delay=0 if verbosity > 0 else 10,
+    ) as pbar:
+        chunk_size = 1024 * 1024
+        total = size
+        with open(source, "rb") as f:
+            with closing(obstore.open_writer(s3, obj.key, buffer_size=chunk_size)) as g:
+                while total > 0:
+                    chunk = f.read(min(chunk_size, total))
+                    g.write(chunk)
+                    pbar.update(len(chunk))
+                    total -= len(chunk)
+
+    return size
+
+
+def download_file(source: str, target: str, overwrite: bool, resume: bool, verbosity: int) -> int:
+    """Download a file from S3.
+
+    Parameters
+    ----------
+    source : str
+        S3 source URL.
+    target : str
+        Local file path to save.
+    overwrite : bool
+        Overwrite existing file if True.
+    resume : bool
+        Resume download if True.
+    verbosity : int
+        Verbosity level.
+
+    Returns
+    -------
+    int
+        Number of bytes downloaded.
+    """
+    import obstore
+
+    obj = _s3_object(source)
+
+    s3 = s3_client(obj)
+
+    size = object_info(source)["size"]
+
+    if verbosity > 0:
+        LOG.info(f"Download {source} to {target} ({bytes_to_human(size)})")
+
+    if overwrite:
+        resume = False
+
+    if resume:
+        if os.path.exists(target):
+            local_size = os.path.getsize(target)
+            if local_size != size:
+                LOG.warning(f"{target} already with different size, re-downloading (remote={size}, local={local_size})")
+            else:
+                return size
+
+    if os.path.exists(target) and not overwrite:
+        raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
+
+    with tqdm.tqdm(
+        desc=obj.key,
+        total=size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+        leave=verbosity >= 2,
+        delay=0 if verbosity > 0 else 10,
+    ) as pbar:
+        chunk_size = 1024 * 1024
+        total = size
+        with closing(obstore.open_reader(s3, obj.key, buffer_size=chunk_size)) as f:
+            with open(target, "wb") as g:
+                while total > 0:
+                    chunk = f.read(min(chunk_size, total))
+                    g.write(chunk)
+                    pbar.update(len(chunk))
+                    total -= len(chunk)
+
+    return size
 
 
-def […]
-    """[…]
-
-    Parameters
-    ----------
-    target : str
-        […]
-    batch : bool, optional
-        […]
-
-    Returns
-    -------
-    Iterable
-        […]
-    """
-    s3 = s3_client(bucket)
-
-    for […]
-        if […]
-        […]
-        else:
-            yield from objects
+def _list_objects(target: str, batch: bool = False) -> Iterable[list[dict]] | Iterable[dict]:
+
+    import obstore
+
+    """
+    List objects in an S3 folder.
+
+    Parameters
+    ----------
+    target : str
+        S3 folder URL.
+    batch : bool, optional
+        Yield batches if True, else yield individual objects.
+
+    Returns
+    -------
+    Iterable
+        Iterable of objects or batches.
+    """
+    obj = _s3_object(target)
+
+    s3 = s3_client(obj)
+
+    for files in obstore.list(s3, obj.key + "/", chunk_size=1024):
+        if batch:
+            yield files
+        else:
+            yield from files
 
 
 def delete_folder(target: str) -> None:
-    """[…]
+
+    import obstore
+
+    """
+    Delete all objects in an S3 folder.
 
     Parameters
     ----------
     target : str
-        […]
+        S3 folder URL.
     """
-
-    s3 = s3_client([…])
+    obj = _s3_object(target)
+    s3 = s3_client(obj)
 
     total = 0
-    for batch in _list_objects([…]):
+    for batch in _list_objects(obj, batch=True):
+        paths = [o["path"] for o in batch]
         LOG.info(f"Deleting {len(batch):,} objects from {target}")
-        […]
+        obstore.delete(s3, paths)
         total += len(batch)
         LOG.info(f"Deleted {len(batch):,} objects (total={total:,})")
 
 
 def delete_file(target: str) -> None:
-    """[…]
-    ----------
-    target : str
-        The target S3 file path.
-    """
-    from botocore.exceptions import ClientError
-
-    s3 = s3_client(bucket)
-
-    try:
-        s3.head_object(Bucket=bucket, Key=key)
-        exits = True
-    except ClientError as e:
-        if e.response["Error"]["Code"] != "404":
-            raise
-        exits = False
-
-    if not exits:
+    import obstore
+
+    obj = _s3_object(target)
+
+    s3 = s3_client(obj)
+
+    if not object_exists(obj):
         LOG.warning(f"{target} does not exist. Did you mean to delete a folder? Then add a trailing '/'")
         return
 
     LOG.info(f"Deleting {target}")
-    […]
+    obstore.delete(s3, obj.key)
     LOG.info(f"{target} is deleted")
 
 
 def delete(target: str) -> None:
-    """Delete a file or […]
+    """Delete a file or folder from S3.
 
     Parameters
     ----------
     target : str
-        […]
+        S3 URL (file or folder).
     """
 
-    assert target.startswith("s3://")
-
     if target.endswith("/"):
         delete_folder(target)
     else:
         delete_file(target)
 
 
-def list_folder(folder: str) -> Iterable:
-    """List […]
+def list_folder(folder: str) -> Iterable[dict]:
+    """List objects in an S3 folder.
 
     Parameters
     ----------
     folder : str
-        […]
+        S3 folder URL.
 
     Returns
     -------
-    […]
+    Iterable
+        Iterable of objects.
     """
-
-    assert folder.startswith("s3://")
-    if not folder.endswith("/"):
-        folder += "/"
-
-    _, _, bucket, prefix = folder.split("/", 3)
-
-    s3 = s3_client(bucket)
-    paginator = s3.get_paginator("list_objects_v2")
-
-    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
-        if "CommonPrefixes" in page:
-            yield from [folder + _["Prefix"] for _ in page.get("CommonPrefixes") if _["Prefix"] != "/"]
-        if "Contents" in page:
-            yield from [folder + _["Key"] for _ in page.get("Contents")]
+    return _list_objects(folder)
 
 
 def object_info(target: str) -> dict:
-    """Get information about an object […]
+    """Get information about an S3 object.
 
     Parameters
     ----------
     target : str
-        […]
+        S3 object URL.
 
     Returns
     -------
     dict
-        […]
+        Object metadata.
     """
-    […]
-    s3 […]
-
-    try:
-        return s3.head_object(Bucket=bucket, Key=key)
-    except s3.exceptions.ClientError as e:
-        if e.response["Error"]["Code"] == "404":
-            raise FileNotFoundError(f"{target} does not exist")
-        raise
+    obj = _s3_object(target)
+    s3 = s3_client(obj)
+    return s3.head(obj.key)
 
 
 def object_exists(target: str) -> bool:
-    """Check if an object exists.
+    """Check if an S3 object exists.
 
     Parameters
     ----------
     target : str
-        […]
+        S3 object URL.
 
     Returns
     -------
     bool
-        True if […]
+        True if object exists, False otherwise.
     """
-
-    s3 = s3_client(bucket)
+    obj = _s3_object(target)
+    s3 = s3_client(obj)
 
     try:
-        s3.[…]
+        s3.head(obj.key)
         return True
-    except […]
-        […]
-        return False
-        raise
+    except FileNotFoundError:
+        return False
 
 
-def […]
-    """[…]
+def get_object(target: str) -> bool:
+    """Check if an S3 object exists.
 
     Parameters
     ----------
     target : str
-        […]
+        S3 object URL.
 
     Returns
     -------
-    […]
+    bool
+        True if object exists, False otherwise.
     """
+    obj = _s3_object(target)
+    s3 = s3_client(obj)
 
-    s3 = s3_client(bucket)
-
-    return s3.get_object_acl(Bucket=bucket, Key=key)
+    return s3.get(obj.key).bytes()
 
 
 def download(source: str, target: str, *args, **kwargs) -> None:
-    """Download […]
+    """Download from S3 using transfer utility.
 
     Parameters
     ----------
     source : str
-        […]
+        S3 source URL.
     target : str
-        […]
-    args
-        Additional […]
-    kwargs
+        Local target path.
+    *args
+        Additional arguments.
+    **kwargs
         Additional keyword arguments.
     """
-    from . import transfer
 
     assert source.startswith("s3://"), f"source {source} should start with 's3://'"
     return transfer(source, target, *args, **kwargs)
 
 
 def upload(source: str, target: str, *args, **kwargs) -> None:
-    """Upload […]
+    """Upload to S3 using transfer utility.
 
     Parameters
     ----------
     source : str
-        […]
+        Local source path.
     target : str
-        […]
-    args
-        Additional […]
-    kwargs
+        S3 target URL.
+    *args
+        Additional arguments.
+    **kwargs
         Additional keyword arguments.
     """
-    from . import transfer
 
     assert target.startswith("s3://"), f"target {target} should start with 's3://'"
     return transfer(source, target, *args, **kwargs)
 
 
-[…]
-    target : str
-        The URL of a file or a folder on S3. The URL should start with 's3://'.
-[…]
+##########################
+# Generic transfer classes
+##########################
+class S3Upload(BaseUpload):
+
+    def get_temporary_target(self, target: str, pattern: str) -> str:
+        """Get temporary target path for upload.
+
+        Parameters
+        ----------
+        target : str
+            S3 target URL.
+        pattern : str
+            Pattern for temporary naming.
+
+        Returns
+        -------
+        str
+            Temporary target path.
+        """
+        return target
+
+    def rename_target(self, target: str, temporary_target: str) -> None:
+        """Rename temporary target to final target.
+
+        Parameters
+        ----------
+        target : str
+            Final target path.
+        temporary_target : str
+            Temporary target path.
+        """
+        pass
+
+    def delete_target(self, target: str) -> None:
+        """Delete target from S3.
+
+        Parameters
+        ----------
+        target : str
+            S3 target URL.
+        """
+
+        pass
+
+    def _transfer_file(self, source: str, target: str, overwrite: bool, resume: bool, verbosity: int, **kwargs) -> int:
+        """Transfer a file to S3.
+
+        Parameters
+        ----------
+        source : str
+            Local source path.
+        target : str
+            S3 target URL.
+        overwrite : bool
+            Overwrite existing file if True.
+        resume : bool
+            Resume upload if True.
+        verbosity : int
+            Verbosity level.
+        kwargs : dict
+            Additional keyword arguments.
+
+        Returns
+        -------
+        int
+            Number of bytes uploaded.
+        """
+
+        return upload_file(source, target, overwrite, resume, verbosity)
+
+
+class S3Download(BaseDownload):
+
+    def copy(self, source: str, target: str, **kwargs) -> None:
+        """Copy file or folder from S3.
+
+        Parameters
+        ----------
+        source : str
+            S3 source URL.
+        target : str
+            Local target path.
+        **kwargs
+            Additional keyword arguments.
+        """
+
+        assert source.startswith("s3://")
+
+        if source.endswith("/"):
+            self.transfer_folder(source=source, target=target, **kwargs)
+        else:
+            self.transfer_file(source=source, target=target, **kwargs)
+
+    def list_source(self, source: str) -> Iterable[dict]:
+        """List objects in S3 source folder.
+
+        Parameters
+        ----------
+        source : str
+            S3 source folder URL.
+
+        Returns
+        -------
+        Iterable
+            Iterable of objects.
+        """
+        yield from _list_objects(source)
+
+    def source_path(self, s3_object: dict, source: str) -> str:
+        """Get S3 path for a source object.
+
+        Parameters
+        ----------
+        s3_object : dict
+            S3 object metadata.
+        source : str
+            S3 source folder URL.
+
+        Returns
+        -------
+        str
+            S3 object path.
+        """
+        object = _s3_object(source)
+        return f"s3://{object.bucket}/{s3_object['path']}"
+
+    def target_path(self, s3_object: dict, source: str, target: str) -> str:
+        """Get local target path for an S3 object.
+
+        Parameters
+        ----------
+        s3_object : dict
+            S3 object metadata.
+        source : str
+            S3 source folder URL.
+        target : str
+            Local target folder.
+
+        Returns
+        -------
+        str
+            Local target path.
+        """
+
+        object = _s3_object(source)
+        local_path = os.path.join(target, os.path.relpath(s3_object["path"], object.key))
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        return local_path
+
+    def source_size(self, s3_object: dict) -> int:
+        """Get size of S3 object.
+
+        Parameters
+        ----------
+        s3_object : dict
+            S3 object metadata.
+
+        Returns
+        -------
+        int
+            Size in bytes.
+        """
+        return s3_object["size"]
+
+    def _transfer_file(self, source: str, target: str, overwrite: bool, resume: bool, verbosity: int, **kwargs) -> int:
+        """Transfer a file from S3.
+
+        Parameters
+        ----------
+        source : str
+            S3 source URL.
+        target : str
+            Local target path.
+        overwrite : bool
+            Overwrite existing file if True.
+        resume : bool
+            Resume download if True.
+        verbosity : int
+            Verbosity level.
+        kwargs : dict
+            Additional keyword arguments.
+
+        Returns
+        -------
+        int
+            Number of bytes downloaded.
+        """
+
+        return download_file(source, target, overwrite, resume, verbosity)