anemoi-utils 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of anemoi-utils might be problematic.
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/.pre-commit-config.yaml +1 -1
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/PKG-INFO +1 -1
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/conf.py +8 -4
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/index.rst +2 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/_version.py +2 -2
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/cli.py +1 -1
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/config.py +60 -12
- anemoi_utils-0.3.6/src/anemoi/utils/s3.py +463 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi_utils.egg-info/PKG-INFO +1 -1
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/tests/test_utils.py +3 -1
- anemoi_utils-0.3.4/src/anemoi/utils/s3.py +0 -280
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/.github/workflows/python-publish.yml +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/.gitignore +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/.readthedocs.yaml +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/LICENSE +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/README.md +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/Makefile +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/_static/logo.png +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/_static/style.css +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/_templates/.gitkeep +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/installing.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/checkpoints.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/config.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/dates.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/grib.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/humanize.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/provenance.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/s3.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/modules/text.rst +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/requirements.txt +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/pyproject.toml +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/setup.cfg +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/__init__.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/__main__.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/caching.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/checkpoints.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/commands/__init__.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/commands/checkpoint.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/dates.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/grib.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/humanize.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/mars/__init__.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/mars/mars.yaml +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/provenance.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/text.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/timer.py +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi_utils.egg-info/SOURCES.txt +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi_utils.egg-info/dependency_links.txt +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi_utils.egg-info/entry_points.txt +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi_utils.egg-info/requires.txt +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi_utils.egg-info/top_level.txt +0 -0
- {anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/tests/test_dates.py +0 -0
{anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: anemoi-utils
-Version: 0.3.4
+Version: 0.3.6
 Summary: A package to hold various functions to support training of ML models on ECMWF data.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License: Apache License
{anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/conf.py
@@ -93,12 +93,16 @@ intersphinx_mapping = {
         "https://anemoi-inference.readthedocs.io/en/latest/",
         ("../../anemoi-inference/docs/_build/html/objects.inv", None),
     ),
+    "anemoi-graphs": (
+        "https://anemoi-graphs.readthedocs.io/en/latest/",
+        ("../../anemoi-graphs/docs/_build/html/objects.inv", None),
+    ),
+    "anemoi-registry": (
+        "https://anemoi-registry.readthedocs.io/en/latest/",
+        ("../../anemoi-registry/docs/_build/html/objects.inv", None),
+    ),
 }
 
-
-# https://www.notion.so/Deepnote-Launch-Buttons-63c642a5e875463495ed2341e83a4b2a
-
-
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages. See the documentation for
{anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/docs/index.rst
@@ -47,8 +47,10 @@ of the *Anemoi* packages.
 - :ref:`anemoi-utils <anemoi-utils:index-page>`
 - :ref:`anemoi-datasets <anemoi-datasets:index-page>`
 - :ref:`anemoi-models <anemoi-models:index-page>`
+- :ref:`anemoi-graphs <anemoi-graphs:index-page>`
 - :ref:`anemoi-training <anemoi-training:index-page>`
 - :ref:`anemoi-inference <anemoi-inference:index-page>`
+- :ref:`anemoi-registry <anemoi-registry:index-page>`
 
 *********
 License
{anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi/utils/config.py
@@ -6,8 +6,12 @@
 # nor does it submit to any jurisdiction.
 
 
+import json
 import logging
 import os
+import threading
+
+import yaml
 
 try:
     import tomllib  # Only available since 3.11
@@ -40,10 +44,49 @@ class DotDict(dict):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+
         for k, v in self.items():
             if isinstance(v, dict):
                 self[k] = DotDict(v)
 
+            if isinstance(v, list):
+                self[k] = [DotDict(i) if isinstance(i, dict) else i for i in v]
+
+            if isinstance(v, tuple):
+                self[k] = [DotDict(i) if isinstance(i, dict) else i for i in v]
+
+    @classmethod
+    def from_file(cls, path: str):
+        _, ext = os.path.splitext(path)
+        if ext == ".yaml" or ext == ".yml":
+            return cls.from_yaml_file(path)
+        elif ext == ".json":
+            return cls.from_json_file(path)
+        elif ext == ".toml":
+            return cls.from_toml_file(path)
+        else:
+            raise ValueError(f"Unknown file extension {ext}")
+
+    @classmethod
+    def from_yaml_file(cls, path: str):
+        with open(path, "r") as file:
+            data = yaml.safe_load(file)
+
+        return cls(data)
+
+    @classmethod
+    def from_json_file(cls, path: str):
+        with open(path, "r") as file:
+            data = json.load(file)
+
+        return cls(data)
+
+    @classmethod
+    def from_toml_file(cls, path: str):
+        with open(path, "r") as file:
+            data = tomllib.load(file)
+        return cls(data)
+
     def __getattr__(self, attr):
         try:
             return self[attr]
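Note on the DotDict changes above: dictionaries nested inside lists and tuples are now wrapped recursively, and the new from_file/from_yaml_file/from_json_file/from_toml_file classmethods build a DotDict straight from a file, dispatching on the extension. A hypothetical usage sketch (the file name is invented, not taken from the package's docs or tests)::

    from anemoi.utils.config import DotDict

    d = DotDict(model={"layers": [{"type": "conv"}, {"type": "pool"}]})
    assert d.model.layers[0].type == "conv"  # dicts inside lists become DotDicts too

    # from_file() dispatches on the extension: .yaml/.yml, .json or .toml
    config = DotDict.from_file("training.yaml")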
@@ -60,16 +103,10 @@ class DotDict(dict):
 
 
 CONFIG = None
+CONFIG_LOCK = threading.Lock()
 
 
-def load_config():
-    """Load the configuration from `~/.anemoi.toml`.
-
-    Returns
-    -------
-    DotDict
-        The configuration
-    """
+def _load_config():
     global CONFIG
     if CONFIG is not None:
         return CONFIG
@@ -86,9 +123,20 @@ def load_config():
     return DotDict(CONFIG)
 
 
-def check_config_mode():
-    """
+def load_config():
+    """Load the configuration from `~/.anemoi.toml`.
+
+    Returns
+    -------
+    DotDict
+        The configuration
+    """
+    with CONFIG_LOCK:
+        return _load_config()
+
 
+def check_config_mode():
     conf = os.path.expanduser("~/.anemoi.toml")
-
-
+    mode = os.stat(conf).st_mode
+    if mode & 0o777 != 0o600:
+        raise SystemError(f"Configuration file {conf} is not secure. " "Please run `chmod 600 ~/.anemoi.toml`.")
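Note on the config.py changes above: the loading logic moves into a private _load_config(), the public load_config() now takes CONFIG_LOCK so the one-time parse of ~/.anemoi.toml cannot race between threads, and check_config_mode() now raises SystemError unless the file mode is exactly 0o600. A hypothetical illustration of the thread-safety this buys (not from the package)::

    import threading

    from anemoi.utils.config import load_config

    # All eight threads share the module-level CONFIG cache; the lock ensures
    # only one of them actually reads and parses ~/.anemoi.toml.
    workers = [threading.Thread(target=load_config) for _ in range(8)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()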
anemoi_utils-0.3.6/src/anemoi/utils/s3.py (new file)
@@ -0,0 +1,463 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+"""This module provides functions to upload, download, list and delete files and folders on S3.
+The functions of this package expect that the AWS credentials are set up in the environment,
+typically by setting the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables or
+by creating a `~/.aws/credentials` file. It is also possible to set the `endpoint_url` in the same file
+to use a different S3 compatible service::
+
+    [default]
+    endpoint_url = https://some-storage.somewhere.world
+    aws_access_key_id = xxxxxxxxxxxxxxxxxxxxxxxx
+    aws_secret_access_key = xxxxxxxxxxxxxxxxxxxxxxxx
+
+"""
+
+import concurrent
+import logging
+import os
+import threading
+from copy import deepcopy
+
+import tqdm
+
+from .config import check_config_mode
+from .config import load_config
+from .humanize import bytes
+
+LOGGER = logging.getLogger(__name__)
+
+
+# s3_clients are not thread-safe, so we need to create a new client for each thread
+
+thread_local = threading.local()
+
+
+def s3_client(bucket):
+    import boto3
+
+    config = load_config()
+    if "object-storage" in config:
+        check_config_mode()
+
+    if not hasattr(thread_local, "s3_clients"):
+        thread_local.s3_clients = {}
+
+    if bucket not in thread_local.s3_clients:
+
+        options = {}
+        options.update(config.get("object-storage", {}))
+        options.update(config.get("object-storage", {}).get(bucket, {}))
+
+        type = options.pop("type", "s3")
+        if type != "s3":
+            raise ValueError(f"Unsupported object storage type {type}")
+
+        if "config" in options:
+            from botocore.client import Config
+
+            options["config"] = Config(**options["config"])
+            del options["config"]
+
+        thread_local.s3_clients[bucket] = boto3.client("s3", **options)
+
+    return thread_local.s3_clients[bucket]
+
+
+class Transfer:
+
+    def transfer_folder(self, *, source, target, overwrite=False, resume=False, verbosity=1, threads=1):
+        assert verbosity == 1, verbosity
+
+        # from boto3.s3.transfer import TransferConfig
+        # config = TransferConfig(use_threads=False)
+        config = None
+        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
+            try:
+                if verbosity > 0:
+                    LOGGER.info(f"{self.action} {source} to {target}")
+
+                total = 0
+
+                futures = []
+                for name in self.list_source(source):
+
+                    futures.append(
+                        executor.submit(
+                            self.transfer_file,
+                            source=self.source_path(name, source),
+                            target=self.target_path(name, source, target),
+                            overwrite=overwrite,
+                            resume=resume,
+                            verbosity=verbosity - 1,
+                            config=config,
+                        )
+                    )
+                    total += self.source_size(name)
+
+                    if len(futures) % 10000 == 0:
+                        if verbosity > 0:
+                            LOGGER.info(f"Preparing transfer, {len(futures):,} files... ({bytes(total)})")
+                        done, _ = concurrent.futures.wait(
+                            futures,
+                            timeout=0.001,
+                            return_when=concurrent.futures.FIRST_EXCEPTION,
+                        )
+                        # Trigger exceptions if any
+                        for future in done:
+                            future.result()
+
+                if verbosity > 0:
+                    LOGGER.info(f"{self.action} {len(futures):,} files ({bytes(total)})")
+                    with tqdm.tqdm(total=total, unit="B", unit_scale=True, unit_divisor=1024) as pbar:
+                        for future in futures:
+                            pbar.update(future.result())
+                else:
+                    for future in futures:
+                        future.result()
+
+            except Exception:
+                executor.shutdown(wait=False, cancel_futures=True)
+                raise
+
+
+class Upload(Transfer):
+    action = "Uploading"
+
+    def list_source(self, source):
+        for root, _, files in os.walk(source):
+            for file in files:
+                yield os.path.join(root, file)
+
+    def source_path(self, local_path, source):
+        return local_path
+
+    def target_path(self, source_path, source, target):
+        relative_path = os.path.relpath(source_path, source)
+        s3_path = os.path.join(target, relative_path)
+        return s3_path
+
+    def source_size(self, local_path):
+        return os.path.getsize(local_path)
+
+    def transfer_file(self, source, target, overwrite, resume, verbosity, config=None):
+
+        from botocore.exceptions import ClientError
+
+        assert target.startswith("s3://")
+
+        _, _, bucket, key = target.split("/", 3)
+        s3 = s3_client(bucket)
+
+        size = os.path.getsize(source)
+
+        if verbosity > 0:
+            LOGGER.info(f"{self.action} {source} to {target} ({bytes(size)})")
+
+        try:
+            results = s3.head_object(Bucket=bucket, Key=key)
+            remote_size = int(results["ContentLength"])
+        except ClientError as e:
+            if e.response["Error"]["Code"] != "404":
+                raise
+            remote_size = None
+
+        if remote_size is not None:
+            if remote_size != size:
+                LOGGER.warning(
+                    f"{target} already exists, but with different size, re-uploading (remote={remote_size}, local={size})"
+                )
+            elif resume:
+                # LOGGER.info(f"{target} already exists, skipping")
+                return size
+
+        if remote_size is not None and not overwrite and not resume:
+            raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
+
+        if verbosity > 0:
+            with tqdm.tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024, leave=False) as pbar:
+                s3.upload_file(source, bucket, key, Callback=lambda x: pbar.update(x), Config=config)
+        else:
+            s3.upload_file(source, bucket, key, Config=config)
+
+        return size
+
+
+class Download(Transfer):
+    action = "Downloading"
+
+    def list_source(self, source):
+        yield from _list_objects(source)
+
+    def source_path(self, s3_object, source):
+        _, _, bucket, _ = source.split("/", 3)
+        return f"s3://{bucket}/{s3_object['Key']}"
+
+    def target_path(self, s3_object, source, target):
+        _, _, _, folder = source.split("/", 3)
+        local_path = os.path.join(target, os.path.relpath(s3_object["Key"], folder))
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        return local_path
+
+    def source_size(self, s3_object):
+        return s3_object["Size"]
+
+    def transfer_file(self, source, target, overwrite, resume, verbosity, config=None):
+        # from boto3.s3.transfer import TransferConfig
+
+        _, _, bucket, key = source.split("/", 3)
+        s3 = s3_client(bucket)
+
+        try:
+            response = s3.head_object(Bucket=bucket, Key=key)
+        except s3.exceptions.ClientError as e:
+            print(e.response["Error"]["Code"], e.response["Error"]["Message"], bucket, key)
+            if e.response["Error"]["Code"] == "404":
+                raise ValueError(f"{source} does not exist ({bucket}, {key})")
+            raise
+
+        size = int(response["ContentLength"])
+
+        if verbosity > 0:
+            LOGGER.info(f"Downloading {source} to {target} ({bytes(size)})")
+
+        if overwrite:
+            resume = False
+
+        if resume:
+            if os.path.exists(target):
+                local_size = os.path.getsize(target)
+                if local_size != size:
+                    LOGGER.warning(
+                        f"{target} already with different size, re-downloading (remote={size}, local={size})"
+                    )
+                else:
+                    # if verbosity > 0:
+                    #     LOGGER.info(f"{target} already exists, skipping")
+                    return size
+
+        if os.path.exists(target) and not overwrite:
+            raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'resume' to skip")
+
+        if verbosity > 0:
+            with tqdm.tqdm(total=size, unit="B", unit_scale=True, unit_divisor=1024, leave=False) as pbar:
+                s3.download_file(bucket, key, target, Callback=lambda x: pbar.update(x), Config=config)
+        else:
+            s3.download_file(bucket, key, target, Config=config)
+
+        return size
+
+
+def upload(source, target, *, overwrite=False, resume=False, verbosity=1, threads=1):
+    """Upload a file or a folder to S3.
+
+    Parameters
+    ----------
+    source : str
+        A path to a file or a folder to upload.
+    target : str
+        A URL to a file or a folder on S3. The url should start with 's3://'.
+    overwrite : bool, optional
+        If the data is already on S3 it will be overwritten, by default False
+    resume : bool, optional
+        If the data is already on S3 it will not be uploaded, unless the remote file
+        has a different size, by default False
+    threads : int, optional
+        The number of threads to use when uploading a directory, by default 1
+    """
+
+    uploader = Upload()
+    if os.path.isdir(source):
+        uploader.transfer_folder(
+            source=source, target=target, overwrite=overwrite, resume=resume, verbosity=verbosity, threads=threads
+        )
+    else:
+        uploader.transfer_file(source=source, target=target, overwrite=overwrite, resume=resume, verbosity=verbosity)
+
+
+def download(source, target, *, overwrite=False, resume=False, verbosity=1, threads=1):
+    """Download a file or a folder from S3.
+
+    Parameters
+    ----------
+    source : str
+        The URL of a file or a folder on S3. The url should start with 's3://'. If the URL ends with a '/' it is
+        assumed to be a folder, otherwise it is assumed to be a file.
+    target : str
+        The local path where the file or folder will be downloaded.
+    overwrite : bool, optional
+        If false, files which have already been downloaded will be skipped, unless their size
+        does not match their size on S3, by default False
+    resume : bool, optional
+        If the data is already on local it will not be downloaded, unless the remote file
+        has a different size, by default False
+    threads : int, optional
+        The number of threads to use when downloading a directory, by default 1
+    """
+    assert source.startswith("s3://")
+
+    downloader = Download()
+
+    if source.endswith("/"):
+        downloader.transfer_folder(
+            source=source,
+            target=target,
+            overwrite=overwrite,
+            resume=resume,
+            verbosity=verbosity,
+            threads=threads,
+        )
+    else:
+        downloader.transfer_file(
+            source=source,
+            target=target,
+            overwrite=overwrite,
+            resume=resume,
+            verbosity=verbosity,
+        )
+
+
+def _list_objects(target, batch=False):
+    _, _, bucket, prefix = target.split("/", 3)
+    s3 = s3_client(bucket)
+
+    paginator = s3.get_paginator("list_objects_v2")
+
+    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+        if "Contents" in page:
+            objects = deepcopy(page["Contents"])
+            if batch:
+                yield objects
+            else:
+                yield from objects
+
+
+def _delete_folder(target):
+    _, _, bucket, _ = target.split("/", 3)
+    s3 = s3_client(bucket)
+
+    total = 0
+    for batch in _list_objects(target, batch=True):
+        LOGGER.info(f"Deleting {len(batch):,} objects from {target}")
+        s3.delete_objects(Bucket=bucket, Delete={"Objects": [{"Key": o["Key"]} for o in batch]})
+        total += len(batch)
+        LOGGER.info(f"Deleted {len(batch):,} objects (total={total:,})")
+
+
+def _delete_file(target):
+    from botocore.exceptions import ClientError
+
+    _, _, bucket, key = target.split("/", 3)
+    s3 = s3_client(bucket)
+
+    try:
+        s3.head_object(Bucket=bucket, Key=key)
+        exits = True
+    except ClientError as e:
+        if e.response["Error"]["Code"] != "404":
+            raise
+        exits = False
+
+    if not exits:
+        LOGGER.warning(f"{target} does not exist. Did you mean to delete a folder? Then add a trailing '/'")
+        return
+
+    LOGGER.info(f"Deleting {target}")
+    print(s3.delete_object(Bucket=bucket, Key=key))
+    LOGGER.info(f"{target} is deleted")
+
+
+def delete(target):
+    """Delete a file or a folder from S3.
+
+    Parameters
+    ----------
+    target : str
+        The URL of a file or a folder on S3. The url should start with 's3://'. If the URL ends with a '/' it is
+        assumed to be a folder, otherwise it is assumed to be a file.
+    """
+
+    assert target.startswith("s3://")
+
+    if target.endswith("/"):
+        _delete_folder(target)
+    else:
+        _delete_file(target)
+
+
+def list_folder(folder):
+    """List the sub folders in a folder on S3.
+
+    Parameters
+    ----------
+    folder : str
+        The URL of a folder on S3. The url should start with 's3://'.
+
+    Returns
+    -------
+    list
+        A list of the subfolders names in the folder.
+    """
+
+    assert folder.startswith("s3://")
+    if not folder.endswith("/"):
+        folder += "/"
+
+    _, _, bucket, prefix = folder.split("/", 3)
+
+    s3 = s3_client(bucket)
+    paginator = s3.get_paginator("list_objects_v2")
+
+    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
+        if "CommonPrefixes" in page:
+            yield from [folder + _["Prefix"] for _ in page.get("CommonPrefixes")]
+
+
+def object_info(target):
+    """Get information about an object on S3.
+
+    Parameters
+    ----------
+    target : str
+        The URL of a file or a folder on S3. The url should start with 's3://'.
+
+    Returns
+    -------
+    dict
+        A dictionary with information about the object.
+    """
+
+    _, _, bucket, key = target.split("/", 3)
+    s3 = s3_client(bucket)
+
+    try:
+        return s3.head_object(Bucket=bucket, Key=key)
+    except s3.exceptions.ClientError as e:
+        if e.response["Error"]["Code"] == "404":
+            raise ValueError(f"{target} does not exist")
+        raise
+
+
+def object_acl(target):
+    """Get information about an object's ACL on S3.
+
+    Parameters
+    ----------
+    target : str
+        The URL of a file or a folder on S3. The url should start with 's3://'.
+
+    Returns
+    -------
+    dict
+        A dictionary with information about the object's ACL.
+    """
+
+    _, _, bucket, key = target.split("/", 3)
+    s3 = s3_client()
+
+    return s3.get_object_acl(Bucket=bucket, Key=key)
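The new s3.py above rewrites the 0.3.4 module around a Transfer base class with Upload/Download subclasses: folder transfers fan out over a thread pool, progress is reported with tqdm, and per-bucket clients are configured from the optional [object-storage] section of ~/.anemoi.toml. A hypothetical usage sketch, assuming credentials are set up as in the module docstring (bucket and paths are invented)::

    from anemoi.utils.s3 import delete, download, list_folder, upload

    # upload() treats a local directory as a folder; resume=True skips
    # objects that already exist remotely with a matching size.
    upload("/data/checkpoints", "s3://my-bucket/checkpoints/", resume=True, threads=4)

    # For download() and delete(), a trailing '/' marks an S3 folder.
    download("s3://my-bucket/checkpoints/", "/tmp/checkpoints", resume=True, threads=4)

    for sub in list_folder("s3://my-bucket/checkpoints"):
        print(sub)  # yields sub-folder URLs

    delete("s3://my-bucket/checkpoints/old-run/")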
{anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/src/anemoi_utils.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: anemoi-utils
-Version: 0.3.4
+Version: 0.3.6
 Summary: A package to hold various functions to support training of ML models on ECMWF data.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License: Apache License
{anemoi_utils-0.3.4 → anemoi_utils-0.3.6}/tests/test_utils.py
@@ -12,7 +12,7 @@ from anemoi.utils.grib import shortname_to_paramid
 
 
 def test_dotdict():
-    d = DotDict(a=1, b=2, c=dict(d=3, e=4))
+    d = DotDict(a=1, b=2, c=dict(d=3, e=4), e=[1, dict(a=3), 3])
     assert d.a == 1
     assert d.b == 2
     assert d.c.d == 3
@@ -27,6 +27,8 @@ def test_dotdict():
     d.d.x = 6
     assert d.d.x == 6
 
+    assert d.e[1].a == 3
+
 
 def test_grib():
     assert shortname_to_paramid("2t") == 167
anemoi_utils-0.3.4/src/anemoi/utils/s3.py (file removed)
@@ -1,280 +0,0 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
-# This software is licensed under the terms of the Apache Licence Version 2.0
-# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
-# In applying this licence, ECMWF does not waive the privileges and immunities
-# granted to it by virtue of its status as an intergovernmental organisation
-# nor does it submit to any jurisdiction.
-
-"""This module provides functions to upload, download, list and delete files and folders on S3.
-The functions of this package expect that the AWS credentials are set up in the environment,
-typically by setting the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables or
-by creating a `~/.aws/credentials` file. It is also possible to set the `endpoint_url` in the same file
-to use a different S3 compatible service::
-
-    [default]
-    endpoint_url = https://some-storage.somewhere.world
-    aws_access_key_id = xxxxxxxxxxxxxxxxxxxxxxxx
-    aws_secret_access_key = xxxxxxxxxxxxxxxxxxxxxxxx
-
-"""
-
-import logging
-import os
-import threading
-from concurrent.futures import ThreadPoolExecutor
-from contextlib import closing
-
-import tqdm
-
-LOGGER = logging.getLogger(__name__)
-
-
-# s3_clients are not thread-safe, so we need to create a new client for each thread
-
-thread_local = threading.local()
-
-
-def _s3_client():
-    import boto3
-
-    if not hasattr(thread_local, "s3_client"):
-        thread_local.s3_client = boto3.client("s3")
-    return thread_local.s3_client
-
-
-def _upload_file(source, target, overwrite=False, ignore_existing=False):
-    from botocore.exceptions import ClientError
-
-    assert target.startswith("s3://")
-
-    _, _, bucket, key = target.split("/", 3)
-
-    LOGGER.info(f"Uploading {source} to {target}")
-    s3_client = _s3_client()
-
-    size = os.path.getsize(source)
-    try:
-        results = s3_client.head_object(Bucket=bucket, Key=key)
-        remote_size = int(results["ContentLength"])
-    except ClientError as e:
-        if e.response["Error"]["Code"] != "404":
-            raise
-        remote_size = None
-
-    if remote_size is not None:
-        if remote_size != size:
-            LOGGER.warning(f"{target} already exists, but with different size, re-uploading")
-            overwrite = True
-
-        if ignore_existing:
-            LOGGER.info(f"{target} already exists, skipping")
-            return
-
-    if remote_size is not None and not overwrite:
-        raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'ignore_existing' to skip")
-
-    with closing(tqdm.tqdm(total=size, unit="B", unit_scale=True, leave=False)) as t:
-        s3_client.upload_file(source, bucket, key, Callback=lambda x: t.update(x))
-
-
-def _local_file_list(source):
-    for root, _, files in os.walk(source):
-        for file in files:
-            yield os.path.join(root, file)
-
-
-def _upload_folder(source, target, overwrite=False, ignore_existing=False, threads=1):
-    total = sum(1 for _ in _local_file_list(source))
-
-    with ThreadPoolExecutor(max_workers=threads) as executor:
-        futures = []
-        for local_path in _local_file_list(source):
-            relative_path = os.path.relpath(local_path, source)
-            s3_path = os.path.join(target, relative_path)
-            futures.append(executor.submit(_upload_file, local_path, s3_path, overwrite, ignore_existing))
-
-        for future in tqdm.tqdm(futures, total=total):
-            future.result()
-
-
-def upload(source, target, overwrite=False, ignore_existing=False, threads=1, show_progress=True):
-    """Upload a file or a folder to S3.
-
-    Parameters
-    ----------
-    source : str
-        A path to a file or a folder to upload.
-    target : str
-        A URL to a file or a folder on S3. The url should start with 's3://'.
-    overwrite : bool, optional
-        If the data is already on S3 it will be overwritten, by default False
-    ignore_existing : bool, optional
-        If the data is already on S3 it will not be uploaded, unless the remote file
-        has a different size, by default False
-    threads : int, optional
-        The number of threads to use when uploading a directory, by default 1
-    """
-    if os.path.isdir(source):
-        _upload_folder(source, target, overwrite, ignore_existing, threads)
-    else:
-        _upload_file(source, target, overwrite, ignore_existing)
-
-
-def _download_file(source, target, overwrite=False, ignore_existing=False):
-    s3_client = _s3_client()
-    _, _, bucket, key = source.split("/", 3)
-
-    response = s3_client.head_object(Bucket=bucket, Key=key)
-    size = int(response["ContentLength"])
-
-    if os.path.exists(target):
-
-        if os.path.exists(target) and os.path.getsize(target) != size:
-            LOGGER.info(f"{target} already with different size, re-downloading")
-            overwrite = True
-
-        if not overwrite and not ignore_existing:
-            raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'ignore_existing' to skip")
-
-        if ignore_existing:
-            LOGGER.info(f"{target} already exists, skipping")
-            return
-
-    with closing(tqdm.tqdm(total=size, unit="B", unit_scale=True, leave=False)) as t:
-        s3_client.download_file(bucket, key, target, Callback=lambda x: t.update(x))
-
-
-def _download_folder(source, target, overwrite=False, ignore_existing=False, threads=1):
-    source = source.rstrip("/")
-    _, _, bucket, folder = source.split("/", 3)
-    total = _count_objects_in_folder(source)
-
-    with ThreadPoolExecutor(max_workers=threads) as executor:
-        futures = []
-        for o in _list_folder(source):
-            name = o["Key"]
-            local_path = os.path.join(target, os.path.relpath(name, folder))
-            os.makedirs(os.path.dirname(local_path), exist_ok=True)
-            futures.append(
-                executor.submit(_download_file, f"s3://{bucket}/{name}", local_path, overwrite, ignore_existing)
-            )
-
-        for future in tqdm.tqdm(futures, total=total):
-            future.result()
-
-
-def download(source, target, overwrite=False, ignore_existing=False, threads=1, show_progress=True):
-    """Download a file or a folder from S3.
-
-    Parameters
-    ----------
-    source : str
-        The URL of a file or a folder on S3. The url should start with 's3://'. If the URL ends with a '/' it is
-        assumed to be a folder, otherwise it is assumed to be a file.
-    target : str
-        The local path where the file or folder will be downloaded.
-    overwrite : bool, optional
-        If false, files which have already been downloaded will be skipped, unless their size
-        does not match their size on S3, by default False
-    ignore_existing : bool, optional
-        If the data is already on local it will not be downloaded, unless the remote file
-        has a different size, by default False
-    threads : int, optional
-        The number of threads to use when downloading a directory, by default 1
-    """
-    assert source.startswith("s3://")
-
-    if source.endswith("/"):
-        _download_folder(source, target, overwrite, ignore_existing, threads)
-    else:
-        _download_file(source, target, overwrite, ignore_existing)
-
-
-def _list_folder(target, batch=False):
-    s3_client = _s3_client()
-    _, _, bucket, prefix = target.split("/", 3)
-
-    paginator = s3_client.get_paginator("list_objects_v2")
-    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
-        if "Contents" in page:
-            objects = [{"Key": obj["Key"]} for obj in page["Contents"]]
-            if batch:
-                yield objects
-            else:
-                yield from objects
-
-
-def _count_objects_in_folder(target):
-    return sum(len(_) for _ in _list_folder(target, batch=True))
-
-
-def _delete_folder(target, threads):
-    s3_client = _s3_client()
-    _, _, bucket, _ = target.split("/", 3)
-
-    for batch in _list_folder(target, batch=True):
-        s3_client.delete_objects(Bucket=bucket, Delete={"Objects": batch})
-        LOGGER.info(f"Deleted {len(batch)} objects")
-
-
-def _delete_file(target):
-    s3_client = _s3_client()
-    _, _, bucket, key = target.split("/", 3)
-
-    LOGGER.info(f"Deleting {target}")
-    s3_client.delete_object(Bucket=bucket, Key=key)
-    LOGGER.info(f"{target} is deleted")
-
-
-def delete(target):
-    """Delete a file or a folder from S3.
-
-    Parameters
-    ----------
-    target : str
-        The URL of a file or a folder on S3. The url should start with 's3://'. If the URL ends with a '/' it is
-        assumed to be a folder, otherwise it is assumed to be a file.
-    """
-
-    assert target.startswith("s3://")
-
-    if target.endswith("/"):
-        _delete_folder(target)
-    else:
-        _delete_file(target)
-
-
-def list_folder(target):
-    """List the objects in a folder on S3.
-
-    Parameters
-    ----------
-    target : str
-        The URL of a folder on S3. The url should start with 's3://'.
-
-    Returns
-    -------
-    list
-        A list of the objects names in the folder.
-    """
-
-    assert target.startswith("s3://")
-    return [o["Key"] for o in _list_folder(target)]
-
-
-def count_objects_in_folder(target):
-    """Count the objects in a folder on S3.
-
-    Parameters
-    ----------
-    target : str
-        The URL of a folder on S3. The url should start with 's3://'.
-
-    Returns
-    -------
-    int
-        The number of objects in the folder.
-    """
-
-    assert target.startswith("s3://")
-    return _count_objects_in_folder(target)
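Note the API break relative to this removed 0.3.4 module: ignore_existing is replaced by resume, the upload()/download() options become keyword-only, show_progress is dropped in favour of verbosity, and list_folder() now yields sub-folder URLs instead of returning object keys (count_objects_in_folder() has no replacement). A hypothetical migration example (bucket and paths are invented)::

    # anemoi-utils 0.3.4
    # upload("/data/run", "s3://bucket/run/", ignore_existing=True, threads=4)

    # anemoi-utils 0.3.6
    upload("/data/run", "s3://bucket/run/", resume=True, threads=4)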