anemoi-utils 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of anemoi-utils might be problematic. Click here for more details.

Files changed (52) hide show
  1. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/PKG-INFO +1 -1
  2. anemoi_utils-0.3.4/docs/modules/s3.rst +8 -0
  3. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/_version.py +2 -2
  4. anemoi_utils-0.3.4/src/anemoi/utils/s3.py +280 -0
  5. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/text.py +6 -7
  6. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi_utils.egg-info/PKG-INFO +1 -1
  7. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi_utils.egg-info/SOURCES.txt +1 -0
  8. anemoi_utils-0.3.2/src/anemoi/utils/s3.py +0 -57
  9. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/.github/workflows/python-publish.yml +0 -0
  10. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/.gitignore +0 -0
  11. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/.pre-commit-config.yaml +0 -0
  12. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/.readthedocs.yaml +0 -0
  13. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/LICENSE +0 -0
  14. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/README.md +0 -0
  15. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/Makefile +0 -0
  16. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/_static/logo.png +0 -0
  17. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/_static/style.css +0 -0
  18. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/_templates/.gitkeep +0 -0
  19. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/conf.py +0 -0
  20. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/index.rst +0 -0
  21. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/installing.rst +0 -0
  22. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/modules/checkpoints.rst +0 -0
  23. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/modules/config.rst +0 -0
  24. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/modules/dates.rst +0 -0
  25. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/modules/grib.rst +0 -0
  26. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/modules/humanize.rst +0 -0
  27. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/modules/provenance.rst +0 -0
  28. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/modules/text.rst +0 -0
  29. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/docs/requirements.txt +0 -0
  30. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/pyproject.toml +0 -0
  31. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/setup.cfg +0 -0
  32. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/__init__.py +0 -0
  33. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/__main__.py +0 -0
  34. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/caching.py +0 -0
  35. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/checkpoints.py +0 -0
  36. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/cli.py +0 -0
  37. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/commands/__init__.py +0 -0
  38. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/commands/checkpoint.py +0 -0
  39. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/config.py +0 -0
  40. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/dates.py +0 -0
  41. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/grib.py +0 -0
  42. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/humanize.py +0 -0
  43. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/mars/__init__.py +0 -0
  44. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/mars/mars.yaml +0 -0
  45. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/provenance.py +0 -0
  46. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi/utils/timer.py +0 -0
  47. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi_utils.egg-info/dependency_links.txt +0 -0
  48. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi_utils.egg-info/entry_points.txt +0 -0
  49. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi_utils.egg-info/requires.txt +0 -0
  50. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/src/anemoi_utils.egg-info/top_level.txt +0 -0
  51. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/tests/test_dates.py +0 -0
  52. {anemoi_utils-0.3.2 → anemoi_utils-0.3.4}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: anemoi-utils
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: A package to hold various functions to support training of ML models on ECMWF data.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License: Apache License
@@ -0,0 +1,8 @@
1
+ ####
2
+ s3
3
+ ####
4
+
5
+ .. automodule:: anemoi.utils.s3
6
+ :members:
7
+ :no-undoc-members:
8
+ :show-inheritance:
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.3.2'
16
- __version_tuple__ = version_tuple = (0, 3, 2)
15
+ __version__ = version = '0.3.4'
16
+ __version_tuple__ = version_tuple = (0, 3, 4)
@@ -0,0 +1,280 @@
1
+ # (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
2
+ # This software is licensed under the terms of the Apache Licence Version 2.0
3
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
4
+ # In applying this licence, ECMWF does not waive the privileges and immunities
5
+ # granted to it by virtue of its status as an intergovernmental organisation
6
+ # nor does it submit to any jurisdiction.
7
+
8
+ """This module provides functions to upload, download, list and delete files and folders on S3.
9
+ The functions of this package expect that the AWS credentials are set up in the environment
10
+ typically by setting the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables or
11
+ by creating a `~/.aws/credentials` file. It is also possible to set the `endpoint_url` in the same file
12
+ to use a different S3 compatible service::
13
+
14
+ [default]
15
+ endpoint_url = https://some-storage.somewhere.world
16
+ aws_access_key_id = xxxxxxxxxxxxxxxxxxxxxxxx
17
+ aws_secret_access_key = xxxxxxxxxxxxxxxxxxxxxxxx
18
+
19
+ """
20
+
21
+ import logging
22
+ import os
23
+ import threading
24
+ from concurrent.futures import ThreadPoolExecutor
25
+ from contextlib import closing
26
+
27
+ import tqdm
28
+
29
+ LOGGER = logging.getLogger(__name__)
30
+
31
+
32
+ # s3_clients are not thread-safe, so we need to create a new client for each thread
33
+
34
+ thread_local = threading.local()
35
+
36
+
37
def _s3_client():
    """Return a boto3 S3 client bound to the current thread.

    boto3 clients are not thread-safe, so each thread lazily creates its own
    client and caches it on the module-level ``thread_local`` object.
    """
    import boto3

    client = getattr(thread_local, "s3_client", None)
    if client is None:
        client = boto3.client("s3")
        thread_local.s3_client = client
    return client
43
+
44
+
45
def _upload_file(source, target, overwrite=False, ignore_existing=False):
    """Upload a single local file to S3.

    Parameters
    ----------
    source : str
        Path of the local file to upload.
    target : str
        Destination URL on S3; must start with 's3://'.
    overwrite : bool, optional
        Replace the remote object even if it already exists, by default False.
    ignore_existing : bool, optional
        Skip the upload if the remote object already exists *with the same
        size*; a size mismatch still triggers a re-upload, by default False.

    Raises
    ------
    ValueError
        If the target already exists and neither flag allows replacing or
        skipping it.
    """
    from botocore.exceptions import ClientError

    assert target.startswith("s3://")

    _, _, bucket, key = target.split("/", 3)

    LOGGER.info(f"Uploading {source} to {target}")
    s3_client = _s3_client()

    size = os.path.getsize(source)
    try:
        results = s3_client.head_object(Bucket=bucket, Key=key)
        remote_size = int(results["ContentLength"])
    except ClientError as e:
        # A 404 simply means the object does not exist yet; anything else
        # (permissions, networking, ...) must propagate.
        if e.response["Error"]["Code"] != "404":
            raise
        remote_size = None

    if remote_size is not None:
        if remote_size != size:
            # The remote copy is stale or truncated: re-upload it even when
            # the caller asked to ignore existing objects. (Previously the
            # ignore_existing check ran after this point and skipped the
            # re-upload, contradicting the documented behavior.)
            LOGGER.warning(f"{target} already exists, but with different size, re-uploading")
        elif ignore_existing:
            LOGGER.info(f"{target} already exists, skipping")
            return
        elif not overwrite:
            raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'ignore_existing' to skip")

    with closing(tqdm.tqdm(total=size, unit="B", unit_scale=True, leave=False)) as t:
        s3_client.upload_file(source, bucket, key, Callback=lambda x: t.update(x))
78
+
79
+
80
+ def _local_file_list(source):
81
+ for root, _, files in os.walk(source):
82
+ for file in files:
83
+ yield os.path.join(root, file)
84
+
85
+
86
def _upload_folder(source, target, overwrite=False, ignore_existing=False, threads=1):
    """Upload every file found under *source* to the matching key below *target*.

    Uploads are fanned out over a pool of *threads* workers; overall progress
    is reported with a tqdm bar as the futures complete.
    """
    file_count = sum(1 for _ in _local_file_list(source))

    with ThreadPoolExecutor(max_workers=threads) as pool:
        tasks = []
        for path in _local_file_list(source):
            relative = os.path.relpath(path, source)
            destination = os.path.join(target, relative)
            tasks.append(pool.submit(_upload_file, path, destination, overwrite, ignore_existing))

        # Waiting on each future re-raises any exception from the workers.
        for task in tqdm.tqdm(tasks, total=file_count):
            task.result()
98
+
99
+
100
def upload(source, target, overwrite=False, ignore_existing=False, threads=1, show_progress=True):
    """Upload a file or a folder to S3.

    Parameters
    ----------
    source : str
        Path of the local file or folder to upload.
    target : str
        Destination URL on S3. The url should start with 's3://'.
    overwrite : bool, optional
        Replace data that is already on S3, by default False.
    ignore_existing : bool, optional
        Skip files that already exist on S3, unless the remote copy has a
        different size, by default False.
    threads : int, optional
        Number of worker threads used when uploading a directory, by default 1.
    show_progress : bool, optional
        Present for API compatibility; a progress bar is currently always
        shown regardless of this flag.
    """
    if not os.path.isdir(source):
        _upload_file(source, target, overwrite=overwrite, ignore_existing=ignore_existing)
        return
    _upload_folder(source, target, overwrite=overwrite, ignore_existing=ignore_existing, threads=threads)
121
+
122
+
123
def _download_file(source, target, overwrite=False, ignore_existing=False):
    """Download a single S3 object to the local path *target*.

    Parameters
    ----------
    source : str
        Object URL on S3; expected to start with 's3://'.
    target : str
        Local destination path.
    overwrite : bool, optional
        Replace an existing local file even if it already exists, by default False.
    ignore_existing : bool, optional
        Skip the download if the local file already exists *with the same
        size*; a size mismatch still triggers a re-download, by default False.

    Raises
    ------
    ValueError
        If the target already exists and neither flag allows replacing or
        skipping it.
    """
    s3_client = _s3_client()
    _, _, bucket, key = source.split("/", 3)

    # Fetch the remote size up front: it drives the progress bar and lets us
    # detect partially-downloaded local copies.
    response = s3_client.head_object(Bucket=bucket, Key=key)
    size = int(response["ContentLength"])

    if os.path.exists(target):
        if os.path.getsize(target) != size:
            # Local copy is incomplete or stale: always re-download it.
            # (Previously the ignore_existing check ran after this point and
            # skipped the re-download, contradicting the documented behavior;
            # the redundant second os.path.exists() check is also gone.)
            LOGGER.info(f"{target} already with different size, re-downloading")
        elif ignore_existing:
            LOGGER.info(f"{target} already exists, skipping")
            return
        elif not overwrite:
            raise ValueError(f"{target} already exists, use 'overwrite' to replace or 'ignore_existing' to skip")

    with closing(tqdm.tqdm(total=size, unit="B", unit_scale=True, leave=False)) as t:
        s3_client.download_file(bucket, key, target, Callback=lambda x: t.update(x))
145
+
146
+
147
def _download_folder(source, target, overwrite=False, ignore_existing=False, threads=1):
    """Download every object below the S3 folder *source* into the local directory *target*.

    The relative layout of the keys under the source prefix is mirrored on
    disk; downloads run on a pool of *threads* workers.
    """
    source = source.rstrip("/")
    _, _, bucket, folder = source.split("/", 3)
    expected = _count_objects_in_folder(source)

    with ThreadPoolExecutor(max_workers=threads) as pool:
        tasks = []
        for entry in _list_folder(source):
            key = entry["Key"]
            destination = os.path.join(target, os.path.relpath(key, folder))
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            tasks.append(
                pool.submit(_download_file, f"s3://{bucket}/{key}", destination, overwrite, ignore_existing)
            )

        # Waiting on each future re-raises any exception from the workers.
        for task in tqdm.tqdm(tasks, total=expected):
            task.result()
164
+
165
+
166
def download(source, target, overwrite=False, ignore_existing=False, threads=1, show_progress=True):
    """Download a file or a folder from S3.

    Parameters
    ----------
    source : str
        The URL of a file or a folder on S3. The url should start with 's3://'.
        A trailing '/' marks a folder; otherwise it is treated as a single file.
    target : str
        The local path where the file or folder will be downloaded.
    overwrite : bool, optional
        Replace local files that already exist, by default False.
    ignore_existing : bool, optional
        Skip files that already exist locally, unless the remote copy has a
        different size, by default False.
    threads : int, optional
        Number of worker threads used when downloading a directory, by default 1.
    show_progress : bool, optional
        Present for API compatibility; a progress bar is currently always
        shown regardless of this flag.
    """
    assert source.startswith("s3://")

    if not source.endswith("/"):
        _download_file(source, target, overwrite=overwrite, ignore_existing=ignore_existing)
        return
    _download_folder(source, target, overwrite=overwrite, ignore_existing=ignore_existing, threads=threads)
191
+
192
+
193
def _list_folder(target, batch=False):
    """Iterate over the objects stored under the S3 prefix *target*.

    When *batch* is true, yields one list of ``{"Key": ...}`` dicts per
    listing page; otherwise yields the dicts one at a time.
    """
    s3_client = _s3_client()
    _, _, bucket, prefix = target.split("/", 3)

    pages = s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        # Pages past the end of an empty prefix carry no "Contents" key.
        if "Contents" not in page:
            continue
        entries = [{"Key": item["Key"]} for item in page["Contents"]]
        if batch:
            yield entries
        else:
            yield from entries
205
+
206
+
207
def _count_objects_in_folder(target):
    """Return the number of objects stored under the S3 prefix *target*."""
    total = 0
    for page in _list_folder(target, batch=True):
        total += len(page)
    return total
209
+
210
+
211
def _delete_folder(target, threads=1):
    """Delete every object below the S3 folder *target*, one listing page at a time.

    Parameters
    ----------
    target : str
        The URL of the folder, starting with 's3://'.
    threads : int, optional
        Accepted for API symmetry with the other folder helpers; deletion is
        currently performed sequentially, by default 1.

    Bug fix: ``threads`` was a required parameter, but ``delete()`` calls this
    helper with a single argument, which raised a TypeError for every folder
    deletion. Giving it a default restores the call while keeping the
    signature backward-compatible.
    """
    s3_client = _s3_client()
    _, _, bucket, _ = target.split("/", 3)

    for batch in _list_folder(target, batch=True):
        s3_client.delete_objects(Bucket=bucket, Delete={"Objects": batch})
        LOGGER.info(f"Deleted {len(batch)} objects")
218
+
219
+
220
def _delete_file(target):
    """Delete a single object from S3 given its full 's3://' URL."""
    client = _s3_client()
    _, _, bucket_name, object_key = target.split("/", 3)

    LOGGER.info(f"Deleting {target}")
    client.delete_object(Bucket=bucket_name, Key=object_key)
    LOGGER.info(f"{target} is deleted")
227
+
228
+
229
def delete(target):
    """Delete a file or a folder from S3.

    Parameters
    ----------
    target : str
        The URL of a file or a folder on S3. The url should start with 's3://'.
        A trailing '/' marks a folder; otherwise it is treated as a single file.
    """

    assert target.startswith("s3://")

    # Dispatch on the trailing slash: folders are deleted page by page,
    # single objects directly.
    handler = _delete_folder if target.endswith("/") else _delete_file
    handler(target)
245
+
246
+
247
def list_folder(target):
    """List the objects in a folder on S3.

    Parameters
    ----------
    target : str
        The URL of a folder on S3. The url should start with 's3://'.

    Returns
    -------
    list
        The names (keys) of the objects in the folder.
    """

    assert target.startswith("s3://")
    keys = []
    for entry in _list_folder(target):
        keys.append(entry["Key"])
    return keys
263
+
264
+
265
def count_objects_in_folder(target):
    """Count the objects in a folder on S3.

    Parameters
    ----------
    target : str
        The URL of a folder on S3. The url should start with 's3://'.

    Returns
    -------
    int
        How many objects live under the folder.
    """

    assert target.startswith("s3://")

    # Delegate to the private helper, which counts page by page.
    return _count_objects_in_folder(target)
@@ -7,7 +7,6 @@
7
7
 
8
8
  """Text utilities"""
9
9
 
10
- import sys
11
10
  from collections import defaultdict
12
11
 
13
12
  # https://en.wikipedia.org/wiki/Box-drawing_character
@@ -143,13 +142,13 @@ class Tree:
143
142
  self._kids.append(node)
144
143
  return node
145
144
 
146
- def print(self, file=sys.stdout):
145
+ def print(self):
147
146
  padding = []
148
147
 
149
148
  while self._factorise():
150
149
  pass
151
150
 
152
- self._print(padding, file=file)
151
+ self._print(padding)
153
152
 
154
153
  def _leaves(self, result):
155
154
  if self.is_leaf:
@@ -206,21 +205,21 @@ class Tree:
206
205
 
207
206
  return result
208
207
 
209
- def _print(self, padding, file=sys.stdout):
208
+ def _print(self, padding):
210
209
  for i, p in enumerate(padding[:-1]):
211
210
  if p == " └":
212
211
  padding[i] = " "
213
212
  if p == " ├":
214
213
  padding[i] = " │"
215
214
  if padding:
216
- print(f"{''.join(padding)}─{self._text}", file=file)
215
+ print(f"{''.join(padding)}─{self._text}")
217
216
  else:
218
- print(self._text, file=file)
217
+ print(self._text)
219
218
  padding.append(" ")
220
219
  for i, k in enumerate(self._kids):
221
220
  sep = " ├" if i < len(self._kids) - 1 else " └"
222
221
  padding[-1] = sep
223
- k._print(padding, file=file)
222
+ k._print(padding)
224
223
 
225
224
  padding.pop()
226
225
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: anemoi-utils
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: A package to hold various functions to support training of ML models on ECMWF data.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License: Apache License
@@ -19,6 +19,7 @@ docs/modules/dates.rst
19
19
  docs/modules/grib.rst
20
20
  docs/modules/humanize.rst
21
21
  docs/modules/provenance.rst
22
+ docs/modules/s3.rst
22
23
  docs/modules/text.rst
23
24
  src/anemoi/utils/__init__.py
24
25
  src/anemoi/utils/__main__.py
@@ -1,57 +0,0 @@
1
- # (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
2
- # This software is licensed under the terms of the Apache Licence Version 2.0
3
- # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
4
- # In applying this licence, ECMWF does not waive the privileges and immunities
5
- # granted to it by virtue of its status as an intergovernmental organisation
6
- # nor does it submit to any jurisdiction.
7
- import logging
8
- import os
9
- from contextlib import closing
10
-
11
- import boto3
12
- import tqdm
13
-
14
- LOG = logging.getLogger(__name__)
15
-
16
-
17
- def upload(source, target, overwrite=False, ignore_existing=False):
18
- # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html
19
- assert target.startswith("s3://")
20
-
21
- _, _, bucket, key = target.split("/", 3)
22
-
23
- LOG.info(f"Uploading {source} to {target}")
24
- s3_client = boto3.client("s3")
25
-
26
- if not overwrite:
27
- results = s3_client.list_objects(Bucket=bucket, Prefix=key)
28
- if results.get("Contents"):
29
- if ignore_existing:
30
- LOG.info(f"{target} already exists, skipping")
31
- return
32
- else:
33
- raise ValueError(f"{target} already exists, use --overwrite to replace")
34
-
35
- size = os.path.getsize(source)
36
- with closing(tqdm.tqdm(total=size, unit="B", unit_scale=True)) as t:
37
- s3_client.upload_file(source, bucket, key, Callback=lambda x: t.update(x))
38
-
39
- LOG.info(f"{target} is ready")
40
-
41
-
42
- def download(source, target, overwrite=False):
43
- assert source.startswith("s3://")
44
-
45
- _, _, bucket, key = source.split("/", 3)
46
-
47
- s3 = boto3.client("s3")
48
- response = s3.head_object(Bucket=bucket, Key=key)
49
- size = response["ContentLength"]
50
-
51
- if not overwrite:
52
- if os.path.exists(source) and os.path.getsize(source) == size:
53
- LOG.info(f"{source} already exists, skipping")
54
- return
55
-
56
- with closing(tqdm.tqdm(total=size, unit="B", unit_scale=True)) as t:
57
- s3.download_file(bucket, key, target, Callback=lambda x: t.update(x))
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes