anemoi-datasets 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,64 +8,20 @@
 # nor does it submit to any jurisdiction.
 #
 
-
-import argparse
-import logging
-import sys
-import traceback
+from anemoi.utils.cli import cli_main
+from anemoi.utils.cli import make_parser
 
 from . import __version__
 from .commands import COMMANDS
 
-LOG = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
-
-    parser.add_argument(
-        "--version",
-        "-V",
-        action="store_true",
-        help="show the version and exit",
-    )
-    parser.add_argument(
-        "--debug",
-        "-d",
-        action="store_true",
-        help="Debug mode",
-    )
 
-    subparsers = parser.add_subparsers(help="commands:", dest="command")
-    for name, command in COMMANDS.items():
-        command_parser = subparsers.add_parser(name, help=command.__doc__)
-        command.add_arguments(command_parser)
+# For read-the-docs
+def create_parser():
+    return make_parser(__doc__, COMMANDS)
 
-    args = parser.parse_args()
 
-    if args.version:
-        print(__version__)
-        return
-
-    if args.command is None:
-        parser.print_help()
-        return
-
-    cmd = COMMANDS[args.command]
-
-    logging.basicConfig(
-        format="%(asctime)s %(levelname)s %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-        level=logging.DEBUG if args.debug else logging.INFO,
-    )
-
-    try:
-        cmd.run(args)
-    except ValueError as e:
-        traceback.print_exc()
-        LOG.error("\n💣 %s", str(e).lstrip())
-        LOG.error("💣 Exiting")
-        sys.exit(1)
+def main():
+    cli_main(__version__, __doc__, COMMANDS)
 
 
 if __name__ == "__main__":
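
The hunk above (anemoi/datasets/__main__.py, judging by the `if __name__ == "__main__":` guard and the sha256 change for that file in the RECORD diff at the end) replaces hand-rolled argparse boilerplate with two helpers from anemoi-utils; the new sphinx-argparse dependency in the METADATA diff below presumably consumes create_parser() for the documentation. A minimal, self-contained sketch of what the resulting entry point does, assuming make_parser builds one sub-parser per command and cli_main parses and dispatches — the removed code above is the best guide to these helpers, whose real implementations live in anemoi.utils.cli:

    # Hypothetical standalone equivalent of the new entry point.
    import argparse
    import sys


    def make_parser(doc, commands):
        # One sub-parser per registered command, mirroring the removed code.
        parser = argparse.ArgumentParser(description=doc)
        parser.add_argument("--version", "-V", action="store_true")
        subparsers = parser.add_subparsers(dest="command")
        for name, command in commands.items():
            command.add_arguments(subparsers.add_parser(name, help=command.__doc__))
        return parser


    def cli_main(version, doc, commands):
        args = make_parser(doc, commands).parse_args()
        if args.version:
            print(version)
            return
        if args.command is None:
            sys.exit("No command given")
        commands[args.command].run(args)
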
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.2.1'
-__version_tuple__ = version_tuple = (0, 2, 1)
+__version__ = version = '0.3.1'
+__version_tuple__ = version_tuple = (0, 3, 1)
@@ -8,69 +8,15 @@
 # nor does it submit to any jurisdiction.
 #
 
-import argparse
-import importlib
-import logging
 import os
-import sys
 
-LOG = logging.getLogger(__name__)
+from anemoi.utils.cli import Command
+from anemoi.utils.cli import Failed
+from anemoi.utils.cli import register_commands
 
+__all__ = ["Command"]
 
-def register(here, package, select, fail=None):
-    result = {}
-    not_available = {}
-
-    for p in os.listdir(here):
-        full = os.path.join(here, p)
-        if p.startswith("_"):
-            continue
-        if not (p.endswith(".py") or (os.path.isdir(full) and os.path.exists(os.path.join(full, "__init__.py")))):
-            continue
-
-        name, _ = os.path.splitext(p)
-
-        try:
-            imported = importlib.import_module(
-                f".{name}",
-                package=package,
-            )
-        except ImportError as e:
-            not_available[name] = e
-            continue
-
-        obj = select(imported)
-        if obj is not None:
-            result[name] = obj
-
-    for name, e in not_available.items():
-        if fail is None:
-            pass
-        if callable(fail):
-            result[name] = fail(name, e)
-
-    return result
-
-
-class Command:
-    def run(self, args):
-        raise NotImplementedError(f"Command not implemented: {args.command}")
-
-
-class Failed(Command):
-    def __init__(self, name, error):
-        self.name = name
-        self.error = error
-
-    def add_arguments(self, command_parser):
-        command_parser.add_argument("x", nargs=argparse.REMAINDER)
-
-    def run(self, args):
-        print(f"Command '{self.name}' not available: {self.error}")
-        sys.exit(1)
-
-
-COMMANDS = register(
+COMMANDS = register_commands(
     os.path.dirname(__file__),
     __name__,
     lambda x: x.command(),
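
The commands package keeps the same plugin-discovery contract, but the scaffolding (register, Command, Failed) now comes from anemoi.utils.cli. A hedged sketch of how a command module plugs in, assuming register_commands behaves like the removed register above (import each sibling module, collect module.command(), map import failures to Failed); the Hello command below is made up for illustration:

    # Hypothetical command module, e.g. anemoi/datasets/commands/hello.py
    # (made-up file); register_commands would pick it up via `lambda x: x.command()`.
    from . import Command


    class Hello(Command):
        """Say hello."""

        def add_arguments(self, command_parser):
            command_parser.add_argument("name")

        def run(self, args):
            print(f"Hello, {args.name}")


    command = Hello  # same pattern as `command = Copy` in copy.py below
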
@@ -41,24 +41,19 @@ zinfo https://object-store.os-api.cci1.ecmwf.int/
 """
 
 
-class CopyMixin:
-    internal = True
-    timestamp = True
-
-    def add_arguments(self, command_parser):
-        command_parser.add_argument("--transfers", type=int, default=8)
-        command_parser.add_argument("--block-size", type=int, default=100)
-        command_parser.add_argument("--overwrite", action="store_true")
-        command_parser.add_argument("--progress", action="store_true")
-        command_parser.add_argument("--nested", action="store_true", help="Use ZARR's nested directpry backend.")
-        command_parser.add_argument(
-            "--rechunk",
-            nargs="+",
-            help="Rechunk given array.",
-            metavar="array=i,j,k,l",
-        )
-        command_parser.add_argument("source")
-        command_parser.add_argument("target")
+class Copier:
+    def __init__(self, source, target, transfers, block_size, overwrite, resume, progress, nested, rechunk, **kwargs):
+        self.source = source
+        self.target = target
+        self.transfers = transfers
+        self.block_size = block_size
+        self.overwrite = overwrite
+        self.resume = resume
+        self.progress = progress
+        self.nested = nested
+        self.rechunk = rechunk
+
+        self.rechunking = rechunk.split(",") if rechunk else []
 
     def _store(self, path, nested=False):
         if nested:
@@ -67,30 +62,56 @@ class CopyMixin:
             return zarr.storage.NestedDirectoryStore(path)
         return path
 
-    def copy_chunk(self, n, m, source, target, block_size, _copy, progress):
+    def copy_chunk(self, n, m, source, target, _copy, progress):
         if _copy[n:m].all():
             LOG.info(f"Skipping {n} to {m}")
             return None
 
-        for i in tqdm.tqdm(
-            range(n, m),
-            desc=f"Copying {n} to {m}",
-            leave=False,
-            disable=not isatty and not progress,
-        ):
-            target[i] = source[i]
+        if self.block_size % self.data_chunks[0] == 0:
+            target[slice(n, m)] = source[slice(n, m)]
+        else:
+            LOG.warning(
+                f"Block size ({self.block_size}) is not a multiple of target chunk size ({self.data_chunks[0]}). Slow copy expected."
+            )
+            if self.transfers > 1:
+                # race condition, different threads might copy the same data to the same chunk
+                raise NotImplementedError(
+                    "Block size is not a multiple of target chunk size. Parallel copy not supported."
+                )
+            for i in tqdm.tqdm(
+                range(n, m),
+                desc=f"Copying {n} to {m}",
+                leave=False,
+                disable=not isatty and not progress,
+            ):
+                target[i] = source[i]
+
         return slice(n, m)
 
-    def copy_data(self, source, target, transfers, block_size, _copy, progress, rechunking):
+    def parse_rechunking(self, rechunking, source_data):
+        shape = source_data.shape
+        chunks = list(source_data.chunks)
+        for i, c in enumerate(rechunking):
+            if not c:
+                continue
+            elif c == "full":
+                chunks[i] = shape[i]
+            c = int(c)
+            c = min(c, shape[i])
+            chunks[i] = c
+        chunks = tuple(chunks)
+
+        if chunks != source_data.chunks:
+            LOG.info(f"Rechunking data from {source_data.chunks} to {chunks}")
+            # if self.transfers > 1:
+            #     raise NotImplementedError("Rechunking with multiple transfers is not implemented")
+        return chunks
+
+    def copy_data(self, source, target, _copy, progress):
         LOG.info("Copying data")
         source_data = source["data"]
 
-        chunks = list(source_data.chunks)
-        if "data" in rechunking:
-            assert len(chunks) == len(rechunking["data"]), (chunks, rechunking["data"])
-            for i, c in enumerate(rechunking["data"]):
-                if c != -1:
-                    chunks[i] = c
+        self.data_chunks = self.parse_rechunking(self.rechunking, source_data)
 
         target_data = (
             target["data"]
@@ -98,12 +119,12 @@ class CopyMixin:
             else target.create_dataset(
                 "data",
                 shape=source_data.shape,
-                chunks=chunks,
+                chunks=self.data_chunks,
                 dtype=source_data.dtype,
             )
         )
 
-        executor = ThreadPoolExecutor(max_workers=transfers)
+        executor = ThreadPoolExecutor(max_workers=self.transfers)
         tasks = []
         n = 0
         while n < target_data.shape[0]:
@@ -111,15 +132,14 @@
                 executor.submit(
                     self.copy_chunk,
                     n,
-                    min(n + block_size, target_data.shape[0]),
+                    min(n + self.block_size, target_data.shape[0]),
                     source_data,
                     target_data,
-                    block_size,
                     _copy,
                     progress,
                 )
             )
-            n += block_size
+            n += self.block_size
 
         for future in tqdm.tqdm(as_completed(tasks), total=len(tasks), smoothing=0):
             copied = future.result()
@@ -131,7 +151,7 @@ class CopyMixin:
 
         LOG.info("Copied data")
 
-    def copy_array(self, name, source, target, transfers, block_size, _copy, progress, rechunking):
+    def copy_array(self, name, source, target, _copy, progress):
         for k, v in source.attrs.items():
             target.attrs[k] = v
 
@@ -139,14 +159,14 @@
             return
 
         if name == "data":
-            self.copy_data(source, target, transfers, block_size, _copy, progress, rechunking)
+            self.copy_data(source, target, _copy, progress)
             return
 
         LOG.info(f"Copying {name}")
         target[name] = source[name]
         LOG.info(f"Copied {name}")
 
-    def copy_group(self, source, target, transfers, block_size, _copy, progress, rechunking):
+    def copy_group(self, source, target, _copy, progress):
         import zarr
 
         for k, v in source.attrs.items():
@@ -158,25 +178,19 @@
                 self.copy_group(
                     source[name],
                     group,
-                    transfers,
-                    block_size,
                     _copy,
                     progress,
-                    rechunking,
                 )
             else:
                 self.copy_array(
                     name,
                     source,
                     target,
-                    transfers,
-                    block_size,
                     _copy,
                     progress,
-                    rechunking,
                 )
 
-    def copy(self, source, target, transfers, block_size, progress, rechunking):
+    def copy(self, source, target, progress):
         import zarr
 
         if "_copy" not in target:
@@ -187,32 +201,26 @@
         _copy = target["_copy"]
         _copy_np = _copy[:]
 
-        self.copy_group(source, target, transfers, block_size, _copy_np, progress, rechunking)
+        self.copy_group(source, target, _copy_np, progress)
         del target["_copy"]
 
-    def run(self, args):
+    def run(self):
         import zarr
 
         # base, ext = os.path.splitext(os.path.basename(args.source))
         # assert ext == ".zarr", ext
         # assert "." not in base, base
-        LOG.info(f"Copying {args.source} to {args.target}")
-
-        rechunking = {}
-        if args.rechunk:
-            for r in args.rechunk:
-                k, v = r.split("=")
-                if k != "data":
-                    raise ValueError(f"Only rechunking data is supported: {k}")
-                values = v.split(",")
-                values = [-1 if x == "" else x for x in values]
-                values = tuple(int(x) for x in values)
-                rechunking[k] = values
-            for k, v in rechunking.items():
-                LOG.info(f"Rechunking {k} to {v}")
-
-        try:
-            target = zarr.open(self._store(args.target, args.nested), mode="r")
+        LOG.info(f"Copying {self.source} to {self.target}")
+
+        def target_exists():
+            try:
+                zarr.open(self._store(self.target), mode="r")
+                return True
+            except ValueError:
+                return False
+
+        def target_finished():
+            target = zarr.open(self._store(self.target), mode="r")
             if "_copy" in target:
                 done = sum(1 if x else 0 for x in target["_copy"])
                 todo = len(target["_copy"])
@@ -222,26 +230,76 @@
                     todo,
                     int(done / todo * 100 + 0.5),
                 )
+                return False
             elif "sums" in target and "data" in target:  # sums is copied last
-                LOG.error("Target already exists")
-                return
-        except ValueError as e:
-            LOG.info(f"Target does not exist: {e}")
-            pass
-
-        source = zarr.open(self._store(args.source), mode="r")
-        if args.overwrite:
-            target = zarr.open(self._store(args.target, args.nested), mode="w")
-        else:
-            try:
-                target = zarr.open(self._store(args.target, args.nested), mode="w+")
-            except ValueError:
-                target = zarr.open(self._store(args.target, args.nested), mode="w")
-        self.copy(source, target, args.transfers, args.block_size, args.progress, rechunking)
+                return True
+            return False
+
+        def open_target():
+
+            if not target_exists():
+                return zarr.open(self._store(self.target, self.nested), mode="w")
+
+            if self.overwrite:
+                LOG.error("Target already exists, overwriting.")
+                return zarr.open(self._store(self.target, self.nested), mode="w")
+
+            if self.resume:
+                if target_finished():
+                    LOG.error("Target already exists and is finished.")
+                    sys.exit(0)
+
+                LOG.error("Target already exists, resuming copy.")
+                return zarr.open(self._store(self.target, self.nested), mode="w+")
+
+            LOG.error("Target already exists, use either --overwrite or --resume.")
+            sys.exit(1)
+
+        target = open_target()
+
+        assert target is not None, target
+
+        source = zarr.open(self._store(self.source), mode="r")
+        self.copy(source, target, self.progress)
+
+
+class CopyMixin:
+    internal = True
+    timestamp = True
+
+    def add_arguments(self, command_parser):
+        group = command_parser.add_mutually_exclusive_group()
+        group.add_argument(
+            "--overwrite",
+            action="store_true",
+            help="Overwrite existing dataset. This will delete the target dataset if it already exists. Cannot be used with --resume.",
+        )
+        group.add_argument(
+            "--resume", action="store_true", help="Resume copying an existing dataset. Cannot be used with --overwrite."
+        )
+        command_parser.add_argument("--transfers", type=int, default=8, help="Number of parallel transfers.")
+        command_parser.add_argument(
+            "--progress", action="store_true", help="Force show progress bar, even if not in an interactive shell."
+        )
+        command_parser.add_argument("--nested", action="store_true", help="Use ZARR's nested directpry backend.")
+        command_parser.add_argument(
+            "--rechunk", help="Rechunk the target data array. Rechunk size should be a diviser of the block size."
+        )
+        command_parser.add_argument(
+            "--block-size",
+            type=int,
+            default=100,
+            help="For optimisation purposes, data is transfered by blocks. Default is 100.",
+        )
+        command_parser.add_argument("source", help="Source location.")
+        command_parser.add_argument("target", help="Target location.")
+
+    def run(self, args):
+        Copier(**vars(args)).run()
 
 
 class Copy(CopyMixin, Command):
-    pass
+    """Copy a dataset from one location to another."""
 
 
 command = Copy
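
The CLI surface of the copy command stays in CopyMixin, while the work moves to the new Copier class, which CopyMixin.run constructs from vars(args). A sketch of equivalent programmatic use, assuming Copier is importable as shown (the paths are made up):

    # Hypothetical programmatic use, mirroring CopyMixin.run(args) above.
    from anemoi.datasets.commands.copy import Copier

    Copier(
        source="/data/source.zarr",   # made-up paths
        target="/data/target.zarr",
        transfers=8,                  # the argparse defaults above
        block_size=100,
        overwrite=False,
        resume=True,                  # pick up a partially copied target
        progress=False,
        nested=False,
        rechunk=None,
    ).run()

The --overwrite/--resume pair replaces the old silent fallback from mode "w+" to mode "w": a pre-existing target is now an explicit error unless one of the two flags is given.
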
@@ -4,13 +4,24 @@ from . import Command
 
 
 class Create(Command):
+    """Create a dataset."""
+
     internal = True
     timestamp = True
 
     def add_arguments(self, command_parser):
-        command_parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files")
-        command_parser.add_argument("config", help="Configuration file")
-        command_parser.add_argument("path", help="Path to store the created data")
+        command_parser.add_argument(
+            "--overwrite",
+            action="store_true",
+            help="Overwrite existing files. This will delete the target dataset if it already exists.",
+        )
+        command_parser.add_argument(
+            "--test",
+            action="store_true",
+            help="Build a small dataset, using only the first dates. And, when possible, using low resolution and less ensemble members.",
+        )
+        command_parser.add_argument("config", help="Configuration yaml file defining the recipe to create the dataset.")
+        command_parser.add_argument("path", help="Path to store the created data.")
 
     def run(self, args):
        kwargs = vars(args)
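
The new --test flag reaches the builder through vars(args); a hedged sketch of the equivalent call, assuming run() forwards the parsed arguments into the Creator class that the create/__init__.py hunks further below modify (the path and config values are made up, and argument names other than overwrite/test are illustrative):

    # Hypothetical: roughly what `create --test recipe.yaml out.zarr` boils down to.
    from anemoi.datasets.create import Creator

    Creator(
        path="out.zarr",       # made-up output path
        config="recipe.yaml",  # made-up recipe file
        overwrite=False,
        test=True,   # small dataset: first dates only, low resolution if possible
    ).init()
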
@@ -11,16 +11,12 @@ import os
 from .. import Command
 from .zarr import InspectZarr
 
-# from .checkpoint import InspectCheckpoint
-
 
 class Inspect(Command, InspectZarr):
-    # class Inspect(Command, InspectCheckpoint, InspectZarr):
-    """Inspect a checkpoint or zarr file."""
+    """Inspect a zarr dataset."""
 
     def add_arguments(self, command_parser):
         # g = command_parser.add_mutually_exclusive_group()
-        # g.add_argument("--inspect", action="store_true", help="Inspect weights")
         command_parser.add_argument("path", metavar="PATH", nargs="+")
         command_parser.add_argument("--detailed", action="store_true")
         # command_parser.add_argument("--probe", action="store_true")
@@ -19,6 +19,7 @@ class Creator:
         print=print,
         statistics_tmp=None,
         overwrite=False,
+        test=None,
         **kwargs,
     ):
         self.path = path  # Output path
@@ -27,6 +28,7 @@ class Creator:
         self.print = print
         self.statistics_tmp = statistics_tmp
         self.overwrite = overwrite
+        self.test = test
 
     def init(self, check_name=False):
         # check path
@@ -43,6 +45,7 @@
             config=self.config,
             statistics_tmp=self.statistics_tmp,
             print=self.print,
+            test=self.test,
         )
         obj.initialise(check_name=check_name)
 
@@ -25,6 +25,7 @@ from anemoi.datasets.dates.groups import Groups
 from .check import DatasetName
 from .check import check_data_values
 from .chunks import ChunkFilter
+from .config import DictObj
 from .config import build_output
 from .config import loader_config
 from .input import build_input
@@ -55,6 +56,8 @@ class GenericDatasetHandler:
         self.path = path
         self.kwargs = kwargs
         self.print = print
+        if "test" in kwargs:
+            self.test = kwargs["test"]
 
     @classmethod
     def from_config(cls, *, config, path, print=print, **kwargs):
@@ -157,7 +160,35 @@ class InitialiserLoader(Loader):
 
         self.tmp_statistics.delete()
 
+        if self.test:
+
+            def test_dates(cfg, n=4):
+                LOG.warn("Running in test mode. Changing the list of dates to use only 4.")
+                groups = Groups(**cfg)
+                dates = groups.dates
+                return dict(start=dates[0], end=dates[n - 1], frequency=dates.frequency, group_by=n)
+
+            self.main_config.dates = test_dates(self.main_config.dates)
+
+            def set_to_test_mode(obj):
+                if isinstance(obj, (list, tuple)):
+                    for v in obj:
+                        set_to_test_mode(v)
+                    return
+                if isinstance(obj, (dict, DictObj)):
+                    if "grid" in obj:
+                        obj["grid"] = "20./20."
+                        LOG.warn(f"Running in test mode. Setting grid to {obj['grid']}")
+                    if "number" in obj:
+                        obj["number"] = obj["number"][0:3]
+                        LOG.warn(f"Running in test mode. Setting number to {obj['number']}")
+                    for k, v in obj.items():
+                        set_to_test_mode(v)
+
+            set_to_test_mode(self.main_config)
+
         LOG.info(self.main_config.dates)
+
         self.groups = Groups(**self.main_config.dates)
 
         self.output = build_output(self.main_config.output, parent=self)
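
A worked example of what the test mode does to a recipe, based only on the code above (the config fragment is made up):

    # Hypothetical recipe fragment before set_to_test_mode:
    config = {
        "input": {
            "grid": "0.25/0.25",           # any nested dict with a "grid" key
            "number": [0, 1, 2, 3, 4, 5],  # ensemble members
        }
    }

    # After set_to_test_mode(config):
    #   config["input"]["grid"]   == "20./20."  (coarse test grid)
    #   config["input"]["number"] == [0, 1, 2]  (first three members only)
    # and test_dates() has already shrunk the dates section to its first
    # four dates, grouped together (group_by=4).
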
@@ -28,9 +28,7 @@ class PersistentDict:
     # Used in parrallel, during data loading,
     # to write data in pickle files.
     def __init__(self, directory, create=True):
-        """dirname: str
-        The directory where the data will be stored.
-        """
+        """dirname: str The directory where the data will be stored."""
         self.dirname = directory
         self.name, self.ext = os.path.splitext(os.path.basename(self.dirname))
         if create:
@@ -26,14 +26,14 @@ LOG = logging.getLogger(__name__)
 
 
 def default_statistics_dates(dates):
-    """
-    Calculate default statistics dates based on the given list of dates.
+    """Calculate default statistics dates based on the given list of dates.
 
     Args:
         dates (list): List of datetime objects representing dates.
     Returns:
         tuple: A tuple containing the default start and end dates.
 
+
     """
 
     def to_datetime(d):
@@ -17,10 +17,7 @@ from ..check import check_stats
 
 
 class Summary(dict):
-    """This class is used to store the summary statistics of a dataset.
-    It can be saved and loaded from a json file.
-    And does some basic checks on the data.
-    """
+    """This class is used to store the summary statistics of a dataset. It can be saved and loaded from a json file. And does some basic checks on the data."""
 
     STATS_NAMES = [
         "minimum",
@@ -17,11 +17,12 @@ LOG = logging.getLogger(__name__)
 class ViewCacheArray:
     """A class that provides a caching mechanism for writing to a NumPy-like array.
 
-    The is initialized with a NumPy-like array, a shape and a list to reindex the first dimension.
-    The array is used to store the final data, while the cache is used to temporarily
-    store the data before flushing it to the array.
+    The is initialized with a NumPy-like array, a shape and a list to reindex the first
+    dimension. The array is used to store the final data, while the cache is used to
+    temporarily store the data before flushing it to the array.
 
     The `flush` method copies the contents of the cache to the final array.
+
     """
 
     def __init__(self, array, *, shape, indexes):
@@ -119,9 +119,7 @@ def _as_tuples(index):
 
 
 def expand_list_indexing(method):
-    """Allows to use slices, lists, and tuples to select data from the dataset.
-    Zarr does not support indexing with lists/arrays directly, so we need to implement it ourselves.
-    """
+    """Allows to use slices, lists, and tuples to select data from the dataset. Zarr does not support indexing with lists/arrays directly, so we need to implement it ourselves."""
 
     @wraps(method)
     def wrapper(self, index):
@@ -88,13 +88,12 @@ def _frequency_to_hours(frequency):
 
 
 def _as_date(d, dates, last):
-    if isinstance(d, np.datetime64):
-        d = d.astype(datetime.datetime)
 
-    if isinstance(d, datetime.datetime):
-        if not d.minute == 0 and d.hour == 0 and d.second == 0:
-            return np.datetime64(d)
-        d = datetime.date(d.year, d.month, d.day)
+    # WARNING, datetime.datetime is a subclass of datetime.date
+    # so we need to check for datetime.datetime first
+
+    if isinstance(d, (np.datetime64, datetime.datetime)):
+        return d
 
     if isinstance(d, datetime.date):
         d = d.year * 10_000 + d.month * 100 + d.day
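
The WARNING comment deserves a two-line demonstration, since the subclass relationship is what dictates the order of the isinstance checks:

    import datetime

    d = datetime.datetime(2024, 1, 1, 12, 0)
    assert isinstance(d, datetime.date)  # True: datetime is a subclass of date
    # A `datetime.date` check placed first would therefore also capture
    # datetimes, which is why _as_date now tests for datetime.datetime
    # (and np.datetime64) before falling through to the date branch.
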
@@ -39,9 +39,7 @@ class ReadOnlyStore(zarr.storage.BaseStore):
 
 
 class HTTPStore(ReadOnlyStore):
-    """We write our own HTTPStore because the one used by zarr (fsspec) does not play
-    well with fork() and multiprocessing.
-    """
+    """We write our own HTTPStore because the one used by zarr (fsspec) does not play well with fork() and multiprocessing."""
 
     def __init__(self, url):
         self.url = url
@@ -59,9 +57,7 @@ class HTTPStore(ReadOnlyStore):
 
 
 class S3Store(ReadOnlyStore):
-    """We write our own S3Store because the one used by zarr (fsspec) does not play well
-    with fork() and multiprocessing.
-    """
+    """We write our own S3Store because the one used by zarr (fsspec) does not play well with fork() and multiprocessing."""
 
     def __init__(self, url):
         import boto3
@@ -29,12 +29,7 @@ class check:
 
         @wraps(method)
        def wrapper(obj):
-            """
-            This is a decorator that checks the compatibility of the datasets
-            before calling the method. If the datasets are compatible, it
-            will return the result of the method, otherwise it will raise an
-            exception.
-            """
+            """This is a decorator that checks the compatibility of the datasets before calling the method. If the datasets are compatible, it will return the result of the method, otherwise it will raise an exception."""
 
            for d in obj.datasets[1:]:
                getattr(obj, check)(obj.datasets[0], d)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: anemoi-datasets
-Version: 0.2.1
+Version: 0.3.1
 Summary: A package to hold various functions to support training of ML models on ECMWF data.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License: Apache License
@@ -205,59 +205,68 @@ License: Apache License
 See the License for the specific language governing permissions and
 limitations under the License.
 
-Project-URL: Homepage, https://github.com/ecmwf/anemoi-datasets/
 Project-URL: Documentation, https://anemoi-datasets.readthedocs.io/
-Project-URL: Repository, https://github.com/ecmwf/anemoi-datasets/
+Project-URL: Homepage, https://github.com/ecmwf/anemoi-datasets/
 Project-URL: Issues, https://github.com/ecmwf/anemoi-datasets/issues
-Keywords: tools,datasets,ai
+Project-URL: Repository, https://github.com/ecmwf/anemoi-datasets/
+Keywords: ai,datasets,tools
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
-Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 License-File: LICENSE
-Requires-Dist: anemoi-utils[provenance] >=0.1.7
-Requires-Dist: zarr <=2.17.0
-Requires-Dist: pyyaml
+Requires-Dist: anemoi-utils[provenance] >=0.3
 Requires-Dist: numpy
-Requires-Dist: tqdm
+Requires-Dist: pyyaml
 Requires-Dist: semantic-version
+Requires-Dist: tqdm
+Requires-Dist: zarr <=2.17
 Provides-Extra: all
+Requires-Dist: anemoi-utils[provenance] >=0.3 ; extra == 'all'
 Requires-Dist: boto3 ; extra == 'all'
-Requires-Dist: requests ; extra == 'all'
-Requires-Dist: s3fs ; extra == 'all'
 Requires-Dist: climetlab >=0.22.1 ; extra == 'all'
 Requires-Dist: earthkit-meteo ; extra == 'all'
-Requires-Dist: pyproj ; extra == 'all'
 Requires-Dist: ecmwflibs >=0.6.3 ; extra == 'all'
+Requires-Dist: numpy ; extra == 'all'
+Requires-Dist: pyproj ; extra == 'all'
+Requires-Dist: pyyaml ; extra == 'all'
+Requires-Dist: requests ; extra == 'all'
+Requires-Dist: s3fs ; extra == 'all'
+Requires-Dist: semantic-version ; extra == 'all'
+Requires-Dist: tqdm ; extra == 'all'
+Requires-Dist: zarr <=2.17 ; extra == 'all'
 Provides-Extra: create
 Requires-Dist: climetlab >=0.22.1 ; extra == 'create'
 Requires-Dist: earthkit-meteo ; extra == 'create'
-Requires-Dist: pyproj ; extra == 'create'
 Requires-Dist: ecmwflibs >=0.6.3 ; extra == 'create'
+Requires-Dist: pyproj ; extra == 'create'
 Provides-Extra: dev
 Requires-Dist: boto3 ; extra == 'dev'
-Requires-Dist: requests ; extra == 'dev'
-Requires-Dist: s3fs ; extra == 'dev'
 Requires-Dist: climetlab >=0.22.1 ; extra == 'dev'
 Requires-Dist: earthkit-meteo ; extra == 'dev'
-Requires-Dist: pyproj ; extra == 'dev'
 Requires-Dist: ecmwflibs >=0.6.3 ; extra == 'dev'
-Requires-Dist: sphinx ; extra == 'dev'
-Requires-Dist: sphinx-rtd-theme ; extra == 'dev'
 Requires-Dist: nbsphinx ; extra == 'dev'
 Requires-Dist: pandoc ; extra == 'dev'
+Requires-Dist: pyproj ; extra == 'dev'
+Requires-Dist: requests ; extra == 'dev'
+Requires-Dist: s3fs ; extra == 'dev'
+Requires-Dist: sphinx ; extra == 'dev'
+Requires-Dist: sphinx-argparse ; extra == 'dev'
+Requires-Dist: sphinx-rtd-theme ; extra == 'dev'
 Provides-Extra: docs
-Requires-Dist: sphinx ; extra == 'docs'
-Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
 Requires-Dist: nbsphinx ; extra == 'docs'
 Requires-Dist: pandoc ; extra == 'docs'
+Requires-Dist: sphinx ; extra == 'docs'
+Requires-Dist: sphinx-argparse ; extra == 'docs'
+Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
 Provides-Extra: remote
 Requires-Dist: boto3 ; extra == 'remote'
 Requires-Dist: requests ; extra == 'remote'
@@ -1,28 +1,28 @@
 anemoi/datasets/__init__.py,sha256=DC7ttKT--pmhBQALX_Cn7P28dngsJucKi5y-Ydm28QM,700
-anemoi/datasets/__main__.py,sha256=CGl8WF7rWMx9EoArysla0-ThjUFtEZUEGM58LbdU488,1798
-anemoi/datasets/_version.py,sha256=MxUhzLJIZQfEpDTTcKSxciTGrMLd5v2VmMlHa2HGeo0,411
+anemoi/datasets/__main__.py,sha256=cLA2PidDTOUHaDGzd0_E5iioKYNe-PSTv567Y2fuwQk,723
+anemoi/datasets/_version.py,sha256=HzPz9rq3s1AiZXregKlqKaJJ2wGMtvH_a3V9la9CnpM,411
 anemoi/datasets/grids.py,sha256=3YBMMJodgYhavarXPAlMZHaMtDT9v2IbTmAXZTqf8Qo,8481
-anemoi/datasets/commands/__init__.py,sha256=Pc5bhVgW92ox1lMR5WUOLuhiY2HT6PsadSHclyw99Vc,1983
+anemoi/datasets/commands/__init__.py,sha256=qAybFZPBBQs0dyx7dZ3X5JsLpE90pwrqt1vSV7cqEIw,706
 anemoi/datasets/commands/compare.py,sha256=tN3eqihvnZ0rFc0OUzrfI34PHDlYfc2l90ZIQBE1TDQ,1300
-anemoi/datasets/commands/copy.py,sha256=GZ5TmJKDOAKka9zc0YUtvmqynRqBTeb3hI_v3jLtUDM,7995
-anemoi/datasets/commands/create.py,sha256=UVieF0g1cEgNP_myklUZOSH_MuxwfYzKay5s8WDRzro,562
+anemoi/datasets/commands/copy.py,sha256=fba-zjD0iTHHXHhPEcm8VhDzsXQXDUxlbtTA1TovyT0,9991
+anemoi/datasets/commands/create.py,sha256=POdOsVDlvRrHFFkI3SNXNgNIbSxkVUUPMoo660x7Ma0,987
 anemoi/datasets/commands/scan.py,sha256=HxsLdCgBMSdEXjlJfPq5M_9LxXHHQIoZ1ZEHO_AoPgA,2881
-anemoi/datasets/commands/inspect/__init__.py,sha256=SqiWlIJSov7-RnZmIQBzsE4Br7hgl9CqshpXaQqpios,1701
+anemoi/datasets/commands/inspect/__init__.py,sha256=v6fPUTdMRdmUiEUUs0F74QlzPr-x5XEEOql3mkFme7E,1500
 anemoi/datasets/commands/inspect/zarr.py,sha256=Q1waDTgdJZwJXNST4jkO4DCIbqbf2T_2Us2k6yKGToo,19684
 anemoi/datasets/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 anemoi/datasets/compute/recentre.py,sha256=j8LdC8kq1t4PW7WFTXf93hSxok10un8ENIPwCehzbP8,4768
-anemoi/datasets/create/__init__.py,sha256=o7pZTL71XqoD3a10VrSnwroAFvN4g_9o98jEoMArjfk,5731
+anemoi/datasets/create/__init__.py,sha256=jji65Zni5aPTvS269fAMix4pN9ukmSoK0z5SVsbpr5E,5807
 anemoi/datasets/create/check.py,sha256=DLjw-eyaCNxPhoKFsP4Yn_l3SIr57YHdyPR-tE5vx80,5791
 anemoi/datasets/create/chunks.py,sha256=YEDcr0K2KiiceSTiBuZzj0TbRbzZ9J546XO7rrrTFQw,2441
 anemoi/datasets/create/config.py,sha256=uLIp1WHg3hbqwwMV9EepMwJQsXJAGImkbo0okBeEVd4,7683
 anemoi/datasets/create/input.py,sha256=UqEIqbsld0whUJUPPVKMfF_LoeKTaTyxP5kBE6zjhsE,27888
-anemoi/datasets/create/loaders.py,sha256=5KzbkZMV5c64avDwanGznj54gMIbLvwb0dXWvgUuD0Q,28611
+anemoi/datasets/create/loaders.py,sha256=BDeb2CI_oWqIGcBnt39nRGIt3r3dK4rIidNO3pBunTk,29865
 anemoi/datasets/create/patch.py,sha256=xjCLhvIQKRqmypsKInRU1CvFh1uoaB3YGSQP1UVZZik,3682
-anemoi/datasets/create/persistent.py,sha256=vQuKuEggLGhNO8A7lsUHXzdVOhqAzZh50xsb-eSF6qQ,4307
+anemoi/datasets/create/persistent.py,sha256=nT8gvhVPdI1H3zW_F7uViGKIlQQ94jCDrMSWTmhQ2_A,4290
 anemoi/datasets/create/size.py,sha256=A1w6RkaL0L9IlwIdmYsCTJTecmY_QtvbkGf__jvQle0,1068
 anemoi/datasets/create/template.py,sha256=2roItOYJzjGB0bKS28f6EjfpomP0ppT4v6T9fYzjRxQ,4263
 anemoi/datasets/create/utils.py,sha256=H1-auNSZUSDW0Aog8CHnIfZlzgKE1XPoi1I40CqquA4,3676
-anemoi/datasets/create/writer.py,sha256=BHzPDhET2BnPt-359CZ_yaaR2otIz2iENbsyQIaktxU,1378
+anemoi/datasets/create/writer.py,sha256=G1qAPvdn8anGnpWYhvSSP4u3Km_tHKPdMXm0G4skKSk,1379
 anemoi/datasets/create/zarr.py,sha256=hwM_PaYTa_IgFY1VC7qdYTWQ5MXCWWlMrzXsV_eAY0Q,4776
 anemoi/datasets/create/functions/__init__.py,sha256=K-Wi11mZI5Y6od0y6I_apDutoeay7wNrtB1P3-PizgI,513
 anemoi/datasets/create/functions/filters/__init__.py,sha256=Xe9G54CKvCI3ji-7k0R5l0WZZdhlydRgawsXuBcX_hg,379
@@ -44,8 +44,8 @@ anemoi/datasets/create/functions/sources/opendap.py,sha256=T0CPinscfafrVLaye5ue-
 anemoi/datasets/create/functions/sources/recentre.py,sha256=t07LIXG3Hp9gmPkPriILVt86TxubsHyS1EL1lzwgtXY,1810
 anemoi/datasets/create/functions/sources/source.py,sha256=hPQnV_6UIxFw97uRKcTA8TplcgG1kC8NlFHoEaaLet4,1418
 anemoi/datasets/create/functions/sources/tendencies.py,sha256=kwS_GZt8R9kpfs5RrvxPb0Gj-5nDP0sgJgfSRCAwwww,4057
-anemoi/datasets/create/statistics/__init__.py,sha256=b5LXV1J3uKpmTkNHt8hLWgUo-C5WHA0ltxVJa7b0aLc,15449
-anemoi/datasets/create/statistics/summary.py,sha256=NHzKwsMOlJENBGs6GlbmcIq4mAwsfvR9q6mdfXXgCXk,3383
+anemoi/datasets/create/statistics/__init__.py,sha256=X50drgE-ltuNe7bSIyvyeC4GeTqGTQGbglh2-2aVWKE,15445
+anemoi/datasets/create/statistics/summary.py,sha256=sgmhA24y3VRyjmDUgTnPIqcHSlWBbFA0qynx6gJ9Xw8,3370
 anemoi/datasets/data/__init__.py,sha256=tacn6K_VZ-pYhLmGePG5sze8kmqGpqscYb-bMyQnWtk,888
 anemoi/datasets/data/concat.py,sha256=U6IZi6NkI6yccrDamgasENBqwyJ1m0ZesuDtHXoqEh8,3551
 anemoi/datasets/data/dataset.py,sha256=UDnidq2amyCT2COH05pGfDCJcmkdMj1ubtHk9cl-qcE,7384
@@ -54,21 +54,21 @@ anemoi/datasets/data/debug.py,sha256=PcyrjgxaLzeb_vf12pvUtPPVvBRHNm1SimythZvqsP4
 anemoi/datasets/data/ensemble.py,sha256=PcrdNL4DhAuWYSXgNxC6igDXpDndXC_QrbLrL4Lvj-Y,1138
 anemoi/datasets/data/forewards.py,sha256=4IsaNDhYlLiCbawUvTynm2vdpGPqdXcrSoAENwsJoqI,7456
 anemoi/datasets/data/grids.py,sha256=vgZMIQbv5SnIcnPu2ujsrAQ8VyBz5o2a1SnxsjXkDuw,7495
-anemoi/datasets/data/indexing.py,sha256=ymuFO2yH12ztYnP_gmHpuBuLmKAxv2t8Pz5m1gGmBzk,4808
+anemoi/datasets/data/indexing.py,sha256=625m__JG5m_tDMrkz1hB6Vydenwt0oHuyAlc-o3Zwos,4799
 anemoi/datasets/data/join.py,sha256=m_lpxWPy8-xYOjPbVoBV3V92VGtBFIriiDWvQM6KqXc,4893
 anemoi/datasets/data/masked.py,sha256=KZZ-3nq9saj_W8PTN9V4YdZ24BayHgECj12i4yjyKpc,3525
-anemoi/datasets/data/misc.py,sha256=a-YIrCaSkOuEKHT_Q1UYADkb2wYycekRrFwZCgyW8-s,10428
+anemoi/datasets/data/misc.py,sha256=m_28VIhX546RIoVfGpimPOThl5EwOhkun2UgWMAUxqw,10355
 anemoi/datasets/data/select.py,sha256=JoEepq8iRSSX6L75hzhLrBFhy0RJInuBM3C_Eu2Ryv0,3608
 anemoi/datasets/data/statistics.py,sha256=rWuG5qlfQoo9shOXR6TleJbJONwYggxxLy_HRet8azM,1582
-anemoi/datasets/data/stores.py,sha256=gJVyg4ydIsVXWwnww-UV3uaWNXLkcz_dx2r9AREPZrE,10869
+anemoi/datasets/data/stores.py,sha256=damJzNScaGenARAv8xpNa7d32f03MpGk5adRoRi34yw,10851
 anemoi/datasets/data/subset.py,sha256=RjfOMu7p69DZXRxQpvTfDOjVAURhgUO2pWyuZpXlJGY,3671
-anemoi/datasets/data/unchecked.py,sha256=LSBLSQXzkLhoprkI2PY6OEoeX0lVT-nIe-ZyibH2jv0,4100
+anemoi/datasets/data/unchecked.py,sha256=qeUKthbvVVSPH-P366q1DEofvPzZSSXCXA49x-RkBOc,4038
 anemoi/datasets/dates/__init__.py,sha256=zOph2N_mXYbjSvqEWYF1mmm-UZpljb61WLrdFJmi0qQ,4469
 anemoi/datasets/dates/groups.py,sha256=iq310Pi7ullglOhcNblv14MmcT8FPgYCD5s45qAfV_s,3383
 anemoi/datasets/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-anemoi_datasets-0.2.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-anemoi_datasets-0.2.1.dist-info/METADATA,sha256=2RnNBqotAC66veovXZnAhzlcbN5V9qIHeQ7DQAFgIMs,15628
-anemoi_datasets-0.2.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-anemoi_datasets-0.2.1.dist-info/entry_points.txt,sha256=yR-o-4uiPEA_GLBL81SkMYnUoxq3CAV3hHulQiRtGG0,66
-anemoi_datasets-0.2.1.dist-info/top_level.txt,sha256=DYn8VPs-fNwr7fNH9XIBqeXIwiYYd2E2k5-dUFFqUz0,7
-anemoi_datasets-0.2.1.dist-info/RECORD,,
+anemoi_datasets-0.3.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+anemoi_datasets-0.3.1.dist-info/METADATA,sha256=cLrTNyT23kitgUq05PNMK5Ni-pI2AMwdzy4dOh7jZjo,16050
+anemoi_datasets-0.3.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+anemoi_datasets-0.3.1.dist-info/entry_points.txt,sha256=yR-o-4uiPEA_GLBL81SkMYnUoxq3CAV3hHulQiRtGG0,66
+anemoi_datasets-0.3.1.dist-info/top_level.txt,sha256=DYn8VPs-fNwr7fNH9XIBqeXIwiYYd2E2k5-dUFFqUz0,7
+anemoi_datasets-0.3.1.dist-info/RECORD,,