anemoi-datasets 0.5.20__py3-none-any.whl → 0.5.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/check.py +93 -0
  3. anemoi/datasets/commands/check.py +101 -0
  4. anemoi/datasets/commands/copy.py +43 -3
  5. anemoi/datasets/commands/create.py +2 -3
  6. anemoi/datasets/commands/grib-index.py +0 -3
  7. anemoi/datasets/commands/inspect.py +2 -2
  8. anemoi/datasets/commands/scan.py +17 -5
  9. anemoi/datasets/create/__init__.py +19 -8
  10. anemoi/datasets/create/check.py +19 -1
  11. anemoi/datasets/create/input/action.py +2 -0
  12. anemoi/datasets/create/input/result.py +6 -2
  13. anemoi/datasets/create/sources/accumulations.py +400 -34
  14. anemoi/datasets/create/sources/forcings.py +1 -1
  15. anemoi/datasets/create/sources/grib.py +27 -181
  16. anemoi/datasets/create/sources/xarray_support/metadata.py +6 -0
  17. anemoi/datasets/create/sources/xarray_zarr.py +1 -1
  18. anemoi/datasets/create/writer.py +1 -1
  19. anemoi/datasets/data/complement.py +28 -11
  20. anemoi/datasets/data/forwards.py +4 -0
  21. anemoi/datasets/data/grids.py +3 -3
  22. anemoi/datasets/data/misc.py +1 -1
  23. anemoi/datasets/data/stores.py +36 -4
  24. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/METADATA +5 -3
  25. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/RECORD +29 -27
  26. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/WHEEL +1 -1
  27. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/entry_points.txt +0 -0
  28. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/licenses/LICENSE +0 -0
  29. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/top_level.txt +0 -0

anemoi/datasets/_version.py
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.5.20'
-__version_tuple__ = version_tuple = (0, 5, 20)
+__version__ = version = '0.5.22'
+__version_tuple__ = version_tuple = (0, 5, 22)

anemoi/datasets/check.py
@@ -0,0 +1,93 @@
+# (C) Copyright 2025 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+
+# A collection of functions to support pytest testing
+
+import logging
+import math
+import os
+import re
+
+LOG = logging.getLogger(__name__)
+
+
+def _check_group(group, verbosity: int, *path) -> None:
+    import zarr
+
+    group_keys = sorted(group.keys())
+    if not group_keys:
+        raise ValueError(f"Check group: {group} is empty.")
+
+    for name in sorted(group_keys):
+        if name.startswith("."):
+            if verbosity > 1:
+                LOG.info(f"Check group: skipping {name}")
+            continue
+
+        if isinstance(group[name], zarr.hierarchy.Group):
+            _check_group(group[name], verbosity, *path, name)
+        else:
+            _check_array(group[name], verbosity, *path, name)
+
+
+def _check_array(array, verbosity: int, *path) -> None:
+    assert len(array.chunks) == len(array.shape)
+    assert math.prod(array.shape) % math.prod(array.chunks) == 0
+
+    file_count = math.prod(array.shape) // math.prod(array.chunks)
+
+    full = os.path.join(*path)
+
+    chunks = array.chunks
+
+    count = 0
+    for f in os.listdir(full):
+        if verbosity > 1:
+            LOG.info(f"Check array: checking {f}")
+
+        if f.startswith("."):
+            if verbosity > 1:
+                LOG.info(f"Check array: skipping {f}")
+            continue
+
+        bits = f.split(".")
+
+        if len(bits) != len(chunks):
+            raise ValueError(f"File {f} is not a valid chunk file.")
+
+        if not all(re.match(r"^\d+$", bit) for bit in bits):
+            raise ValueError(f"File {f} is not a valid chunk file.")
+
+        count += 1
+
+    if count != file_count:
+        raise ValueError(f"File count {count} does not match expected {file_count} for {array.name}.")
+
+
+def check_zarr(path: str, verbosity: int = 0) -> None:
+    """Check if a Zarr archive is valid, that no files are missing, and that the chunking is correct.
+
+    Parameters
+    ----------
+    path : str
+        Path to the Zarr archive.
+    verbosity : int, optional
+        Verbosity level for logging. Default is 0 (no logging).
+    """
+    import zarr
+
+    if verbosity > 0:
+        LOG.info(f"Checking Zarr archive {path}")
+
+    if not os.path.exists(path) and not os.path.isdir(path):
+        # This does not work with non-directory Zarr archives
+        raise ValueError(f"Path {path} does not exist.")
+
+    _check_group(zarr.open(path, mode="r"), verbosity, path)
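
A minimal usage sketch of the new helper (the path below is hypothetical): check_zarr walks a directory-backed store and raises ValueError on empty groups or on missing or misnamed chunk files.

    from anemoi.datasets.check import check_zarr

    # verbosity=1 logs the archive being checked; verbosity=2 also logs
    # every group member and chunk file visited.
    check_zarr("/data/example-dataset.zarr", verbosity=1)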

anemoi/datasets/commands/check.py
@@ -0,0 +1,101 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import logging
+import os
+from typing import Any
+
+import yaml
+
+from anemoi.datasets.create.check import DatasetName
+
+from . import Command
+
+LOG = logging.getLogger(__name__)
+
+
+class Check(Command):
+    """Check if a dataset name follow naming conventions."""
+
+    timestamp = True
+
+    def add_arguments(self, command_parser: Any) -> None:
+        """Add command line arguments to the parser.
+
+        Parameters
+        ----------
+        command_parser : Any
+            The command line argument parser.
+        """
+
+        exclusive_group = command_parser.add_mutually_exclusive_group(required=True)
+
+        exclusive_group.add_argument(
+            "--name",
+            help="Check a dataset name.",
+        )
+
+        exclusive_group.add_argument(
+            "--recipe",
+            help="Specify the recipe file to check.",
+        )
+
+        exclusive_group.add_argument(
+            "--zarr",
+            help="Specify the Zarr archive to check.",
+        )
+
+        exclusive_group.add_argument(
+            "--metadata",
+            help="Specify the metadata file to check.",
+        )
+
+    def run(self, args: Any) -> None:
+
+        if args.recipe:
+            self._check_recipe(args.recipe)
+
+        if args.metadata:
+            self._check_metadata(args.metadata)
+
+        if args.name:
+            self._check_name(args.name)
+
+        if args.zarr:
+            self._check_zarr(args.zarr)
+
+    def _check_metadata(self, metadata: str) -> None:
+        pass
+
+    def _check_recipe(self, recipe: str) -> None:
+
+        recipe_filename = os.path.basename(recipe)
+        recipe_name = os.path.splitext(recipe_filename)[0]
+        in_recipe_name = yaml.safe_load(open(recipe, "r", encoding="utf-8"))["name"]
+        if recipe_name != in_recipe_name:
+            print(f"Recipe name {recipe_name} does not match the name in the recipe file {in_recipe_name}")
+
+        name = in_recipe_name
+        DatasetName(name=name).raise_if_not_valid()
+
+    def _check_name(self, name: str) -> None:
+
+        DatasetName(name=name).raise_if_not_valid()
+
+    def _check_zarr(self, zarr: str) -> None:
+
+        from anemoi.datasets.check import check_zarr
+
+        check_zarr(zarr)
+
+        # ds = xr.open_dataset(zarr)
+        # print(ds)
+
+
+command = Check
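
Given the package's existing entry point, the new subcommand would presumably be invoked as `anemoi-datasets check` with exactly one of `--name`, `--recipe`, `--zarr` or `--metadata`, since the options form a required mutually exclusive group (note that `--metadata` is currently a no-op).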

anemoi/datasets/commands/copy.py
@@ -20,6 +20,8 @@ import tqdm
 from anemoi.utils.remote import Transfer
 from anemoi.utils.remote import TransferMethodNotImplementedError
 
+from anemoi.datasets.check import check_zarr
+
 from . import Command
 
 LOG = logging.getLogger(__name__)
@@ -319,10 +321,30 @@ class ZarrCopier:
         """
         import zarr
 
+        if self.verbosity > 0:
+            LOG.info(f"Copying group {source} to {target}")
+
         for k, v in source.attrs.items():
+            if self.verbosity > 1:
+                import textwrap
+
+                LOG.info(f"Copying attribute {k} = {textwrap.shorten(str(v), 40)}")
             target.attrs[k] = v
 
-        for name in sorted(source.keys()):
+        source_keys = list(source.keys())
+
+        if not source_keys:
+            raise ValueError(f"Source group {source} is empty.")
+
+        if self.verbosity > 1:
+            LOG.info(f"Keys {source_keys}")
+
+        for name in sorted(source_keys):
+            if name.startswith("."):
+                if self.verbosity > 1:
+                    LOG.info(f"Skipping {name}")
+                continue
+
             if isinstance(source[name], zarr.hierarchy.Group):
                 group = target[name] if name in target else target.create_group(name)
                 self.copy_group(
@@ -362,6 +384,11 @@ class ZarrCopier:
         _copy = target["_copy"]
         _copy_np = _copy[:]
 
+        if self.verbosity > 1:
+            import numpy as np
+
+            LOG.info(f"copy {np.sum(_copy_np)} of {len(_copy_np)}")
+
         self.copy_group(source, target, _copy_np, verbosity)
         del target["_copy"]
 
@@ -417,12 +444,25 @@ class ZarrCopier:
             LOG.error("Target already exists, use either --overwrite or --resume.")
             sys.exit(1)
 
+        if self.verbosity > 0:
+            LOG.info(f"Open target: {self.target}")
+
         target = open_target()
 
         assert target is not None, target
 
+        if self.verbosity > 0:
+            LOG.info(f"Open source: {self.source}")
+
         source = zarr.open(self._store(self.source), mode="r")
+        # zarr.consolidate_metadata(source)
+
         self.copy(source, target, self.verbosity)
+        if os.path.exists(self.target) and os.path.isdir(self.target):
+            LOG.info(f"Checking target: {self.target}")
+            check_zarr(self.target, self.verbosity)
+        else:
+            LOG.info(f"Target {self.target} is not a local directory, skipping check.")
 
 
 class CopyMixin:
@@ -488,8 +528,8 @@ class CopyMixin:
         if args.source.startswith("s3://") and not args.source.endswith("/"):
             args.source = args.source + "/"
         copier = Transfer(
-            args.source,
-            args.target,
+            source=args.source,
+            target=args.target,
             overwrite=args.overwrite,
             resume=args.resume,
             verbosity=args.verbosity,

anemoi/datasets/commands/create.py
@@ -180,10 +180,9 @@ class Create(Command):
             executor.submit(task, "init-additions", options).result()
 
         with ExecutorClass(max_workers=parallel) as executor:
-            opt = options.copy()
-            opt["parts"] = f"{n+1}/{total}"
-            futures.append(executor.submit(task, "load", opt))
             for n in range(total):
+                opt = options.copy()
+                opt["parts"] = f"{n+1}/{total}"
                 futures.append(executor.submit(task, "load-additions", opt))
 
             for future in tqdm.tqdm(

anemoi/datasets/commands/grib-index.py
@@ -29,8 +29,6 @@ class GribIndexCmd(Command):
             The command parser to which arguments are added.
         """
 
-        from anemoi.datasets.create.sources.grib_index import KEYS
-
         command_parser.add_argument(
             "--index",
             help="Path to the index file to create or update",
@@ -52,7 +50,6 @@ class GribIndexCmd(Command):
         command_parser.add_argument(
             "--keys",
             help="GRIB keys to add to the index, separated by commas. If the list starts with a +, the keys are added to default list.",
-            default=",".join(KEYS),
         )
 
         command_parser.add_argument(

anemoi/datasets/commands/inspect.py
@@ -401,7 +401,7 @@ class Version:
             return
 
         if self.build_flags is None:
-            print("🪫 Dataset not initialized")
+            print("🪫 Dataset not initialised")
            return
 
        build_flags = self.build_flags
@@ -426,7 +426,7 @@ class Version:
         )
         start = self.initialised
         if self.initialised:
-            print(f"🕰️ Dataset initialized {when(start)}.")
+            print(f"🕰️ Dataset initialised {when(start)}.")
         if built and latest:
             speed = (latest - start) / built
             eta = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) + speed * (total - built)

anemoi/datasets/commands/scan.py
@@ -23,6 +23,16 @@ KEYS = ("class", "type", "stream", "expver", "levtype", "domain")
 
 
 class Scan(Command):
+    """Command to scan files and generate a configuration file.
+
+    Attributes
+    ----------
+    internal : bool
+        Indicates whether the command is internal.
+    timestamp : bool
+        Indicates whether to include a timestamp.
+    """
+
     internal = True
     timestamp = True
 
@@ -32,8 +42,9 @@ class Scan(Command):
         Parameters
         ----------
         command_parser : Any
-            The command parser to which arguments are added.
+            The command-line argument parser.
         """
+
         command_parser.add_argument(
             "--match",
             help="Give a glob pattern to match files (default: *.grib)",
@@ -51,22 +62,23 @@ class Scan(Command):
         Parameters
         ----------
         args : Any
-            The arguments passed to the command.
+            Parsed command-line arguments.
         """
 
         def match(path: str) -> bool:
-            """Check if a path matches the given pattern.
+            """Check if a file path matches the given glob pattern.
 
             Parameters
             ----------
             path : str
-                The path to check.
+                The file path to check.
 
             Returns
             -------
             bool
-                True if the path matches, False otherwise.
+                True if the path matches the pattern, False otherwise.
             """
+
             return fnmatch.fnmatch(path, args.match)
 
         paths = []

anemoi/datasets/create/__init__.py
@@ -938,13 +938,23 @@ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
         check_shape(cube, dates, dates_in_data)
 
         def check_dates_in_data(dates_in_data, requested_dates):
-            requested_dates = [np.datetime64(_) for _ in requested_dates]
-            dates_in_data = [np.datetime64(_) for _ in dates_in_data]
-            assert dates_in_data == requested_dates, (
-                "Dates in data are not the requested ones:",
-                dates_in_data,
-                requested_dates,
-            )
+            _requested_dates = [np.datetime64(_) for _ in requested_dates]
+            _dates_in_data = [np.datetime64(_) for _ in dates_in_data]
+            if _dates_in_data != _requested_dates:
+                LOG.error("Dates in data are not the requested ones:")
+
+                dates_in_data = set(dates_in_data)
+                requested_dates = set(requested_dates)
+
+                missing = sorted(requested_dates - dates_in_data)
+                extra = sorted(dates_in_data - requested_dates)
+
+                if missing:
+                    LOG.error(f"Missing dates: {[_.isoformat() for _ in missing]}")
+                if extra:
+                    LOG.error(f"Extra dates: {[_.isoformat() for _ in extra]}")
+
+                raise ValueError("Dates in data are not the requested ones")
 
         check_dates_in_data(dates_in_data, dates)
 
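A hedged illustration of the new diagnostics, with made-up dates: the set differences separate what was requested from what actually arrived.

    import datetime

    requested = {datetime.datetime(2020, 1, 1, h) for h in (0, 6, 12, 18)}
    in_data = {datetime.datetime(2020, 1, 1, h) for h in (0, 6, 12)}

    missing = sorted(requested - in_data)  # [2020-01-01 18:00] -> logged as "Missing dates"
    extra = sorted(in_data - requested)    # [] -> nothing extra to report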
@@ -1075,6 +1085,7 @@ class Cleanup(Actor, HasRegistryMixin, HasStatisticTempMixin):
 
     def run(self) -> None:
         """Run the cleanup."""
+
         self.tmp_statistics.delete()
         self.registry.clean()
         for actor in self.actors:
@@ -1215,7 +1226,7 @@ class _InitAdditions(Actor, HasRegistryMixin, AdditionsMixin):
         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=True)
         self.tmp_storage.delete()
         self.tmp_storage.create()
-        LOG.info(f"Dataset {self.tmp_storage_path} additions initialized.")
+        LOG.info(f"Dataset {self.tmp_storage_path} additions initialised.")
 
     def cleanup(self) -> None:
         """Clean up the temporary storage."""

anemoi/datasets/create/check.py
@@ -18,6 +18,7 @@ from typing import Optional
 from typing import Union
 
 import numpy as np
+from anemoi.utils.config import load_config
 from anemoi.utils.dates import frequency_to_string
 from numpy.typing import NDArray
 
@@ -25,7 +26,7 @@ LOG = logging.getLogger(__name__)
 
 
 class DatasetName:
-    """Class to validate and parse dataset names according to naming conventions."""
+    """Validate and parse dataset names according to naming conventions."""
 
     def __init__(
         self,
@@ -58,6 +59,14 @@ class DatasetName:
 
         self.messages = []
 
+        config = load_config().get("datasets", {})
+
+        if config.get("ignore_naming_conventions", False):
+            # setting the env variable ANEMOI_CONFIG_DATASETS_IGNORE_NAMING_CONVENTIONS=1
+            # will ignore the naming conventions
+            return
+
+        self.check_characters()
         self.check_parsed()
         self.check_resolution(resolution)
         self.check_frequency(frequency)
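
A sketch of the opt-out, assuming (as the comment in the diff above states) that anemoi.utils.config maps this environment variable into the "datasets" section returned by load_config():

    import os

    # Must be set before DatasetName is constructed.
    os.environ["ANEMOI_CONFIG_DATASETS_IGNORE_NAMING_CONVENTIONS"] = "1"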
@@ -157,6 +166,15 @@
         self._check_missing("resolution", resolution_str)
         self._check_mismatch("resolution", resolution_str)
 
+    def check_characters(self) -> None:
+        if not self.name.islower():
+            self.messages.append(f"the {self.name} should be in lower case.")
+        if "_" in self.name:
+            self.messages.append(f"the {self.name} should use '-' instead of '_'.")
+        for c in self.name:
+            if not c.isalnum() and c not in "-":
+                self.messages.append(f"the {self.name} should only contain alphanumeric characters and '-'.")
+
     def check_frequency(self, frequency: Optional[datetime.timedelta]) -> None:
         """Check if the frequency matches the expected format.
 
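For illustration, a hypothetical name run against the new character rules (raise_if_not_valid is the same entry point the check command uses; a name may still fail the other checks, e.g. on resolution or frequency):

    from anemoi.datasets.create.check import DatasetName

    # "era5-o96-1979-2022-6h" is lower case and uses '-' only, so
    # check_characters() records no messages; a name like "ERA5_o96"
    # would trip both the lower-case rule and the '-' vs '_' rule.
    DatasetName(name="era5-o96-1979-2022-6h").raise_if_not_valid()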

anemoi/datasets/create/input/action.py
@@ -7,6 +7,7 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+import json
 import logging
 from copy import deepcopy
 from typing import Any
@@ -225,6 +226,7 @@ def action_factory(config: Dict[str, Any], context: ActionContext, action_path:
     if not isinstance(config, dict):
         raise ValueError(f"Invalid input config {config}")
     if len(config) != 1:
+        print(json.dumps(config, indent=2, default=str))
         raise ValueError(f"Invalid input config. Expecting dict with only one key, got {list(config.keys())}")
 
     config = deepcopy(config)

anemoi/datasets/create/input/result.py
@@ -132,7 +132,8 @@ def _fields_metatata(variables: Tuple[str, ...], cube: Any) -> Dict[str, Any]:
 
         # GRIB1 precipitation accumulations are not correctly encoded
         if startStep == endStep and stepTypeForConversion == "accum":
-            startStep = 0
+            endStep = f.metadata("P1")
+            startStep = f.metadata("P2")
 
         if startStep != endStep:
             # https://codes.ecmwf.int/grib/format/grib2/ctables/4/10/
@@ -415,7 +416,10 @@ class Result:
         print()
         print("Number of unique values found for each coordinate:")
         for k, v in user_coords.items():
-            print(f" {k:20}:", len(v), shorten_list(v, max_length=10))
+            print(f" {k:20}:", len(v))
+            for n in sorted(v):
+                print(" ", n)
+
         print()
         user_shape: Tuple[int, ...] = tuple(len(v) for k, v in user_coords.items())
         print("Shape of the hypercube :", user_shape)