anemoi-datasets 0.5.20__py3-none-any.whl → 0.5.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/check.py +93 -0
  3. anemoi/datasets/commands/check.py +101 -0
  4. anemoi/datasets/commands/copy.py +43 -3
  5. anemoi/datasets/commands/create.py +2 -3
  6. anemoi/datasets/commands/grib-index.py +0 -3
  7. anemoi/datasets/commands/inspect.py +2 -2
  8. anemoi/datasets/commands/scan.py +17 -5
  9. anemoi/datasets/create/__init__.py +19 -8
  10. anemoi/datasets/create/check.py +19 -1
  11. anemoi/datasets/create/input/action.py +2 -0
  12. anemoi/datasets/create/input/result.py +6 -2
  13. anemoi/datasets/create/sources/accumulations.py +400 -34
  14. anemoi/datasets/create/sources/forcings.py +1 -1
  15. anemoi/datasets/create/sources/grib.py +27 -181
  16. anemoi/datasets/create/sources/xarray_support/metadata.py +6 -0
  17. anemoi/datasets/create/sources/xarray_zarr.py +1 -1
  18. anemoi/datasets/create/writer.py +1 -1
  19. anemoi/datasets/data/complement.py +28 -11
  20. anemoi/datasets/data/forwards.py +4 -0
  21. anemoi/datasets/data/grids.py +3 -3
  22. anemoi/datasets/data/misc.py +1 -1
  23. anemoi/datasets/data/stores.py +36 -4
  24. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/METADATA +5 -3
  25. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/RECORD +29 -27
  26. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/WHEEL +1 -1
  27. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/entry_points.txt +0 -0
  28. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/licenses/LICENSE +0 -0
  29. {anemoi_datasets-0.5.20.dist-info → anemoi_datasets-0.5.22.dist-info}/top_level.txt +0 -0

anemoi/datasets/_version.py
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.5.20'
-__version_tuple__ = version_tuple = (0, 5, 20)
+__version__ = version = '0.5.22'
+__version_tuple__ = version_tuple = (0, 5, 22)

anemoi/datasets/check.py
@@ -0,0 +1,93 @@
+# (C) Copyright 2025 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+
+# A collection of functions to support pytest testing
+
+import logging
+import math
+import os
+import re
+
+LOG = logging.getLogger(__name__)
+
+
+def _check_group(group, verbosity: int, *path) -> None:
+    import zarr
+
+    group_keys = sorted(group.keys())
+    if not group_keys:
+        raise ValueError(f"Check group: {group} is empty.")
+
+    for name in sorted(group_keys):
+        if name.startswith("."):
+            if verbosity > 1:
+                LOG.info(f"Check group: skipping {name}")
+            continue
+
+        if isinstance(group[name], zarr.hierarchy.Group):
+            _check_group(group[name], verbosity, *path, name)
+        else:
+            _check_array(group[name], verbosity, *path, name)
+
+
+def _check_array(array, verbosity: int, *path) -> None:
+    assert len(array.chunks) == len(array.shape)
+    assert math.prod(array.shape) % math.prod(array.chunks) == 0
+
+    file_count = math.prod(array.shape) // math.prod(array.chunks)
+
+    full = os.path.join(*path)
+
+    chunks = array.chunks
+
+    count = 0
+    for f in os.listdir(full):
+        if verbosity > 1:
+            LOG.info(f"Check array: checking {f}")
+
+        if f.startswith("."):
+            if verbosity > 1:
+                LOG.info(f"Check array: skipping {f}")
+            continue
+
+        bits = f.split(".")
+
+        if len(bits) != len(chunks):
+            raise ValueError(f"File {f} is not a valid chunk file.")
+
+        if not all(re.match(r"^\d+$", bit) for bit in bits):
+            raise ValueError(f"File {f} is not a valid chunk file.")
+
+        count += 1
+
+    if count != file_count:
+        raise ValueError(f"File count {count} does not match expected {file_count} for {array.name}.")
+
+
+def check_zarr(path: str, verbosity: int = 0) -> None:
+    """Check if a Zarr archive is valid, that no files are missing, and that the chunking is correct.
+
+    Parameters
+    ----------
+    path : str
+        Path to the Zarr archive.
+    verbosity : int, optional
+        Verbosity level for logging. Default is 0 (no logging).
+    """
+    import zarr
+
+    if verbosity > 0:
+        LOG.info(f"Checking Zarr archive {path}")
+
+    if not os.path.exists(path) and not os.path.isdir(path):
+        # This does not work with non-directory Zarr archives
+        raise ValueError(f"Path {path} does not exist.")
+
+    _check_group(zarr.open(path, mode="r"), verbosity, path)
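
A minimal usage sketch of the new helper (the path below is hypothetical): check_zarr walks a directory-backed store and raises ValueError on empty groups or on missing or misnamed chunk files.

    from anemoi.datasets.check import check_zarr

    # verbosity=1 logs the archive being checked; verbosity=2 also logs
    # every group member and chunk file visited.
    check_zarr("/data/example-dataset.zarr", verbosity=1)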

anemoi/datasets/commands/check.py
@@ -0,0 +1,101 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import logging
+import os
+from typing import Any
+
+import yaml
+
+from anemoi.datasets.create.check import DatasetName
+
+from . import Command
+
+LOG = logging.getLogger(__name__)
+
+
+class Check(Command):
+    """Check if a dataset name follow naming conventions."""
+
+    timestamp = True
+
+    def add_arguments(self, command_parser: Any) -> None:
+        """Add command line arguments to the parser.
+
+        Parameters
+        ----------
+        command_parser : Any
+            The command line argument parser.
+        """
+
+        exclusive_group = command_parser.add_mutually_exclusive_group(required=True)
+
+        exclusive_group.add_argument(
+            "--name",
+            help="Check a dataset name.",
+        )
+
+        exclusive_group.add_argument(
+            "--recipe",
+            help="Specify the recipe file to check.",
+        )
+
+        exclusive_group.add_argument(
+            "--zarr",
+            help="Specify the Zarr archive to check.",
+        )
+
+        exclusive_group.add_argument(
+            "--metadata",
+            help="Specify the metadata file to check.",
+        )
+
+    def run(self, args: Any) -> None:
+
+        if args.recipe:
+            self._check_recipe(args.recipe)
+
+        if args.metadata:
+            self._check_metadata(args.metadata)
+
+        if args.name:
+            self._check_name(args.name)
+
+        if args.zarr:
+            self._check_zarr(args.zarr)
+
+    def _check_metadata(self, metadata: str) -> None:
+        pass
+
+    def _check_recipe(self, recipe: str) -> None:
+
+        recipe_filename = os.path.basename(recipe)
+        recipe_name = os.path.splitext(recipe_filename)[0]
+        in_recipe_name = yaml.safe_load(open(recipe, "r", encoding="utf-8"))["name"]
+        if recipe_name != in_recipe_name:
+            print(f"Recipe name {recipe_name} does not match the name in the recipe file {in_recipe_name}")
+
+        name = in_recipe_name
+        DatasetName(name=name).raise_if_not_valid()
+
+    def _check_name(self, name: str) -> None:
+
+        DatasetName(name=name).raise_if_not_valid()
+
+    def _check_zarr(self, zarr: str) -> None:
+
+        from anemoi.datasets.check import check_zarr
+
+        check_zarr(zarr)
+
+        # ds = xr.open_dataset(zarr)
+        # print(ds)
+
+
+command = Check
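
Given the package's existing entry point, the new subcommand would presumably be invoked as `anemoi-datasets check` with exactly one of `--name`, `--recipe`, `--zarr` or `--metadata`, since the options form a required mutually exclusive group (note that `--metadata` is currently a no-op).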

anemoi/datasets/commands/copy.py
@@ -20,6 +20,8 @@ import tqdm
 from anemoi.utils.remote import Transfer
 from anemoi.utils.remote import TransferMethodNotImplementedError
 
+from anemoi.datasets.check import check_zarr
+
 from . import Command
 
 LOG = logging.getLogger(__name__)
@@ -319,10 +321,30 @@ class ZarrCopier:
         """
         import zarr
 
+        if self.verbosity > 0:
+            LOG.info(f"Copying group {source} to {target}")
+
         for k, v in source.attrs.items():
+            if self.verbosity > 1:
+                import textwrap
+
+                LOG.info(f"Copying attribute {k} = {textwrap.shorten(str(v), 40)}")
             target.attrs[k] = v
 
-        for name in sorted(source.keys()):
+        source_keys = list(source.keys())
+
+        if not source_keys:
+            raise ValueError(f"Source group {source} is empty.")
+
+        if self.verbosity > 1:
+            LOG.info(f"Keys {source_keys}")
+
+        for name in sorted(source_keys):
+            if name.startswith("."):
+                if self.verbosity > 1:
+                    LOG.info(f"Skipping {name}")
+                continue
+
             if isinstance(source[name], zarr.hierarchy.Group):
                 group = target[name] if name in target else target.create_group(name)
                 self.copy_group(
@@ -362,6 +384,11 @@ class ZarrCopier:
         _copy = target["_copy"]
         _copy_np = _copy[:]
 
+        if self.verbosity > 1:
+            import numpy as np
+
+            LOG.info(f"copy {np.sum(_copy_np)} of {len(_copy_np)}")
+
         self.copy_group(source, target, _copy_np, verbosity)
         del target["_copy"]
 
@@ -417,12 +444,25 @@ class ZarrCopier:
             LOG.error("Target already exists, use either --overwrite or --resume.")
             sys.exit(1)
 
+        if self.verbosity > 0:
+            LOG.info(f"Open target: {self.target}")
+
         target = open_target()
 
         assert target is not None, target
 
+        if self.verbosity > 0:
+            LOG.info(f"Open source: {self.source}")
+
         source = zarr.open(self._store(self.source), mode="r")
+        # zarr.consolidate_metadata(source)
+
         self.copy(source, target, self.verbosity)
+        if os.path.exists(self.target) and os.path.isdir(self.target):
+            LOG.info(f"Checking target: {self.target}")
+            check_zarr(self.target, self.verbosity)
+        else:
+            LOG.info(f"Target {self.target} is not a local directory, skipping check.")
 
 
 class CopyMixin:
@@ -488,8 +528,8 @@ class CopyMixin:
         if args.source.startswith("s3://") and not args.source.endswith("/"):
             args.source = args.source + "/"
         copier = Transfer(
-            args.source,
-            args.target,
+            source=args.source,
+            target=args.target,
             overwrite=args.overwrite,
             resume=args.resume,
             verbosity=args.verbosity,

anemoi/datasets/commands/create.py
@@ -180,10 +180,9 @@ class Create(Command):
             executor.submit(task, "init-additions", options).result()
 
         with ExecutorClass(max_workers=parallel) as executor:
-            opt = options.copy()
-            opt["parts"] = f"{n+1}/{total}"
-            futures.append(executor.submit(task, "load", opt))
             for n in range(total):
+                opt = options.copy()
+                opt["parts"] = f"{n+1}/{total}"
                 futures.append(executor.submit(task, "load-additions", opt))
 
             for future in tqdm.tqdm(

anemoi/datasets/commands/grib-index.py
@@ -29,8 +29,6 @@ class GribIndexCmd(Command):
             The command parser to which arguments are added.
         """
 
-        from anemoi.datasets.create.sources.grib_index import KEYS
-
         command_parser.add_argument(
             "--index",
             help="Path to the index file to create or update",
@@ -52,7 +50,6 @@ class GribIndexCmd(Command):
         command_parser.add_argument(
             "--keys",
             help="GRIB keys to add to the index, separated by commas. If the list starts with a +, the keys are added to default list.",
-            default=",".join(KEYS),
         )
 
         command_parser.add_argument(

anemoi/datasets/commands/inspect.py
@@ -401,7 +401,7 @@ class Version:
             return
 
         if self.build_flags is None:
-            print("🪫 Dataset not initialized")
+            print("🪫 Dataset not initialised")
            return
 
        build_flags = self.build_flags
@@ -426,7 +426,7 @@ class Version:
         )
         start = self.initialised
         if self.initialised:
-            print(f"🕰️ Dataset initialized {when(start)}.")
+            print(f"🕰️ Dataset initialised {when(start)}.")
         if built and latest:
             speed = (latest - start) / built
             eta = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) + speed * (total - built)

anemoi/datasets/commands/scan.py
@@ -23,6 +23,16 @@ KEYS = ("class", "type", "stream", "expver", "levtype", "domain")
 
 
 class Scan(Command):
+    """Command to scan files and generate a configuration file.
+
+    Attributes
+    ----------
+    internal : bool
+        Indicates whether the command is internal.
+    timestamp : bool
+        Indicates whether to include a timestamp.
+    """
+
     internal = True
     timestamp = True
 
@@ -32,8 +42,9 @@ class Scan(Command):
         Parameters
         ----------
         command_parser : Any
-            The command parser to which arguments are added.
+            The command-line argument parser.
         """
+
         command_parser.add_argument(
             "--match",
             help="Give a glob pattern to match files (default: *.grib)",
@@ -51,22 +62,23 @@ class Scan(Command):
         Parameters
         ----------
         args : Any
-            The arguments passed to the command.
+            Parsed command-line arguments.
         """
 
         def match(path: str) -> bool:
-            """Check if a path matches the given pattern.
+            """Check if a file path matches the given glob pattern.
 
             Parameters
             ----------
             path : str
-                The path to check.
+                The file path to check.
 
             Returns
             -------
             bool
-                True if the path matches, False otherwise.
+                True if the path matches the pattern, False otherwise.
             """
+
             return fnmatch.fnmatch(path, args.match)
 
         paths = []

anemoi/datasets/create/__init__.py
@@ -938,13 +938,23 @@ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
         check_shape(cube, dates, dates_in_data)
 
         def check_dates_in_data(dates_in_data, requested_dates):
-            requested_dates = [np.datetime64(_) for _ in requested_dates]
-            dates_in_data = [np.datetime64(_) for _ in dates_in_data]
-            assert dates_in_data == requested_dates, (
-                "Dates in data are not the requested ones:",
-                dates_in_data,
-                requested_dates,
-            )
+            _requested_dates = [np.datetime64(_) for _ in requested_dates]
+            _dates_in_data = [np.datetime64(_) for _ in dates_in_data]
+            if _dates_in_data != _requested_dates:
+                LOG.error("Dates in data are not the requested ones:")
+
+                dates_in_data = set(dates_in_data)
+                requested_dates = set(requested_dates)
+
+                missing = sorted(requested_dates - dates_in_data)
+                extra = sorted(dates_in_data - requested_dates)
+
+                if missing:
+                    LOG.error(f"Missing dates: {[_.isoformat() for _ in missing]}")
+                if extra:
+                    LOG.error(f"Extra dates: {[_.isoformat() for _ in extra]}")
+
+                raise ValueError("Dates in data are not the requested ones")
 
         check_dates_in_data(dates_in_data, dates)
 
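A hedged illustration of the new diagnostics, with made-up dates: the set differences separate what was requested from what actually arrived.

    import datetime

    requested = {datetime.datetime(2020, 1, 1, h) for h in (0, 6, 12, 18)}
    in_data = {datetime.datetime(2020, 1, 1, h) for h in (0, 6, 12)}

    missing = sorted(requested - in_data)  # [2020-01-01 18:00] -> logged as "Missing dates"
    extra = sorted(in_data - requested)    # [] -> nothing extra to report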
@@ -1075,6 +1085,7 @@ class Cleanup(Actor, HasRegistryMixin, HasStatisticTempMixin):
 
     def run(self) -> None:
         """Run the cleanup."""
+
         self.tmp_statistics.delete()
         self.registry.clean()
         for actor in self.actors:
@@ -1215,7 +1226,7 @@ class _InitAdditions(Actor, HasRegistryMixin, AdditionsMixin):
         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=True)
         self.tmp_storage.delete()
         self.tmp_storage.create()
-        LOG.info(f"Dataset {self.tmp_storage_path} additions initialized.")
+        LOG.info(f"Dataset {self.tmp_storage_path} additions initialised.")
 
     def cleanup(self) -> None:
         """Clean up the temporary storage."""

anemoi/datasets/create/check.py
@@ -18,6 +18,7 @@ from typing import Optional
 from typing import Union
 
 import numpy as np
+from anemoi.utils.config import load_config
 from anemoi.utils.dates import frequency_to_string
 from numpy.typing import NDArray
 
@@ -25,7 +26,7 @@ LOG = logging.getLogger(__name__)
 
 
 class DatasetName:
-    """Class to validate and parse dataset names according to naming conventions."""
+    """Validate and parse dataset names according to naming conventions."""
 
     def __init__(
         self,
@@ -58,6 +59,14 @@ class DatasetName:
 
         self.messages = []
 
+        config = load_config().get("datasets", {})
+
+        if config.get("ignore_naming_conventions", False):
+            # setting the env variable ANEMOI_CONFIG_DATASETS_IGNORE_NAMING_CONVENTIONS=1
+            # will ignore the naming conventions
+            return
+
+        self.check_characters()
         self.check_parsed()
         self.check_resolution(resolution)
         self.check_frequency(frequency)
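
A sketch of the opt-out, assuming (as the comment in the diff above states) that anemoi.utils.config maps this environment variable into the "datasets" section returned by load_config():

    import os

    # Must be set before DatasetName is constructed.
    os.environ["ANEMOI_CONFIG_DATASETS_IGNORE_NAMING_CONVENTIONS"] = "1"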
@@ -157,6 +166,15 @@
         self._check_missing("resolution", resolution_str)
         self._check_mismatch("resolution", resolution_str)
 
+    def check_characters(self) -> None:
+        if not self.name.islower():
+            self.messages.append(f"the {self.name} should be in lower case.")
+        if "_" in self.name:
+            self.messages.append(f"the {self.name} should use '-' instead of '_'.")
+        for c in self.name:
+            if not c.isalnum() and c not in "-":
+                self.messages.append(f"the {self.name} should only contain alphanumeric characters and '-'.")
+
     def check_frequency(self, frequency: Optional[datetime.timedelta]) -> None:
         """Check if the frequency matches the expected format.
 
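For illustration, a hypothetical name run against the new character rules (raise_if_not_valid is the same entry point the check command uses; a name may still fail the other checks, e.g. on resolution or frequency):

    from anemoi.datasets.create.check import DatasetName

    # "era5-o96-1979-2022-6h" is lower case and uses '-' only, so
    # check_characters() records no messages; a name like "ERA5_o96"
    # would trip both the lower-case rule and the '-' vs '_' rule.
    DatasetName(name="era5-o96-1979-2022-6h").raise_if_not_valid()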

anemoi/datasets/create/input/action.py
@@ -7,6 +7,7 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+import json
 import logging
 from copy import deepcopy
 from typing import Any
@@ -225,6 +226,7 @@ def action_factory(config: Dict[str, Any], context: ActionContext, action_path:
     if not isinstance(config, dict):
         raise ValueError(f"Invalid input config {config}")
     if len(config) != 1:
+        print(json.dumps(config, indent=2, default=str))
         raise ValueError(f"Invalid input config. Expecting dict with only one key, got {list(config.keys())}")
 
     config = deepcopy(config)

anemoi/datasets/create/input/result.py
@@ -132,7 +132,8 @@ def _fields_metatata(variables: Tuple[str, ...], cube: Any) -> Dict[str, Any]:
 
         # GRIB1 precipitation accumulations are not correctly encoded
         if startStep == endStep and stepTypeForConversion == "accum":
-            startStep = 0
+            endStep = f.metadata("P1")
+            startStep = f.metadata("P2")
 
         if startStep != endStep:
             # https://codes.ecmwf.int/grib/format/grib2/ctables/4/10/
@@ -415,7 +416,10 @@ class Result:
         print()
         print("Number of unique values found for each coordinate:")
         for k, v in user_coords.items():
-            print(f" {k:20}:", len(v), shorten_list(v, max_length=10))
+            print(f" {k:20}:", len(v))
+            for n in sorted(v):
+                print(" ", n)
+
         print()
         user_shape: Tuple[int, ...] = tuple(len(v) for k, v in user_coords.items())
         print("Shape of the hypercube :", user_shape)