anemoi-datasets 0.5.25__py3-none-any.whl → 0.5.27__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- anemoi/datasets/__init__.py +1 -2
- anemoi/datasets/_version.py +16 -3
- anemoi/datasets/commands/check.py +1 -1
- anemoi/datasets/commands/copy.py +1 -2
- anemoi/datasets/commands/create.py +1 -1
- anemoi/datasets/commands/grib-index.py +1 -1
- anemoi/datasets/commands/inspect.py +27 -35
- anemoi/datasets/commands/validate.py +59 -0
- anemoi/datasets/compute/recentre.py +3 -6
- anemoi/datasets/create/__init__.py +22 -25
- anemoi/datasets/create/check.py +10 -12
- anemoi/datasets/create/chunks.py +1 -2
- anemoi/datasets/create/config.py +3 -6
- anemoi/datasets/create/filter.py +21 -24
- anemoi/datasets/create/input/__init__.py +1 -2
- anemoi/datasets/create/input/action.py +3 -5
- anemoi/datasets/create/input/concat.py +5 -8
- anemoi/datasets/create/input/context.py +3 -6
- anemoi/datasets/create/input/data_sources.py +5 -8
- anemoi/datasets/create/input/empty.py +1 -2
- anemoi/datasets/create/input/filter.py +2 -3
- anemoi/datasets/create/input/function.py +1 -2
- anemoi/datasets/create/input/join.py +4 -5
- anemoi/datasets/create/input/misc.py +4 -6
- anemoi/datasets/create/input/repeated_dates.py +13 -18
- anemoi/datasets/create/input/result.py +29 -33
- anemoi/datasets/create/input/step.py +6 -24
- anemoi/datasets/create/input/template.py +3 -4
- anemoi/datasets/create/input/trace.py +1 -1
- anemoi/datasets/create/patch.py +1 -2
- anemoi/datasets/create/persistent.py +3 -5
- anemoi/datasets/create/size.py +1 -3
- anemoi/datasets/create/sources/accumulations.py +47 -52
- anemoi/datasets/create/sources/accumulations2.py +4 -8
- anemoi/datasets/create/sources/constants.py +1 -3
- anemoi/datasets/create/sources/empty.py +1 -2
- anemoi/datasets/create/sources/fdb.py +133 -0
- anemoi/datasets/create/sources/forcings.py +1 -2
- anemoi/datasets/create/sources/grib.py +6 -10
- anemoi/datasets/create/sources/grib_index.py +13 -15
- anemoi/datasets/create/sources/hindcasts.py +2 -5
- anemoi/datasets/create/sources/legacy.py +1 -1
- anemoi/datasets/create/sources/mars.py +17 -21
- anemoi/datasets/create/sources/netcdf.py +1 -2
- anemoi/datasets/create/sources/opendap.py +1 -3
- anemoi/datasets/create/sources/patterns.py +4 -6
- anemoi/datasets/create/sources/planetary_computer.py +44 -0
- anemoi/datasets/create/sources/recentre.py +8 -11
- anemoi/datasets/create/sources/source.py +3 -6
- anemoi/datasets/create/sources/tendencies.py +2 -5
- anemoi/datasets/create/sources/xarray.py +4 -6
- anemoi/datasets/create/sources/xarray_support/__init__.py +15 -32
- anemoi/datasets/create/sources/xarray_support/coordinates.py +16 -12
- anemoi/datasets/create/sources/xarray_support/field.py +17 -16
- anemoi/datasets/create/sources/xarray_support/fieldlist.py +11 -15
- anemoi/datasets/create/sources/xarray_support/flavour.py +83 -45
- anemoi/datasets/create/sources/xarray_support/grid.py +15 -9
- anemoi/datasets/create/sources/xarray_support/metadata.py +19 -128
- anemoi/datasets/create/sources/xarray_support/patch.py +47 -6
- anemoi/datasets/create/sources/xarray_support/time.py +10 -13
- anemoi/datasets/create/sources/xarray_support/variable.py +27 -23
- anemoi/datasets/create/sources/xarray_zarr.py +1 -2
- anemoi/datasets/create/sources/zenodo.py +3 -5
- anemoi/datasets/create/statistics/__init__.py +3 -6
- anemoi/datasets/create/testing.py +2 -74
- anemoi/datasets/create/typing.py +1 -2
- anemoi/datasets/create/utils.py +1 -2
- anemoi/datasets/create/zarr.py +7 -2
- anemoi/datasets/data/__init__.py +15 -6
- anemoi/datasets/data/complement.py +52 -23
- anemoi/datasets/data/concat.py +5 -8
- anemoi/datasets/data/dataset.py +42 -47
- anemoi/datasets/data/debug.py +7 -9
- anemoi/datasets/data/ensemble.py +4 -6
- anemoi/datasets/data/fill_missing.py +7 -10
- anemoi/datasets/data/forwards.py +30 -28
- anemoi/datasets/data/grids.py +12 -16
- anemoi/datasets/data/indexing.py +9 -12
- anemoi/datasets/data/interpolate.py +7 -15
- anemoi/datasets/data/join.py +8 -12
- anemoi/datasets/data/masked.py +6 -11
- anemoi/datasets/data/merge.py +5 -9
- anemoi/datasets/data/misc.py +41 -45
- anemoi/datasets/data/missing.py +11 -16
- anemoi/datasets/data/observations/__init__.py +8 -14
- anemoi/datasets/data/padded.py +3 -5
- anemoi/datasets/data/records/backends/__init__.py +2 -2
- anemoi/datasets/data/rescale.py +5 -12
- anemoi/datasets/data/select.py +13 -16
- anemoi/datasets/data/statistics.py +4 -7
- anemoi/datasets/data/stores.py +23 -77
- anemoi/datasets/data/subset.py +8 -11
- anemoi/datasets/data/unchecked.py +7 -11
- anemoi/datasets/data/xy.py +25 -21
- anemoi/datasets/dates/__init__.py +13 -18
- anemoi/datasets/dates/groups.py +7 -10
- anemoi/datasets/grids.py +11 -12
- anemoi/datasets/testing.py +93 -7
- anemoi/datasets/validate.py +598 -0
- {anemoi_datasets-0.5.25.dist-info → anemoi_datasets-0.5.27.dist-info}/METADATA +5 -4
- anemoi_datasets-0.5.27.dist-info/RECORD +134 -0
- anemoi/datasets/create/filters/__init__.py +0 -33
- anemoi/datasets/create/filters/empty.py +0 -37
- anemoi/datasets/create/filters/legacy.py +0 -93
- anemoi/datasets/create/filters/noop.py +0 -37
- anemoi/datasets/create/filters/orog_to_z.py +0 -58
- anemoi/datasets/create/filters/pressure_level_relative_humidity_to_specific_humidity.py +0 -83
- anemoi/datasets/create/filters/pressure_level_specific_humidity_to_relative_humidity.py +0 -84
- anemoi/datasets/create/filters/rename.py +0 -205
- anemoi/datasets/create/filters/rotate_winds.py +0 -105
- anemoi/datasets/create/filters/single_level_dewpoint_to_relative_humidity.py +0 -78
- anemoi/datasets/create/filters/single_level_relative_humidity_to_dewpoint.py +0 -84
- anemoi/datasets/create/filters/single_level_relative_humidity_to_specific_humidity.py +0 -163
- anemoi/datasets/create/filters/single_level_specific_humidity_to_relative_humidity.py +0 -451
- anemoi/datasets/create/filters/speeddir_to_uv.py +0 -95
- anemoi/datasets/create/filters/sum.py +0 -68
- anemoi/datasets/create/filters/transform.py +0 -51
- anemoi/datasets/create/filters/unrotate_winds.py +0 -105
- anemoi/datasets/create/filters/uv_to_speeddir.py +0 -94
- anemoi/datasets/create/filters/wz_to_w.py +0 -98
- anemoi/datasets/utils/__init__.py +0 -8
- anemoi_datasets-0.5.25.dist-info/RECORD +0 -150
- {anemoi_datasets-0.5.25.dist-info → anemoi_datasets-0.5.27.dist-info}/WHEEL +0 -0
- {anemoi_datasets-0.5.25.dist-info → anemoi_datasets-0.5.27.dist-info}/entry_points.txt +0 -0
- {anemoi_datasets-0.5.25.dist-info → anemoi_datasets-0.5.27.dist-info}/licenses/LICENSE +0 -0
- {anemoi_datasets-0.5.25.dist-info → anemoi_datasets-0.5.27.dist-info}/top_level.txt +0 -0
anemoi/datasets/create/testing.py
CHANGED

@@ -1,76 +1,4 @@
-# (C) Copyright 2025- Anemoi contributors.
-#
-# This software is licensed under the terms of the Apache Licence Version 2.0
-# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
-#
-# In applying this licence, ECMWF does not waive the privileges and immunities
-# granted to it by virtue of its status as an intergovernmental organisation
-# nor does it submit to any jurisdiction.
-
-import tempfile
-from typing import Any
-from typing import Dict
-from typing import List
-from typing import Optional
-from typing import Union
-
-import yaml
-
-from anemoi.datasets.create import creator_factory
-
-
 class TestingContext:
-
-
-
-def create_dataset(
-    *,
-    config: Union[str, Dict[str, Any]],
-    output: Optional[str],
-    delta: Optional[List[str]] = None,
-    is_test: bool = False,
-) -> str:
-    """Create a dataset based on the provided configuration.
-
-    Parameters
-    ----------
-    config : Union[str, Dict[str, Any]]
-        The configuration for the dataset. Can be a path to a YAML file or a dictionary.
-    output : Optional[str]
-        The output path for the dataset. If None, a temporary directory will be created.
-    delta : Optional[List[str]], optional
-        List of delta for secondary statistics, by default None.
-    is_test : bool, optional
-        Flag indicating if the dataset creation is for testing purposes, by default False.
+    """A context for testing plugins."""
 
-    Returns
-    -------
-    str
-        The path to the created dataset.
-    """
-    if isinstance(config, dict):
-        temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml")
-        yaml.dump(config, temp_file)
-        config = temp_file.name
-
-    if output is None:
-        output = tempfile.mkdtemp(suffix=".zarr")
-
-    creator_factory("init", config=config, path=output, overwrite=True, test=is_test).run()
-    creator_factory("load", path=output).run()
-    creator_factory("finalise", path=output).run()
-    creator_factory("patch", path=output).run()
-
-    if delta is not None:
-        creator_factory("init_additions", path=output, delta=delta).run()
-        creator_factory("run_additions", path=output, delta=delta).run()
-        creator_factory("finalise_additions", path=output, delta=delta).run()
-
-    creator_factory("cleanup", path=output).run()
-
-    if delta is not None:
-        creator_factory("cleanup", path=output, delta=delta).run()
-
-    creator_factory("verify", path=output).run()
-
-    return output
+    pass
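The removed create_dataset helper is a useful record of the creation pipeline ordering: init, load, finalise, patch, optional statistics additions, cleanup, verify. A minimal sketch of driving the same pipeline directly through creator_factory, assuming the step names are unchanged in 0.5.27 (the recipe and output paths below are hypothetical):

    from anemoi.datasets.create import creator_factory

    path = "dataset.zarr"  # hypothetical output store
    # Each factory call returns a creator object whose run() executes one step.
    creator_factory("init", config="recipe.yaml", path=path, overwrite=True).run()
    creator_factory("load", path=path).run()
    creator_factory("finalise", path=path).run()
    creator_factory("patch", path=path).run()
    creator_factory("cleanup", path=path).run()
    creator_factory("verify", path=path).run()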
anemoi/datasets/create/typing.py
CHANGED

anemoi/datasets/create/utils.py
CHANGED

@@ -13,7 +13,6 @@ import os
 import warnings
 from contextlib import contextmanager
 from typing import Any
-from typing import Union
 
 import numpy as np
 from earthkit.data import settings

@@ -97,7 +96,7 @@ def to_datetime(*args: Any, **kwargs: Any) -> datetime.datetime:
     return to_datetime_(*args, **kwargs)
 
 
-def make_list_int(value: Union[str, list, tuple, int]) -> list[int]:
+def make_list_int(value: str | list | tuple | int) -> list[int]:
     """Convert a string, list, tuple, or integer to a list of integers.
 
     Parameters
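The make_list_int change is typical of this release: typing.Union, Optional, List, Set, Dict, and Tuple annotations are replaced with PEP 604 unions and PEP 585 builtin generics across the codebase. A before/after sketch (the old spelling here is inferred from the removed Union import, so treat it as illustrative):

    from typing import Union

    # 0.5.25 style: typing-module generics
    def make_list_int_old(value: Union[str, list, tuple, int]) -> list[int]: ...

    # 0.5.27 style: no typing import needed (Python >= 3.10)
    def make_list_int_new(value: str | list | tuple | int) -> list[int]: ...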
anemoi/datasets/create/zarr.py
CHANGED

@@ -11,7 +11,6 @@ import datetime
 import logging
 import shutil
 from typing import Any
-from typing import Optional
 
 import numpy as np
 import zarr

@@ -120,7 +119,7 @@
     flags = None
     z = None
 
-    def __init__(self, path: str, synchronizer_path: Optional[str] = None, use_threads: bool = False):
+    def __init__(self, path: str, synchronizer_path: str | None = None, use_threads: bool = False):
         """Initialize the ZarrBuiltRegistry.
 
         Parameters

@@ -154,6 +153,12 @@
         except FileNotFoundError:
             pass
 
+        _build = self.zarr_path + "/_build"
+        try:
+            shutil.rmtree(_build)
+        except FileNotFoundError:
+            pass
+
     def _open_write(self) -> zarr.Group:
         """Open the Zarr store in write mode."""
         import zarr
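The added block extends ZarrBuiltRegistry cleanup: besides the synchronizer path, it now also removes the registry's _build directory inside the Zarr store, silently skipping it when it was never created. The same idiom as a standalone sketch (the store path is hypothetical):

    import shutil

    def remove_if_present(path: str) -> None:
        # Delete a directory tree; a missing directory is not an error.
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass

    remove_if_present("dataset.zarr/_build")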
anemoi/datasets/data/__init__.py
CHANGED

@@ -8,9 +8,9 @@
 # nor does it submit to any jurisdiction.
 
 import logging
+import os
 from typing import TYPE_CHECKING
 from typing import Any
-from typing import Set
 
 # from .dataset import FullIndex
 # from .dataset import Shape

@@ -82,6 +82,9 @@ def open_dataset(*args: Any, **kwargs: Any) -> "Dataset":
     Dataset
         The opened dataset.
     """
+
+    trace = int(os.environ.get("ANEMOI_DATASETS_TRACE", 0))
+
     # That will get rid of OmegaConf objects
 
     args, kwargs = _convert(args), _convert(kwargs)

@@ -90,22 +93,28 @@
     ds = ds.mutate()
     ds.arguments = {"args": args, "kwargs": kwargs}
     ds._check()
+
+    if trace:
+        from anemoi.datasets.testing import Trace
+
+        ds = Trace(ds)
+
     return ds
 
 
-def save_dataset(
+def save_dataset(dataset: "Dataset", zarr_path: str, n_workers: int = 1) -> None:
     """Open a dataset and save it to disk.
 
     Parameters
     ----------
-
-
+    dataset : Dataset
+        anemoi-dataset opened from python to save to Zarr store
     zarr_path : str
         Path to store the obtained anemoi dataset to disk.
     n_workers : int
         Number of workers to use for parallel processing. If none, sequential processing will be performed.
     """
-    _save_dataset(
+    _save_dataset(dataset, zarr_path, n_workers)

@@ -124,6 +133,6 @@ def list_dataset_names(*args: Any, **kwargs: Any) -> list[str]:
         The list of dataset names.
     """
     ds = _open_dataset(*args, **kwargs)
-    names: Set[str] = set()
+    names: set[str] = set()
     ds.get_dataset_names(names)
     return sorted(names)
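open_dataset now reads the ANEMOI_DATASETS_TRACE environment variable at call time and, when it is set to a non-zero integer, wraps the returned dataset in the Trace helper from anemoi.datasets.testing (extended by +93 lines in this release, per the file list above). A minimal sketch (the store path is hypothetical):

    import os

    os.environ["ANEMOI_DATASETS_TRACE"] = "1"  # must be set before the call

    from anemoi.datasets import open_dataset

    ds = open_dataset("dataset.zarr")  # ds is now wrapped in Trace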
anemoi/datasets/data/complement.py
CHANGED

@@ -7,18 +7,13 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
-
 import datetime
 import logging
 from abc import abstractmethod
 from functools import cached_property
 from typing import Any
-from typing import Dict
-from typing import List
-from typing import Optional
-from typing import Set
-from typing import Tuple
 
+import numpy as np
 from numpy.typing import NDArray
 
 from ..grids import nearest_grid_points

@@ -85,29 +80,32 @@ class Complement(Combined):
         for v in self._source.variables:
             if v not in self._target.variables:
                 self._variables.append(v)
+        LOG.info(f"The following variables will be complemented: {self._variables}")
 
         if not self._variables:
             raise ValueError("Augment: no missing variables")
 
     @property
-    def variables(self) -> List[str]:
+    def variables(self) -> list[str]:
         """Returns the list of variables to be added to the target dataset."""
         return self._variables
 
     @property
-    def statistics(self) -> Dict[str, NDArray[Any]]:
-
-
-
-
-
+    def statistics(self) -> dict[str, NDArray[Any]]:
+        datasets = [self._source, self._target]
+        return {
+            k: [d.statistics[k][d.name_to_index[i]] for d in datasets for i in d.variables if i in self.variables]
+            for k in datasets[0].statistics
+        }
+
+    def statistics_tendencies(self, delta: datetime.timedelta | None = None) -> dict[str, NDArray[Any]]:
         index = [self._source.name_to_index[v] for v in self._variables]
         if delta is None:
             delta = self.frequency
         return {k: v[index] for k, v in self._source.statistics_tendencies(delta).items()}
 
     @property
-    def name_to_index(self) -> Dict[str, int]:
+    def name_to_index(self) -> dict[str, int]:
         """Returns a dictionary mapping variable names to their indices."""
         return {v: i for i, v in enumerate(self.variables)}

@@ -118,9 +116,13 @@ class Complement(Combined):
         return (shape[0], len(self._variables)) + shape[2:]
 
     @property
-    def variables_metadata(self) -> Dict[str, Any]:
+    def variables_metadata(self) -> dict[str, Any]:
         """Returns the metadata of the variables to be added to the target dataset."""
-
+        # Merge the two dicts first
+        all_meta = {**self._source.variables_metadata, **self._target.variables_metadata}
+
+        # Filter to keep only desired variables
+        return {k: v for k, v in all_meta.items() if k in self._variables}
 
     def check_same_variables(self, d1: Dataset, d2: Dataset) -> None:
         """Checks if the variables in two datasets are the same.

@@ -135,7 +137,7 @@ class Complement(Combined):
         pass
 
     @cached_property
-    def missing(self) -> Set[int]:
+    def missing(self) -> set[int]:
         """Returns the set of missing indices in the source and target datasets."""
         missing = self._source.missing.copy()
         missing = missing | self._target.missing

@@ -231,7 +233,7 @@ class ComplementNone(Complement):
 class ComplementNearest(Complement):
     """A class to complement a target dataset with variables from a source dataset using nearest neighbor interpolation."""
 
-    def __init__(self, target: Any, source: Any, max_distance: float = None) -> None:
+    def __init__(self, target: Any, source: Any, max_distance: float = None, k: int = 1) -> None:
         """Initializes the ComplementNearest class.
 
         Parameters

@@ -242,17 +244,25 @@ class ComplementNearest(Complement):
             The source dataset.
         max_distance : float, optional
             The maximum distance for nearest neighbor interpolation, default is None.
+        k : int, optional
+            The number of k closest neighbors to consider for interpolation
         """
         super().__init__(target, source)
 
-        self._nearest_grid_points = nearest_grid_points(
+        self.k = k
+        self._distances, self._nearest_grid_points = nearest_grid_points(
             self._source.latitudes,
             self._source.longitudes,
             self._target.latitudes,
             self._target.longitudes,
             max_distance=max_distance,
+            k=k,
         )
 
+        if k == 1:
+            self._distances = np.expand_dims(self._distances, axis=1)
+            self._nearest_grid_points = np.expand_dims(self._nearest_grid_points, axis=1)
+
     def check_compatibility(self, d1: Dataset, d2: Dataset) -> None:
         """Checks the compatibility of two datasets for nearest neighbor interpolation.
 

@@ -285,12 +295,24 @@ class ComplementNearest(Complement):
         source_data = self._source[index[0], source_index, index[2], ...]
         target_data = source_data[..., self._nearest_grid_points]
 
-
+        epsilon = 1e-8  # prevent division by zero
+        weights = 1.0 / (self._distances + epsilon)
+        weights = weights.astype(target_data.dtype)
+        weights /= weights.sum(axis=1, keepdims=True)  # normalize
+
+        # Reshape weights to broadcast correctly
+        # Add leading singleton dimensions so it matches target_data shape
+        while weights.ndim < target_data.ndim:
+            weights = np.expand_dims(weights, axis=0)
+
+        # Compute weighted average along the last dimension
+        final_point = np.sum(target_data * weights, axis=-1)
+        result = final_point[..., index[3]]
 
         return apply_index_to_slices_changes(result, changes)
 
 
-def complement_factory(args: Tuple, kwargs: dict) -> Dataset:
+def complement_factory(args: tuple, kwargs: dict) -> Dataset:
     """Factory function to create a Complement instance based on the provided arguments.
 
     Parameters

@@ -330,6 +352,13 @@ def complement_factory(args: Tuple, kwargs: dict) -> Dataset:
         "nearest": ComplementNearest,
     }[interpolation]
 
-
+    if interpolation == "nearest":
+        k = kwargs.pop("k", "1")
+        complement = Class(target=target, source=source, k=k)._subset(**kwargs)
+
+    else:
+        complement = Class(target=target, source=source)._subset(**kwargs)
+
+    joined = _open_dataset([target, complement])
 
-    return _open_dataset(
+    return _open_dataset(joined, reorder=sorted(joined.variables))
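ComplementNearest now interpolates with inverse-distance weights over the k nearest source grid points instead of copying a single nearest neighbour; with k=1 the distance and index arrays are expanded so the same weighting code path applies and the old behaviour is reproduced. A standalone numpy illustration of the weighting, with synthetic numbers:

    import numpy as np

    distances = np.array([[0.0, 1.0, 2.0, 4.0]])            # shape (n_points, k)
    neighbour_values = np.array([[10.0, 20.0, 30.0, 40.0]])

    epsilon = 1e-8                                          # guards zero distances
    weights = 1.0 / (distances + epsilon)
    weights /= weights.sum(axis=1, keepdims=True)           # weights sum to 1

    interpolated = np.sum(neighbour_values * weights, axis=-1)
    print(interpolated)  # ~[10.0]: the zero-distance neighbour dominates

Note that complement_factory pops k with a string default ("1"), so the integer comparison k == 1 in ComplementNearest only holds when the caller passes an integer explicitly. Opening a complemented dataset might look like the following sketch (keyword names taken from the factory above; the exact open_dataset spelling should be checked against the package documentation):

    from anemoi.datasets import open_dataset

    ds = open_dataset(
        complement="target.zarr",  # hypothetical dataset to be completed
        source="source.zarr",      # hypothetical dataset supplying the variables
        interpolation="nearest",
        k=4,                       # new in 0.5.27: average over the 4 nearest points
    )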
anemoi/datasets/data/concat.py
CHANGED

@@ -11,9 +11,6 @@
 import logging
 from functools import cached_property
 from typing import Any
-from typing import List
-from typing import Set
-from typing import Tuple
 
 import numpy as np
 from anemoi.utils.dates import frequency_to_timedelta

@@ -123,12 +120,12 @@ class ConcatMixin:
         return np.concatenate(result)
 
     @cached_property
-    def missing(self) -> Set[int]:
+    def missing(self) -> set[int]:
         """Returns the set of missing indices in the concatenated datasets."""
-        result: Set[int] = set()
+        result: set[int] = set()
         offset = 0
         for d in self.datasets:
-            result = result |
+            result = result | {m + offset for m in d.missing}
             offset += len(d)
         return result
 

@@ -195,7 +192,7 @@ class Concat(ConcatMixin, Combined):
         return Node(self, [d.tree() for d in self.datasets])
 
     @classmethod
-    def check_dataset_compatibility(cls, datasets: List[Any], fill_missing_gaps: bool = False) -> List[Any]:
+    def check_dataset_compatibility(cls, datasets: list[Any], fill_missing_gaps: bool = False) -> list[Any]:
         """Checks the compatibility of the datasets for concatenation and fills missing gaps if required.
 
         Parameters

@@ -259,7 +256,7 @@ class Concat(ConcatMixin, Combined):
         return {}
 
 
-def concat_factory(args: Tuple, kwargs: dict) -> Concat:
+def concat_factory(args: tuple[Any, ...], kwargs: dict) -> Concat:
     """Factory function to create a Concat object.
 
     Parameters
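ConcatMixin.missing shifts each dataset's missing-sample indices by the combined length of the datasets that precede it, so the indices remain valid in the concatenated index space. A synthetic sketch of that bookkeeping:

    # Two hypothetical datasets: 10 samples with gaps at 2 and 5,
    # then 8 samples with gaps at 0 and 7.
    datasets = [
        {"length": 10, "missing": {2, 5}},
        {"length": 8, "missing": {0, 7}},
    ]

    result: set[int] = set()
    offset = 0
    for d in datasets:
        result = result | {m + offset for m in d["missing"]}
        offset += d["length"]

    print(sorted(result))  # [2, 5, 10, 17]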