PyPI - anemoi-datasets - Versions diffs - 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl - Mend

anemoi-datasets 0.5.11py3-none-any.whl → 0.5.13py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

anemoi/datasets/data/complement.py ADDED Viewed

@@ -0,0 +1,164 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+import logging
+from functools import cached_property
+from ..grids import nearest_grid_points
+from .debug import Node
+from .forwards import Combined
+from .indexing import apply_index_to_slices_changes
+from .indexing import index_to_slices
+from .indexing import update_tuple
+from .misc import _auto_adjust
+from .misc import _open
+LOG = logging.getLogger(__name__)
+class Complement(Combined):
+    """Dataset exposing the variables of `source` that `target` lacks.
+    The variable list keeps the order of `source.variables`. The reported
+    shape takes its first axis and its trailing axes from `target`, with the
+    number of complement variables as second axis.
+    `_get_tuple` is not defined here; `ComplementNone` and
+    `ComplementNearest` provide it.
+    """
+    def __init__(self, target, source, what="variables", interpolation="nearest"):
+        """Record `target` and `source` and list the variables to add.
+        `what` and `interpolation` are accepted but not read by this
+        constructor.
+        Raises:
+            ValueError: if every variable of `source` is already in `target`.
+        """
+        super().__init__([target, source])
+        # We add the variables of dataset[1] to dataset[0],
+        # interpolated on the grid of dataset[0]
+        self.target = target
+        self.source = source
+        self._variables = []
+        # Keep the same order as the original dataset
+        for v in self.source.variables:
+            if v not in self.target.variables:
+                self._variables.append(v)
+        if not self._variables:
+            raise ValueError("Augment: no missing variables")
+    @property
+    def variables(self):
+        """Names of the complement variables, in `source` order."""
+        return self._variables
+    @property
+    def name_to_index(self):
+        """Map each complement variable name to its position in `variables`."""
+        return {v: i for i, v in enumerate(self.variables)}
+    @property
+    def shape(self):
+        """Shape of `target` with axis 1 replaced by the number of complement variables."""
+        shape = self.target.shape
+        return (shape[0], len(self._variables)) + shape[2:]
+    @property
+    def variables_metadata(self):
+        """Metadata of `source`, restricted to the complement variables."""
+        return {k: v for k, v in self.source.variables_metadata.items() if k in self._variables}
+    def check_same_variables(self, d1, d2):
+        # Deliberately a no-op: target and source differ in variables by
+        # construction. Presumably this overrides a check inherited from
+        # `Combined` (not visible here) -- TODO confirm.
+        pass
+    @cached_property
+    def missing(self):
+        """Union of the `missing` sets of `source` and `target`, computed once."""
+        missing = self.source.missing.copy()
+        missing = missing | self.target.missing
+        return set(missing)
+    def tree(self):
+        """Generates a hierarchical tree structure for the `Complement` instance
+        and its two datasets.
+        Returns:
+            Node: A `Node` object representing the `Complement` instance as the
+            root node, with `self.target` and `self.source` (in that order)
+            represented as child nodes.
+        """
+        return Node(self, [d.tree() for d in (self.target, self.source)])
+    def __getitem__(self, index):
+        """Return the data selected by `index`.
+        An int or slice is expanded into a 4-tuple whose three remaining
+        positions are full slices; the tuple is then handed to `_get_tuple`.
+        """
+        if isinstance(index, (int, slice)):
+            index = (index, slice(None), slice(None), slice(None))
+        return self._get_tuple(index)
+class ComplementNone(Complement):
+    """Complement that reads values from `source` without any regridding.
+    `complement_factory` selects this class for interpolation "none".
+    """
+    def __init__(self, target, source):
+        super().__init__(target, source)
+    def _get_tuple(self, index):
+        """Return `source[index]` for a tuple index normalised against `self.shape`."""
+        index, changes = index_to_slices(index, self.shape)
+        # NOTE(review): the variable component of `index` is passed to `source`
+        # unchanged, whereas `ComplementNearest` remaps it through
+        # `source.name_to_index` -- confirm this is intended.
+        result = self.source[index]
+        return apply_index_to_slices_changes(result, changes)
+class ComplementNearest(Complement):
+    """Complement that maps `source` values onto the grid of `target`.
+    For each target grid point the nearest source grid point is looked up once,
+    at construction time, with `nearest_grid_points`.
+    """
+    def __init__(self, target, source):
+        super().__init__(target, source)
+        # One source grid index per target grid point.
+        self._nearest_grid_points = nearest_grid_points(
+            self.source.latitudes,
+            self.source.longitudes,
+            self.target.latitudes,
+            self.target.longitudes,
+        )
+    def check_compatibility(self, d1, d2):
+        # Deliberately a no-op. Presumably this overrides a check inherited
+        # from `Combined` (not visible here) -- TODO confirm.
+        pass
+    def _get_tuple(self, index):
+        """Return the complement variables selected by `index` on the target grid."""
+        variable_index = 1
+        index, changes = index_to_slices(index, self.shape)
+        # Swap the variable selection for a full slice and keep the requested
+        # selection in `previous`.
+        index, previous = update_tuple(index, variable_index, slice(None))
+        # Translate the selected complement variable names into source indices.
+        source_index = [self.source.name_to_index[x] for x in self.variables[previous]]
+        # Read the full source grid; the grid selection is applied below.
+        source_data = self.source[index[0], source_index, index[2], ...]
+        # Pick, along the last axis, the nearest source point of every target point.
+        target_data = source_data[..., self._nearest_grid_points]
+        # Apply the requested grid selection on the target-sized last axis.
+        result = target_data[..., index[3]]
+        return apply_index_to_slices_changes(result, changes)
+def complement_factory(args, kwargs):
+    """Build the dataset for an open request that carries a `complement` key.
+    Pops `source`, `complement` (used as the target), `what` and
+    `interpolation` from `kwargs`, opens both datasets, wraps them in the
+    `Complement` subclass matching `interpolation`, joins the result with the
+    target and reorders the variables to the order of `source.variables`.
+    Args:
+        args: positional arguments; must be empty.
+        kwargs: keyword arguments; modified in place by the `pop` calls.
+    Raises:
+        NotImplementedError: if `what` is not "variables" or `interpolation`
+            is neither "none" nor "nearest".
+        KeyError: if `source` or `complement` is absent from `kwargs`.
+    """
+    from .select import Select
+    assert len(args) == 0, args
+    source = kwargs.pop("source")
+    target = kwargs.pop("complement")
+    what = kwargs.pop("what", "variables")
+    interpolation = kwargs.pop("interpolation", "none")
+    if what != "variables":
+        raise NotImplementedError(f"Complement what={what} not implemented")
+    if interpolation not in ("none", "nearest"):
+        raise NotImplementedError(f"Complement method={interpolation} not implemented")
+    source = _open(source)
+    target = _open(target)
+    # `select` is the same as `variables`
+    (source, target), kwargs = _auto_adjust([source, target], kwargs, exclude=["select"])
+    # The `None` entry cannot be reached: the validation above only lets
+    # "none" and "nearest" through.
+    Class = {
+        None: ComplementNone,
+        "none": ComplementNone,
+        "nearest": ComplementNearest,
+    }[interpolation]
+    complement = Class(target=target, source=source)._subset(**kwargs)
+    # Will join the datasets along the variables axis
+    reorder = source.variables
+    complemented = _open([target, complement])
+    ordered = (
+        Select(complemented, complemented._reorder_to_columns(reorder), {"reoder": reorder})._subset(**kwargs).mutate()
+    )
+    return ordered

anemoi/datasets/data/dataset.py CHANGED Viewed

@@ -15,6 +15,7 @@ import pprint
 import warnings
 from functools import cached_property
+import numpy as np
 from anemoi.utils.dates import frequency_to_seconds
 from anemoi.utils.dates import frequency_to_string
 from anemoi.utils.dates import frequency_to_timedelta
@@ -42,6 +43,9 @@ def _tidy(v):
     if isinstance(v, slice):
         return (v.start, v.stop, v.step)
+    if isinstance(v, np.integer):
+        return int(v)
     return v
@@ -164,6 +168,16 @@ class Dataset:
             bbox = kwargs.pop("area")
             return Cropping(self, bbox)._subset(**kwargs).mutate()
+        # Every alternative must be an explicit `in kwargs` test: a bare
+        # non-empty string in an `or` chain is always truthy and would send
+        # any remaining keyword argument through this branch.
+        if any(key in kwargs for key in ("number", "numbers", "member", "members")):
+            from .ensemble import Number
+            # Forward only the member keys that are present; `Number` converts
+            # each of them to a member index itself.
+            members = {}
+            for key in ["number", "numbers", "member", "members"]:
+                if key in kwargs:
+                    members[key] = kwargs.pop(key)
+            return Number(self, **members)._subset(**kwargs).mutate()
         if "set_missing_dates" in kwargs:
             from .missing import MissingDates
@@ -241,18 +255,25 @@ class Dataset:
         if not isinstance(vars, (list, tuple, set)):
             vars = [vars]
-        assert set(vars) <= set(self.name_to_index)
+        if not set(vars) <= set(self.name_to_index):
+            raise ValueError(f"drop: unknown variables: {set(vars) - set(self.name_to_index)}")
         return sorted([v for k, v in self.name_to_index.items() if k not in vars])
     def _reorder_to_columns(self, vars):
+        if isinstance(vars, str) and vars == "sort":
+            # Sorting the variables alphabetically.
+            # This is crucial for pre-training then transfer learning in combination with
+            # cutout and adjust = 'all'
+            indices = [self.name_to_index[k] for k, v in sorted(self.name_to_index.items(), key=lambda x: x[0])]
+            assert set(indices) == set(range(len(self.name_to_index)))
+            return indices
         if isinstance(vars, (list, tuple)):
             vars = {k: i for i, k in enumerate(vars)}
-        indices = []
-        for k, v in sorted(vars.items(), key=lambda x: x[1]):
-            indices.append(self.name_to_index[k])
+        indices = [self.name_to_index[k] for k, v in sorted(vars.items(), key=lambda x: x[1])]
         # Make sure we don't forget any variables
         assert set(indices) == set(range(len(self.name_to_index)))
@@ -464,7 +485,7 @@ class Dataset:
         sample_count = min(4, len(indices))
         count = len(indices)
-        p = slice(0, count, count // (sample_count - 1))
+        p = slice(0, count, count // max(1, sample_count - 1))
         samples = list(range(*p.indices(count)))
         samples.append(count - 1)  # Add last
@@ -497,3 +518,50 @@ class Dataset:
                 result.append(v)
         return result
+    def plot(self, date, variable, member=0, **kwargs):
+        """For debugging purposes, plot a field.
+        Parameters
+        ----------
+        date : int or datetime.datetime or numpy.datetime64 or str
+            The date to plot. An int is used directly as an index into the
+            dataset; any other value is converted and looked up in `self.dates`.
+        variable : int or str
+            The variable to plot. An int is used directly as the variable
+            index; any other value is looked up in `self.name_to_index`.
+        member : int, optional
+            The ensemble member to plot.
+        **kwargs:
+            Additional arguments to pass to matplotlib.pyplot.tricontourf
+        Returns
+        -------
+            matplotlib.pyplot.Axes
+        Raises
+        ------
+        ValueError
+            If the date is not found in `self.dates`, or the variable name is
+            not in `self.variables`.
+        """
+        from anemoi.utils.devtools import plot_values
+        from earthkit.data.utils.dates import to_datetime
+        if not isinstance(date, int):
+            # Convert to the dtype of `self.dates` so the equality test below
+            # compares like with like.
+            date = np.datetime64(to_datetime(date)).astype(self.dates[0].dtype)
+            index = np.where(self.dates == date)[0]
+            if len(index) == 0:
+                raise ValueError(
+                    f"Date {date} not found in the dataset {self.dates[0]} to {self.dates[-1]} by {self.frequency}"
+                )
+            # Use the first matching position.
+            date_index = index[0]
+        else:
+            date_index = date
+        if isinstance(variable, int):
+            variable_index = variable
+        else:
+            if variable not in self.variables:
+                raise ValueError(f"Unknown variable {variable} (available: {self.variables})")
+            variable_index = self.name_to_index[variable]
+        values = self[date_index, variable_index, member]
+        return plot_values(values, self.latitudes, self.longitudes, **kwargs)

anemoi/datasets/data/ensemble.py CHANGED Viewed

@@ -10,13 +10,68 @@
 import logging
+import numpy as np
 from .debug import Node
+from .forwards import Forwards
 from .forwards import GivenAxis
+from .indexing import apply_index_to_slices_changes
+from .indexing import index_to_slices
+from .indexing import update_tuple
 from .misc import _auto_adjust
 from .misc import _open
 LOG = logging.getLogger(__name__)
+OFFSETS = dict(number=1, numbers=1, member=0, members=0)
+class Number(Forwards):
+    """Restrict a dataset to a subset of its ensemble members (axis 2).
+    Members are stored zero-based in `self.members`; `tree` and
+    `metadata_specific` report them one-based.
+    """
+    def __init__(self, forward, **kwargs):
+        """Select members of `forward`.
+        Each keyword must be a key of `OFFSETS` (`number`, `numbers`, `member`,
+        `members`); its value is a single value or a list/tuple of values.
+        `number(s)` values are one-based and `member(s)` values zero-based;
+        all are converted to zero-based indices, de-duplicated and sorted.
+        Raises:
+            ValueError: if a resulting index is outside `forward.shape[2]`.
+            KeyError: if a keyword is not a key of `OFFSETS`.
+        """
+        super().__init__(forward)
+        self.members = []
+        for key, values in kwargs.items():
+            if not isinstance(values, (list, tuple)):
+                values = [values]
+            self.members.extend([int(v) - OFFSETS[key] for v in values])
+        self.members = sorted(set(self.members))
+        for n in self.members:
+            if not (0 <= n < forward.shape[2]):
+                raise ValueError(f"Member {n} is out of range. `number(s)` is one-based, `member(s)` is zero-based.")
+        # Boolean mask over the member axis of `forward`: True for kept members.
+        self.mask = np.array([n in self.members for n in range(forward.shape[2])], dtype=bool)
+        # Same shape as `forward`, with axis 2 set to the number of kept members.
+        self._shape, _ = update_tuple(forward.shape, 2, len(self.members))
+    @property
+    def shape(self):
+        """Shape of `forward` with axis 2 reduced to the selected members."""
+        return self._shape
+    def __getitem__(self, index):
+        """Return the data selected by `index`, keeping only the selected members."""
+        if isinstance(index, int):
+            # `forward[int]` is treated as a 3-axis array here, so the member
+            # mask applies to axis 1.
+            result = self.forward[index]
+            result = result[:, self.mask, :]
+            return result
+        if isinstance(index, slice):
+            result = self.forward[index]
+            result = result[:, :, self.mask, :]
+            return result
+        # NOTE(review): the member component of `index` is normalised against
+        # the reduced shape but applied to `forward` before the mask -- confirm
+        # the mask length still matches the member axis of `result`.
+        index, changes = index_to_slices(index, self.shape)
+        result = self.forward[index]
+        result = result[:, :, self.mask, :]
+        return apply_index_to_slices_changes(result, changes)
+    def tree(self):
+        """Return a `Node` for this instance, with the one-based member numbers attached."""
+        return Node(self, [self.forward.tree()], numbers=[n + 1 for n in self.members])
+    def metadata_specific(self):
+        """Return the one-based member numbers under the key `numbers`."""
+        return {
+            "numbers": [n + 1 for n in self.members],
+        }
 class Ensemble(GivenAxis):
     def tree(self):

anemoi/datasets/data/grids.py CHANGED Viewed

@@ -289,14 +289,15 @@ class Cutout(GridsBase):
         """
         index, changes = index_to_slices(index, self.shape)
         # Select data from each LAM
-        lam_data = [lam[index] for lam in self.lams]
+        lam_data = [lam[index[:3]] for lam in self.lams]
         # First apply spatial indexing on `self.globe` and then apply the mask
         globe_data_sliced = self.globe[index[:3]]
         globe_data = globe_data_sliced[..., self.global_mask]
-        # Concatenate LAM data with global data
-        result = np.concatenate(lam_data + [globe_data], axis=self.axis)
+        # Concatenate LAM data with global data, apply the grid slicing
+        result = np.concatenate(lam_data + [globe_data], axis=self.axis)[..., index[3]]
         return apply_index_to_slices_changes(result, changes)
     def collect_supporting_arrays(self, collected, *path):
@@ -324,7 +325,8 @@ class Cutout(GridsBase):
         """
         shapes = [np.sum(mask) for mask in self.masks]
         global_shape = np.sum(self.global_mask)
-        return tuple(self.lams[0].shape[:-1] + (sum(shapes) + global_shape,))
+        total_shape = sum(shapes) + global_shape
+        return tuple(self.lams[0].shape[:-1] + (int(total_shape),))
     def check_same_resolution(self, d1, d2):
         # Turned off because we are combining different resolutions

anemoi/datasets/data/join.py CHANGED Viewed

@@ -118,13 +118,19 @@ class Join(Combined):
     def variables_metadata(self):
         result = {}
         variables = [v for v in self.variables if not (v.startswith("(") and v.endswith(")"))]
         for d in self.datasets:
             md = d.variables_metadata
             for v in variables:
                 if v in md:
                     result[v] = md[v]
-        assert len(result) == len(variables), (result, variables)
+        if len(result) != len(variables):
+            LOG.error("Some variables are missing metadata.")
+            for v in variables:
+                if v not in result:
+                    LOG.error("Missing metadata for %r.", v)
         return result
     @cached_property

anemoi/datasets/data/merge.py CHANGED Viewed

@@ -134,6 +134,9 @@ class Merge(Combined):
     def tree(self):
         return Node(self, [d.tree() for d in self.datasets], allow_gaps_in_dates=self.allow_gaps_in_dates)
+    def metadata_specific(self):
+        """Return the `allow_gaps_in_dates` setting of this instance as a dict."""
+        return {"allow_gaps_in_dates": self.allow_gaps_in_dates}
     @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple):

anemoi/datasets/data/misc.py CHANGED Viewed

@@ -194,7 +194,7 @@ def _open(a):
     raise NotImplementedError(f"Unsupported argument: {type(a)}")
-def _auto_adjust(datasets, kwargs):
+def _auto_adjust(datasets, kwargs, exclude=None):
     if "adjust" not in kwargs:
         return datasets, kwargs
@@ -214,6 +214,9 @@ def _auto_adjust(datasets, kwargs):
     for a in adjust_list:
         adjust_set.update(ALIASES.get(a, [a]))
+    if exclude is not None:
+        adjust_set -= set(exclude)
     extra = set(adjust_set) - set(ALIASES["all"])
     if extra:
         raise ValueError(f"Invalid adjust keys: {extra}")
@@ -335,6 +338,12 @@ def _open_dataset(*args, **kwargs):
         assert not sets, sets
         return cutout_factory(args, kwargs).mutate()
+    if "complement" in kwargs:
+        from .complement import complement_factory
+        assert not sets, sets
+        return complement_factory(args, kwargs).mutate()
     for name in ("datasets", "dataset"):
         if name in kwargs:
             datasets = kwargs.pop(name)

anemoi/datasets/grids.py CHANGED Viewed

@@ -152,7 +152,7 @@ def cutout_mask(
     plot=None,
 ):
     """Return a mask for the points in [global_lats, global_lons] that are inside of [lats, lons]"""
-    from scipy.spatial import KDTree
+    from scipy.spatial import cKDTree
     # TODO: transform min_distance from lat/lon to xyz
@@ -195,13 +195,13 @@ def cutout_mask(
         min_distance = min_distance_km / 6371.0
     else:
         points = {"lam": lam_points, "global": global_points, None: global_points}[min_distance_km]
-        distances, _ = KDTree(points).query(points, k=2)
+        distances, _ = cKDTree(points).query(points, k=2)
         min_distance = np.min(distances[:, 1])
         LOG.info(f"cutout_mask using min_distance = {min_distance * 6371.0} km")
-    # Use a KDTree to find the nearest points
-    distances, indices = KDTree(lam_points).query(global_points, k=neighbours)
+    # Use a cKDTree to find the nearest points
+    distances, indices = cKDTree(lam_points).query(global_points, k=neighbours)
     # Centre of the Earth
     zero = np.array([0.0, 0.0, 0.0])
@@ -255,7 +255,7 @@ def thinning_mask(
     cropping_distance=2.0,
 ):
     """Return the list of points in [lats, lons] closest to [global_lats, global_lons]"""
-    from scipy.spatial import KDTree
+    from scipy.spatial import cKDTree
     assert global_lats.ndim == 1
     assert global_lons.ndim == 1
@@ -291,20 +291,20 @@ def thinning_mask(
     xyx = latlon_to_xyz(lats, lons)
     points = np.array(xyx).transpose()
-    # Use a KDTree to find the nearest points
-    _, indices = KDTree(points).query(global_points, k=1)
+    # Use a cKDTree to find the nearest points
+    _, indices = cKDTree(points).query(global_points, k=1)
     return np.array([i for i in indices])
 def outline(lats, lons, neighbours=5):
-    from scipy.spatial import KDTree
+    from scipy.spatial import cKDTree
     xyx = latlon_to_xyz(lats, lons)
     grid_points = np.array(xyx).transpose()
-    # Use a KDTree to find the nearest points
-    _, indices = KDTree(grid_points).query(grid_points, k=neighbours)
+    # Use a cKDTree to find the nearest points
+    _, indices = cKDTree(grid_points).query(grid_points, k=neighbours)
     # Centre of the Earth
     zero = np.array([0.0, 0.0, 0.0])
@@ -379,6 +379,19 @@ def serialise_mask(mask):
     return result
+def nearest_grid_points(source_latitudes, source_longitudes, target_latitudes, target_longitudes):
+    """Return, for each target point, the index of the nearest source point.
+    Both grids are converted with `latlon_to_xyz`, and the single nearest
+    neighbour (k=1) of every target point is looked up in a `cKDTree` built on
+    the source points, so distances are measured between the converted points.
+    Returns:
+        The index array produced by `cKDTree.query`, one entry per target point.
+    """
+    from scipy.spatial import cKDTree
+    source_xyz = latlon_to_xyz(source_latitudes, source_longitudes)
+    source_points = np.array(source_xyz).transpose()
+    target_xyz = latlon_to_xyz(target_latitudes, target_longitudes)
+    target_points = np.array(target_xyz).transpose()
+    # Distances are discarded; only the source indices are needed.
+    _, indices = cKDTree(source_points).query(target_points, k=1)
+    return indices
 if __name__ == "__main__":
     global_lats, global_lons = np.meshgrid(
         np.linspace(90, -90, 90),

{anemoi_datasets-0.5.11.dist-info → anemoi_datasets-0.5.13.dist-info}/METADATA RENAMED Viewed

@@ -1,9 +1,9 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: anemoi-datasets
-Version: 0.5.11
+Version: 0.5.13
 Summary: A package to hold various functions to support training of ML models on ECMWF data.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
-License: Apache License
+License:                                  Apache License
                                    Version 2.0, January 2004
                                 http://www.apache.org/licenses/
@@ -224,40 +224,39 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Requires-Python: >=3.9
 License-File: LICENSE
-Requires-Dist: anemoi-transform >=0.1
-Requires-Dist: anemoi-utils[provenance] >=0.4.9
+Requires-Dist: anemoi-transform>=0.1
+Requires-Dist: anemoi-utils[provenance]>=0.4.9
 Requires-Dist: cfunits
 Requires-Dist: numpy
 Requires-Dist: pyyaml
 Requires-Dist: semantic-version
 Requires-Dist: tqdm
-Requires-Dist: zarr <=2.17
+Requires-Dist: zarr<=2.17
 Provides-Extra: all
-Requires-Dist: anemoi-datasets[create,remote,xarray] ; extra == 'all'
+Requires-Dist: anemoi-datasets[create,remote,xarray]; extra == "all"
 Provides-Extra: create
-Requires-Dist: earthkit-data[mars] >=0.10.7 ; extra == 'create'
-Requires-Dist: earthkit-geo >=0.2 ; extra == 'create'
-Requires-Dist: earthkit-meteo ; extra == 'create'
-Requires-Dist: eccodes >=2.38.1 ; extra == 'create'
-Requires-Dist: entrypoints ; extra == 'create'
-Requires-Dist: pyproj ; extra == 'create'
+Requires-Dist: earthkit-data[mars]>=0.10.7; extra == "create"
+Requires-Dist: earthkit-geo>=0.2; extra == "create"
+Requires-Dist: earthkit-meteo; extra == "create"
+Requires-Dist: eccodes>=2.38.1; extra == "create"
+Requires-Dist: entrypoints; extra == "create"
+Requires-Dist: pyproj; extra == "create"
 Provides-Extra: dev
-Requires-Dist: anemoi-datasets[all,docs,tests] ; extra == 'dev'
+Requires-Dist: anemoi-datasets[all,docs,tests]; extra == "dev"
 Provides-Extra: docs
-Requires-Dist: nbsphinx ; extra == 'docs'
-Requires-Dist: pandoc ; extra == 'docs'
-Requires-Dist: sphinx ; extra == 'docs'
-Requires-Dist: sphinx-argparse ; extra == 'docs'
-Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
+Requires-Dist: nbsphinx; extra == "docs"
+Requires-Dist: pandoc; extra == "docs"
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: sphinx-argparse; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
 Provides-Extra: remote
-Requires-Dist: boto3 ; extra == 'remote'
-Requires-Dist: requests ; extra == 'remote'
+Requires-Dist: boto3; extra == "remote"
+Requires-Dist: requests; extra == "remote"
 Provides-Extra: tests
-Requires-Dist: pytest ; extra == 'tests'
+Requires-Dist: pytest; extra == "tests"
 Provides-Extra: xarray
-Requires-Dist: gcsfs ; extra == 'xarray'
-Requires-Dist: kerchunk ; extra == 'xarray'
-Requires-Dist: pandas ; extra == 'xarray'
-Requires-Dist: planetary-computer ; extra == 'xarray'
-Requires-Dist: pystac-client ; extra == 'xarray'
+Requires-Dist: gcsfs; extra == "xarray"
+Requires-Dist: kerchunk; extra == "xarray"
+Requires-Dist: pandas; extra == "xarray"
+Requires-Dist: planetary-computer; extra == "xarray"
+Requires-Dist: pystac-client; extra == "xarray"

anemoi-datasets 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl

anemoi-datasets 0.5.11py3-none-any.whl → 0.5.13py3-none-any.whl