PyPI - anemoi-datasets - Versions diffs - 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl - Mend

anemoi-datasets 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

anemoi/datasets/create/functions/sources/xarray/variable.py CHANGED Viewed

@@ -37,7 +37,7 @@ class Variable:
         self.coordinates = coordinates
         self._metadata = metadata.copy()
-        self._metadata.update({"variable": variable.name})
+        self._metadata.update({"variable": variable.name, "param": variable.name})
         self.time = time
@@ -45,6 +45,9 @@ class Variable:
         self.names = {c.variable.name: c for c in coordinates if c.is_dim and not c.scalar and not c.is_grid}
         self.by_name = {c.variable.name: c for c in coordinates}
+        # We need that alias for the time dimension
+        self._aliases = dict(valid_datetime="time")
         self.length = math.prod(self.shape)
     @property
@@ -96,15 +99,28 @@ class Variable:
         k, v = kwargs.popitem()
+        user_provided_k = k
+        if k == "valid_datetime":
+            # Ask the Time object to select the valid datetime
+            k = self.time.select_valid_datetime(self)
+            if k is None:
+                return None
         c = self.by_name.get(k)
+        # assert c is not None, f"Could not find coordinate {k} in {self.variable.name} {self.coordinates} {list(self.by_name)}"
         if c is None:
             missing[k] = v
             return self.sel(missing, **kwargs)
         i = c.index(v)
         if i is None:
-            LOG.warning(f"Could not find {k}={v} in {c}")
+            if k != user_provided_k:
+                LOG.warning(f"Could not find {user_provided_k}={v} in {c} (alias of {k})")
+            else:
+                LOG.warning(f"Could not find {k}={v} in {c}")
             return None
         coordinates = [x.reduced(i) if c is x else x for x in self.coordinates]

anemoi/datasets/create/input/repeated_dates.py CHANGED Viewed

@@ -72,6 +72,11 @@ class DateMapperClosest(DateMapper):
                 end += self.frequency
         to_try = sorted(to_try - self.tried)
+        info = {k: "no-data" for k in to_try}
+        if not to_try:
+            LOG.warning(f"No new dates to try for {group_of_dates} in {self.source}")
+            # return []
         if to_try:
             result = self.source.select(
@@ -82,19 +87,32 @@ class DateMapperClosest(DateMapper):
                 )
             )
+            cnt = 0
             for f in result.datasource:
+                cnt += 1
                 # We could keep the fields in a dictionary, but we don't want to keep the fields in memory
                 date = as_datetime(f.metadata("valid_datetime"))
                 if self.skip_all_nans:
                     if np.isnan(f.to_numpy()).all():
                         LOG.warning(f"Skipping {date} because all values are NaN")
+                        info[date] = "all-nans"
                         continue
+                info[date] = "ok"
                 self.found.add(date)
+            if cnt == 0:
+                raise ValueError(f"No data found for {group_of_dates} in {self.source}")
             self.tried.update(to_try)
+        if not self.found:
+            for k, v in info.items():
+                LOG.warning(f"{k}: {v}")
+            raise ValueError(f"No matching data found for {asked_dates} in {self.source}")
         new_dates = defaultdict(list)
         for date in asked_dates:

anemoi/datasets/create/input/result.py CHANGED Viewed

@@ -459,7 +459,7 @@ class Result:
         if self.group_of_dates is not None:
             dates = f" {len(self.group_of_dates)} dates"
             dates += " ("
-            dates += "/".join(d.strftime("%Y-%m-%d:%H") for d in self.group_of_dates)
+            dates += "/".join(d.strftime("%Y-%m-%dT%H:%M") for d in self.group_of_dates)
             if len(dates) > 100:
                 dates = dates[:100] + "..."
             dates += ")"

anemoi/datasets/create/statistics/__init__.py CHANGED Viewed

@@ -18,6 +18,7 @@ import shutil
 import socket
 import numpy as np
+import tqdm
 from anemoi.utils.provenance import gather_provenance_info
 from ..check import check_data_values
@@ -98,7 +99,7 @@ def fix_variance(x, name, count, sums, squares):
     variances = squares / count - mean * mean
     assert variances.shape == squares.shape == mean.shape
-    if all(variances >= 0):
+    if np.all(variances >= 0):
         LOG.warning(f"All individual variances for {name} are positive, setting variance to 0.")
         return 0
@@ -108,7 +109,7 @@ def fix_variance(x, name, count, sums, squares):
     #     return 0
     LOG.warning(f"ERROR at least one individual variance is negative ({np.nanmin(variances)}).")
-    return x
+    return 0
 def check_variance(x, variables_names, minimum, maximum, mean, count, sums, squares):
@@ -134,7 +135,7 @@ def check_variance(x, variables_names, minimum, maximum, mean, count, sums, squa
 def compute_statistics(array, check_variables_names=None, allow_nans=False):
     """Compute statistics for a given array, provides minimum, maximum, sum, squares, count and has_nans as a dictionary."""
+    LOG.info(f"Computing statistics for {array.shape} array")
     nvars = array.shape[1]
     LOG.debug(f"Stats {nvars}, {array.shape}, {check_variables_names}")
@@ -149,7 +150,7 @@ def compute_statistics(array, check_variables_names=None, allow_nans=False):
     maximum = np.zeros(stats_shape, dtype=np.float64)
     has_nans = np.zeros(stats_shape, dtype=np.bool_)
-    for i, chunk in enumerate(array):
+    for i, chunk in tqdm.tqdm(enumerate(array), delay=1, total=array.shape[0], desc="Computing statistics"):
         values = chunk.reshape((nvars, -1))
         for j, name in enumerate(check_variables_names):
@@ -166,6 +167,8 @@ def compute_statistics(array, check_variables_names=None, allow_nans=False):
         count[i] = np.sum(~np.isnan(values), axis=1)
         has_nans[i] = np.isnan(values).any()
+    LOG.info(f"Statistics computed for {nvars} variables.")
     return {
         "minimum": minimum,
         "maximum": maximum,

anemoi/datasets/create/utils.py CHANGED Viewed

@@ -54,6 +54,10 @@ def to_datetime(*args, **kwargs):
 def make_list_int(value):
+    # Convert a string like "1/2/3" or "1/to/3" or "1/to/10/by/2" to a list of integers.
+    # Moved to anemoi.utils.humanize
+    # replace with from anemoi.utils.humanize import make_list_int
+    # when anemoi-utils is released and pyproject.toml is updated
     if isinstance(value, str):
         if "/" not in value:
             return [value]

anemoi/datasets/data/complement.py ADDED Viewed

@@ -0,0 +1,164 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+import logging
+from functools import cached_property
+from ..grids import nearest_grid_points
+from .debug import Node
+from .forwards import Combined
+from .indexing import apply_index_to_slices_changes
+from .indexing import index_to_slices
+from .indexing import update_tuple
+from .misc import _auto_adjust
+from .misc import _open
+LOG = logging.getLogger(__name__)
+class Complement(Combined):
+    """Expose the variables of `source` that are absent from `target`.
+    The first two axes of the shape are taken from the target (axis 0) and from
+    the number of complemented variables (axis 1). Subclasses implement
+    `_get_tuple`, which decides how the source values are read.
+    """
+    def __init__(self, target, source, what="variables", interpolation="nearest"):
+        """Record both datasets and the list of variables to complement.
+        `what` and `interpolation` are accepted for interface compatibility but
+        are not used by this constructor.
+        Raises ValueError if `source` has no variable that `target` lacks.
+        """
+        super().__init__([target, source])
+        # We add the variables of dataset[1] (source) to dataset[0] (target),
+        # interpolated on the grid of dataset[0]
+        self.target = target
+        self.source = source
+        # Keep the same order as in the source dataset
+        self._variables = [v for v in self.source.variables if v not in self.target.variables]
+        if not self._variables:
+            raise ValueError("Complement: no missing variables")
+    @property
+    def variables(self):
+        """Names of the source variables that the target does not have."""
+        return self._variables
+    @property
+    def name_to_index(self):
+        """Mapping from each complemented variable name to its position in `variables`."""
+        return {v: i for i, v in enumerate(self.variables)}
+    @property
+    def shape(self):
+        """Shape of the target with axis 1 replaced by the number of complemented variables."""
+        shape = self.target.shape
+        return (shape[0], len(self._variables)) + shape[2:]
+    @property
+    def variables_metadata(self):
+        """Metadata of the source, restricted to the complemented variables."""
+        return {k: v for k, v in self.source.variables_metadata.items() if k in self._variables}
+    def check_same_variables(self, d1, d2):
+        # Deliberately does nothing: the constructor selects variables that the
+        # target does not have, so the two datasets are not expected to match.
+        pass
+    @cached_property
+    def missing(self):
+        """Union of the `missing` sets of the source and of the target."""
+        return set(self.source.missing) | set(self.target.missing)
+    def tree(self):
+        """Generates a hierarchical tree structure for the `Complement` instance.
+        Returns:
+            Node: A `Node` object representing the `Complement` instance as the root
+            node, with the target and the source datasets (in that order) as
+            child nodes.
+        """
+        return Node(self, [d.tree() for d in (self.target, self.source)])
+    def __getitem__(self, index):
+        """Return the data selected by `index`.
+        An int or a slice is expanded to a 4-tuple (the three remaining axes are
+        taken whole); any other index is handed to `_get_tuple` unchanged.
+        """
+        if isinstance(index, (int, slice)):
+            index = (index, slice(None), slice(None), slice(None))
+        return self._get_tuple(index)
+class ComplementNone(Complement):
+    """Complement that reads the source values as they are, without interpolation."""
+    def __init__(self, target, source):
+        """Build the complement of `target` with the variables of `source`."""
+        super().__init__(target, source)
+    def _get_tuple(self, index):
+        """Normalise `index` against `self.shape`, read the source and undo the normalisation."""
+        # NOTE(review): the variable part of `index` is passed to the source unchanged,
+        # whereas ComplementNearest translates it through `source.name_to_index` -
+        # confirm that this is intended.
+        slices, changes = index_to_slices(index, self.shape)
+        return apply_index_to_slices_changes(self.source[slices], changes)
+class ComplementNearest(Complement):
+    """Complement whose values come from the source grid point nearest to each target grid point."""
+    def __init__(self, target, source):
+        """Precompute, for every target grid point, the index of the nearest source grid point."""
+        super().__init__(target, source)
+        self._nearest_grid_points = nearest_grid_points(
+            source_latitudes=self.source.latitudes,
+            source_longitudes=self.source.longitudes,
+            target_latitudes=self.target.latitudes,
+            target_longitudes=self.target.longitudes,
+        )
+    def check_compatibility(self, d1, d2):
+        # Deliberately does nothing: no compatibility check is applied here.
+        pass
+    def _get_tuple(self, index):
+        """Read the requested variables from the source and remap them onto the target grid."""
+        variable_axis = 1
+        slices, changes = index_to_slices(index, self.shape)
+        # Take the variable axis whole for now; remember what was asked for
+        slices, requested = update_tuple(slices, variable_axis, slice(None))
+        # Translate the requested complemented variables into source positions
+        columns = [self.source.name_to_index[name] for name in self.variables[requested]]
+        from_source = self.source[slices[0], columns, slices[2], ...]
+        # Last axis: pick the nearest source point for each target point,
+        # then apply the grid part of the index on the target grid
+        on_target = from_source[..., self._nearest_grid_points]
+        return apply_index_to_slices_changes(on_target[..., slices[3]], changes)
+def complement_factory(args, kwargs):
+    """Open the dataset described by a `complement=...` request.
+    `kwargs` must contain `source` and `complement` (the target); `what` defaults to
+    "variables" and `interpolation` to "none". The remaining keys are passed to `_subset`.
+    The result has the target variables plus the complemented ones, ordered as in the source.
+    Raises NotImplementedError for an unsupported `what` or `interpolation`.
+    """
+    from .select import Select
+    assert len(args) == 0, args
+    source = kwargs.pop("source")
+    target = kwargs.pop("complement")
+    what = kwargs.pop("what", "variables")
+    interpolation = kwargs.pop("interpolation", "none")
+    if what != "variables":
+        raise NotImplementedError(f"Complement what={what} not implemented")
+    if interpolation not in ("none", "nearest"):
+        raise NotImplementedError(f"Complement method={interpolation} not implemented")
+    source = _open(source)
+    target = _open(target)
+    # `select` is the same as `variables`, so it is excluded from the adjustment
+    (source, target), kwargs = _auto_adjust([source, target], kwargs, exclude=["select"])
+    implementations = {
+        None: ComplementNone,
+        "none": ComplementNone,
+        "nearest": ComplementNearest,
+    }
+    missing_part = implementations[interpolation](target=target, source=source)._subset(**kwargs)
+    # Opening a list joins the datasets along the variables axis
+    column_order = source.variables
+    joined = _open([target, missing_part])
+    columns = joined._reorder_to_columns(column_order)
+    return Select(joined, columns, {"reoder": column_order})._subset(**kwargs).mutate()

anemoi/datasets/data/dataset.py CHANGED Viewed

@@ -168,6 +168,16 @@ class Dataset:
             bbox = kwargs.pop("area")
             return Cropping(self, bbox)._subset(**kwargs).mutate()
+        # Ensemble-member selection: `number(s)` and `member(s)` are all handled by `Number`.
+        # Every key has to be tested with `in kwargs`: a bare non-empty string is always
+        # truthy and would wrap the dataset in `Number` even when no member was requested.
+        member_keys = ("number", "numbers", "member", "members")
+        if any(key in kwargs for key in member_keys):
+            from .ensemble import Number
+            # Move the member-related options out of kwargs, keeping the user's key names
+            members = {key: kwargs.pop(key) for key in member_keys if key in kwargs}
+            return Number(self, **members)._subset(**kwargs).mutate()
         if "set_missing_dates" in kwargs:
             from .missing import MissingDates
@@ -251,13 +261,19 @@ class Dataset:
         return sorted([v for k, v in self.name_to_index.items() if k not in vars])
     def _reorder_to_columns(self, vars):
+        if isinstance(vars, str) and vars == "sort":
+            # Sorting the variables alphabetically.
+            # This is crucial for pre-training then transfer learning in combination with
+            # cutout and adjust = 'all'
+            indices = [self.name_to_index[k] for k, v in sorted(self.name_to_index.items(), key=lambda x: x[0])]
+            assert set(indices) == set(range(len(self.name_to_index)))
+            return indices
         if isinstance(vars, (list, tuple)):
             vars = {k: i for i, k in enumerate(vars)}
-        indices = []
-        for k, v in sorted(vars.items(), key=lambda x: x[1]):
-            indices.append(self.name_to_index[k])
+        indices = [self.name_to_index[k] for k, v in sorted(vars.items(), key=lambda x: x[1])]
         # Make sure we don't forget any variables
         assert set(indices) == set(range(len(self.name_to_index)))
@@ -469,7 +485,7 @@ class Dataset:
         sample_count = min(4, len(indices))
         count = len(indices)
-        p = slice(0, count, count // (sample_count - 1))
+        p = slice(0, count, count // max(1, sample_count - 1))
         samples = list(range(*p.indices(count)))
         samples.append(count - 1)  # Add last
@@ -502,3 +518,50 @@ class Dataset:
                 result.append(v)
         return result
+    def plot(self, date, variable, member=0, **kwargs):
+        """Plot a single field; intended for debugging.
+        Parameters
+        ----------
+        date : int or datetime.datetime or numpy.datetime64 or str
+            The date to plot. An int is used directly as a position along the
+            date axis; anything else is converted and looked up in `self.dates`.
+        variable : int or str
+            The variable to plot. An int is used directly as a position along
+            the variable axis; anything else is looked up by name.
+        member : int, optional
+            The ensemble member to plot.
+        **kwargs:
+            Additional arguments to pass to matplotlib.pyplot.tricontourf
+        Returns
+        -------
+            matplotlib.pyplot.Axes
+        Raises
+        ------
+        ValueError
+            If the date is not in the dataset or the variable name is unknown.
+        """
+        from anemoi.utils.devtools import plot_values
+        from earthkit.data.utils.dates import to_datetime
+        if isinstance(date, int):
+            date_index = date
+        else:
+            # Compare with the same datetime64 resolution as the dataset's dates
+            date = np.datetime64(to_datetime(date)).astype(self.dates[0].dtype)
+            matches = np.where(self.dates == date)[0]
+            if len(matches) == 0:
+                raise ValueError(
+                    f"Date {date} not found in the dataset {self.dates[0]} to {self.dates[-1]} by {self.frequency}"
+                )
+            date_index = matches[0]
+        if isinstance(variable, int):
+            variable_index = variable
+        elif variable in self.variables:
+            variable_index = self.name_to_index[variable]
+        else:
+            raise ValueError(f"Unknown variable {variable} (available: {self.variables})")
+        field = self[date_index, variable_index, member]
+        return plot_values(field, self.latitudes, self.longitudes, **kwargs)

anemoi/datasets/data/ensemble.py CHANGED Viewed

@@ -10,13 +10,68 @@
 import logging
+import numpy as np
 from .debug import Node
+from .forwards import Forwards
 from .forwards import GivenAxis
+from .indexing import apply_index_to_slices_changes
+from .indexing import index_to_slices
+from .indexing import update_tuple
 from .misc import _auto_adjust
 from .misc import _open
 LOG = logging.getLogger(__name__)
+OFFSETS = dict(number=1, numbers=1, member=0, members=0)
+class Number(Forwards):
+    """Restrict a dataset to a subset of its ensemble members (axis 2 of the shape).
+    Members are given as keyword arguments: `number`/`numbers` are one-based,
+    `member`/`members` are zero-based (see `OFFSETS`).
+    """
+    def __init__(self, forward, **kwargs):
+        """Build the member selection.
+        Parameters
+        ----------
+        forward :
+            The dataset to wrap.
+        **kwargs :
+            Any of `number`, `numbers`, `member`, `members`; each value is a single
+            item or a list/tuple of items convertible with `int()`.
+        Raises
+        ------
+        ValueError
+            If a requested member is outside `range(forward.shape[2])`.
+        """
+        super().__init__(forward)
+        selected = set()
+        for key, values in kwargs.items():
+            if not isinstance(values, (list, tuple)):
+                values = [values]
+            # Store everything as zero-based positions
+            selected.update(int(v) - OFFSETS[key] for v in values)
+        self.members = sorted(selected)
+        for n in self.members:
+            if not (0 <= n < forward.shape[2]):
+                raise ValueError(f"Member {n} is out of range. `number(s)` is one-based, `member(s)` is zero-based.")
+        # Boolean mask over the member axis of the wrapped dataset
+        self.mask = np.array([n in selected for n in range(forward.shape[2])], dtype=bool)
+        self._shape, _ = update_tuple(forward.shape, 2, len(self.members))
+    @property
+    def shape(self):
+        """Shape of the wrapped dataset with axis 2 reduced to the selected members."""
+        return self._shape
+    def __getitem__(self, index):
+        """Return the data for `index`, keeping only the selected members."""
+        if isinstance(index, int):
+            # Indexing with an int drops axis 0, so the members are on axis 1
+            result = self.forward[index]
+            return result[:, self.mask, :]
+        if isinstance(index, slice):
+            result = self.forward[index]
+            return result[:, :, self.mask, :]
+        index, changes = index_to_slices(index, self.shape)
+        # The member part of `index` refers to positions among the *selected* members.
+        # Read every member from the wrapped dataset, apply the mask, and only then
+        # apply the requested member index; otherwise the mask length would not match.
+        index, member_index = update_tuple(index, 2, slice(None))
+        result = self.forward[index]
+        result = result[:, :, self.mask, :]
+        result = result[:, :, member_index, :]
+        return apply_index_to_slices_changes(result, changes)
+    def tree(self):
+        """Return a `Node` for this selection; members are reported one-based as `numbers`."""
+        return Node(self, [self.forward.tree()], numbers=[n + 1 for n in self.members])
+    def metadata_specific(self):
+        """Return the metadata specific to this class: the selected members, one-based."""
+        return {
+            "numbers": [n + 1 for n in self.members],
+        }
 class Ensemble(GivenAxis):
     def tree(self):

anemoi/datasets/data/join.py CHANGED Viewed

@@ -118,6 +118,7 @@ class Join(Combined):
     def variables_metadata(self):
         result = {}
         variables = [v for v in self.variables if not (v.startswith("(") and v.endswith(")"))]
         for d in self.datasets:
             md = d.variables_metadata
             for v in variables:
@@ -130,8 +131,6 @@ class Join(Combined):
                 if v not in result:
                     LOG.error("Missing metadata for %r.", v)
-            raise ValueError("Some variables are missing metadata.")
         return result
     @cached_property

anemoi/datasets/data/merge.py CHANGED Viewed

@@ -134,6 +134,9 @@ class Merge(Combined):
     def tree(self):
         return Node(self, [d.tree() for d in self.datasets], allow_gaps_in_dates=self.allow_gaps_in_dates)
+    def metadata_specific(self):
+        """Return the metadata specific to this class: the `allow_gaps_in_dates` setting."""
+        return dict(allow_gaps_in_dates=self.allow_gaps_in_dates)
     @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple):

anemoi/datasets/data/misc.py CHANGED Viewed

@@ -103,6 +103,30 @@ def _as_date(d, dates, last):
     if isinstance(d, str):
+        def isfloat(s):
+            try:
+                float(s)
+                return True
+            except ValueError:
+                return False
+        if d.endswith("%") and isfloat(d[:-1]):
+            x = float(d[:-1])
+            if not 0 <= x <= 100:
+                raise ValueError(f"Invalid date: {d}")
+            i_float = x * len(dates) / 100
+            epsilon = 2 ** (-30)
+            if len(dates) > 1 / epsilon:
+                LOG.warning("Too many dates to use percentage, one date may be lost in rounding")
+            if last:
+                index = int(i_float + epsilon) - 1
+            else:
+                index = int(i_float - epsilon)
+            index = max(0, min(len(dates) - 1, index))
+            return dates[index]
         if "-" in d and ":" in d:
             date, time = d.replace(" ", "T").split("T")
             year, month, day = [int(_) for _ in date.split("-")]
@@ -194,7 +218,7 @@ def _open(a):
     raise NotImplementedError(f"Unsupported argument: {type(a)}")
-def _auto_adjust(datasets, kwargs):
+def _auto_adjust(datasets, kwargs, exclude=None):
     if "adjust" not in kwargs:
         return datasets, kwargs
@@ -214,6 +238,9 @@ def _auto_adjust(datasets, kwargs):
     for a in adjust_list:
         adjust_set.update(ALIASES.get(a, [a]))
+    if exclude is not None:
+        adjust_set -= set(exclude)
     extra = set(adjust_set) - set(ALIASES["all"])
     if extra:
         raise ValueError(f"Invalid adjust keys: {extra}")
@@ -335,6 +362,12 @@ def _open_dataset(*args, **kwargs):
         assert not sets, sets
         return cutout_factory(args, kwargs).mutate()
+    if "complement" in kwargs:
+        from .complement import complement_factory
+        assert not sets, sets
+        return complement_factory(args, kwargs).mutate()
     for name in ("datasets", "dataset"):
         if name in kwargs:
             datasets = kwargs.pop(name)

anemoi/datasets/grids.py CHANGED Viewed

@@ -62,6 +62,8 @@ def plot_mask(path, mask, lats, lons, global_lats, global_lons):
         plt.savefig(path + "-global-zoomed.png")
+# TODO: Use the one from anemoi.utils.grids instead
+# from anemoi.utils.grids import ...
 def xyz_to_latlon(x, y, z):
     return (
         np.rad2deg(np.arcsin(np.minimum(1.0, np.maximum(-1.0, z)))),
@@ -69,6 +71,8 @@ def xyz_to_latlon(x, y, z):
     )
+# TODO: Use the one from anemoi.utils.grids instead
+# from anemoi.utils.grids import ...
 def latlon_to_xyz(lat, lon, radius=1.0):
     # https://en.wikipedia.org/wiki/Geographic_coordinate_conversion#From_geodetic_to_ECEF_coordinates
     # We assume that the Earth is a sphere of radius 1 so N(phi) = 1
@@ -152,7 +156,7 @@ def cutout_mask(
     plot=None,
 ):
     """Return a mask for the points in [global_lats, global_lons] that are inside of [lats, lons]"""
-    from scipy.spatial import KDTree
+    from scipy.spatial import cKDTree
     # TODO: transform min_distance from lat/lon to xyz
@@ -195,13 +199,13 @@ def cutout_mask(
         min_distance = min_distance_km / 6371.0
     else:
         points = {"lam": lam_points, "global": global_points, None: global_points}[min_distance_km]
-        distances, _ = KDTree(points).query(points, k=2)
+        distances, _ = cKDTree(points).query(points, k=2)
         min_distance = np.min(distances[:, 1])
         LOG.info(f"cutout_mask using min_distance = {min_distance * 6371.0} km")
-    # Use a KDTree to find the nearest points
-    distances, indices = KDTree(lam_points).query(global_points, k=neighbours)
+    # Use a cKDTree to find the nearest points
+    distances, indices = cKDTree(lam_points).query(global_points, k=neighbours)
     # Centre of the Earth
     zero = np.array([0.0, 0.0, 0.0])
@@ -255,7 +259,7 @@ def thinning_mask(
     cropping_distance=2.0,
 ):
     """Return the list of points in [lats, lons] closest to [global_lats, global_lons]"""
-    from scipy.spatial import KDTree
+    from scipy.spatial import cKDTree
     assert global_lats.ndim == 1
     assert global_lons.ndim == 1
@@ -291,20 +295,20 @@ def thinning_mask(
     xyx = latlon_to_xyz(lats, lons)
     points = np.array(xyx).transpose()
-    # Use a KDTree to find the nearest points
-    _, indices = KDTree(points).query(global_points, k=1)
+    # Use a cKDTree to find the nearest points
+    _, indices = cKDTree(points).query(global_points, k=1)
     return np.array([i for i in indices])
 def outline(lats, lons, neighbours=5):
-    from scipy.spatial import KDTree
+    from scipy.spatial import cKDTree
     xyx = latlon_to_xyz(lats, lons)
     grid_points = np.array(xyx).transpose()
-    # Use a KDTree to find the nearest points
-    _, indices = KDTree(grid_points).query(grid_points, k=neighbours)
+    # Use a cKDTree to find the nearest points
+    _, indices = cKDTree(grid_points).query(grid_points, k=neighbours)
     # Centre of the Earth
     zero = np.array([0.0, 0.0, 0.0])
@@ -379,6 +383,21 @@ def serialise_mask(mask):
     return result
+def nearest_grid_points(source_latitudes, source_longitudes, target_latitudes, target_longitudes):
+    """Return, for each target point, the index of the nearest source point.
+    Both sets of coordinates are converted with `latlon_to_xyz` and the nearest
+    neighbour (k=1) is looked up in a `cKDTree` built on the source points.
+    """
+    # TODO: Use the one from anemoi.utils.grids instead
+    # from anemoi.utils.grids import ...
+    from scipy.spatial import cKDTree
+    def as_points(latitudes, longitudes):
+        # One row per grid point
+        return np.array(latlon_to_xyz(latitudes, longitudes)).transpose()
+    source_points = as_points(source_latitudes, source_longitudes)
+    target_points = as_points(target_latitudes, target_longitudes)
+    tree = cKDTree(source_points)
+    _, nearest = tree.query(target_points, k=1)
+    return nearest
 if __name__ == "__main__":
     global_lats, global_lons = np.meshgrid(
         np.linspace(90, -90, 90),

anemoi-datasets 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl

anemoi-datasets 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl