PyPI - anemoi-datasets - Versions diffs - 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl - Mend

anemoi-datasets 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

anemoi/datasets/_version.py +2 -2
anemoi/datasets/commands/cleanup.py +44 -0
anemoi/datasets/commands/create.py +50 -20
anemoi/datasets/commands/finalise-additions.py +45 -0
anemoi/datasets/commands/finalise.py +39 -0
anemoi/datasets/commands/init-additions.py +45 -0
anemoi/datasets/commands/init.py +67 -0
anemoi/datasets/commands/inspect.py +1 -1
anemoi/datasets/commands/load-additions.py +47 -0
anemoi/datasets/commands/load.py +47 -0
anemoi/datasets/commands/patch.py +39 -0
anemoi/datasets/compute/recentre.py +1 -1
anemoi/datasets/create/__init__.py +961 -146
anemoi/datasets/create/check.py +5 -3
anemoi/datasets/create/config.py +53 -2
anemoi/datasets/create/functions/sources/accumulations.py +6 -22
anemoi/datasets/create/functions/sources/hindcasts.py +27 -12
anemoi/datasets/create/functions/sources/tendencies.py +1 -1
anemoi/datasets/create/functions/sources/xarray/__init__.py +12 -2
anemoi/datasets/create/functions/sources/xarray/coordinates.py +7 -0
anemoi/datasets/create/functions/sources/xarray/field.py +1 -1
anemoi/datasets/create/functions/sources/xarray/fieldlist.py +0 -2
anemoi/datasets/create/functions/sources/xarray/flavour.py +21 -1
anemoi/datasets/create/functions/sources/xarray/metadata.py +27 -29
anemoi/datasets/create/functions/sources/xarray/time.py +63 -30
anemoi/datasets/create/functions/sources/xarray/variable.py +15 -38
anemoi/datasets/create/input.py +62 -25
anemoi/datasets/create/statistics/__init__.py +39 -23
anemoi/datasets/create/utils.py +3 -2
anemoi/datasets/data/__init__.py +1 -0
anemoi/datasets/data/concat.py +46 -2
anemoi/datasets/data/dataset.py +109 -34
anemoi/datasets/data/forwards.py +17 -8
anemoi/datasets/data/grids.py +17 -3
anemoi/datasets/data/interpolate.py +133 -0
anemoi/datasets/data/misc.py +56 -66
anemoi/datasets/data/missing.py +240 -0
anemoi/datasets/data/select.py +7 -1
anemoi/datasets/data/stores.py +3 -3
anemoi/datasets/data/subset.py +47 -5
anemoi/datasets/data/unchecked.py +20 -22
anemoi/datasets/data/xy.py +125 -0
anemoi/datasets/dates/__init__.py +33 -20
anemoi/datasets/dates/groups.py +2 -2
anemoi/datasets/grids.py +66 -48
{anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/METADATA +5 -5
{anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/RECORD +51 -41
{anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/WHEEL +1 -1
anemoi/datasets/create/loaders.py +0 -924
{anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/LICENSE +0 -0
{anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/entry_points.txt +0 -0
{anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/top_level.txt +0 -0

anemoi/datasets/data/forwards.py CHANGED Viewed

@@ -23,7 +23,7 @@ LOG = logging.getLogger(__name__)
 class Forwards(Dataset):
     def __init__(self, forward):
-        self.forward = forward
+        self.forward = forward.mutate()
     def __len__(self):
         return len(self.forward)
@@ -118,6 +118,9 @@ class Combined(Forwards):
         # Forward most properties to the first dataset
         super().__init__(datasets[0])
+    def mutate(self):
+        return self
     def check_same_resolution(self, d1, d2):
         if d1.resolution != d2.resolution:
             raise ValueError(f"Incompatible resolutions: {d1.resolution} and {d2.resolution} ({d1} {d2})")
@@ -187,14 +190,9 @@ class Combined(Forwards):
             **kwargs,
         )
-    @cached_property
+    @property
     def missing(self):
-        offset = 0
-        result = set()
-        for d in self.datasets:
-            result.update(offset + m for m in d.missing)
-            offset += len(d)
-        return result
+        raise NotImplementedError("missing() not implemented for Combined")
     def get_dataset_names(self, names):
         for d in self.datasets:
@@ -249,3 +247,14 @@ class GivenAxis(Combined):
             return self._get_slice(n)
         return np.concatenate([d[n] for d in self.datasets], axis=self.axis - 1)
+    @cached_property
+    def missing(self):
+        offset = 0
+        result = set()
+        for d in self.datasets:
+            print("--->", d.missing, d)
+            result.update(offset + m for m in d.missing)
+            if self.axis == 0:  # Advance if axis is time
+                offset += len(d)
+        return result

anemoi/datasets/data/grids.py CHANGED Viewed

@@ -128,7 +128,7 @@ class Grids(GridsBase):
 class Cutout(GridsBase):
-    def __init__(self, datasets, axis):
+    def __init__(self, datasets, axis, min_distance_km=None, cropping_distance=2.0, neighbours=5, plot=False):
         from anemoi.datasets.grids import cutout_mask
         super().__init__(datasets, axis)
@@ -144,7 +144,10 @@ class Cutout(GridsBase):
             self.lam.longitudes,
             self.globe.latitudes,
             self.globe.longitudes,
-            # plot="cutout",
+            plot=plot,
+            min_distance_km=min_distance_km,
+            cropping_distance=cropping_distance,
+            neighbours=neighbours,
         )
         assert len(self.mask) == self.globe.shape[3], (
             len(self.mask),
@@ -229,6 +232,10 @@ def cutout_factory(args, kwargs):
     cutout = kwargs.pop("cutout")
     axis = kwargs.pop("axis", 3)
+    plot = kwargs.pop("plot", None)
+    min_distance_km = kwargs.pop("min_distance_km", None)
+    cropping_distance = kwargs.pop("cropping_distance", 2.0)
+    neighbours = kwargs.pop("neighbours", 5)
     assert len(args) == 0
     assert isinstance(cutout, (list, tuple))
@@ -236,4 +243,11 @@ def cutout_factory(args, kwargs):
     datasets = [_open(e) for e in cutout]
     datasets, kwargs = _auto_adjust(datasets, kwargs)
-    return Cutout(datasets, axis=axis)._subset(**kwargs)
+    return Cutout(
+        datasets,
+        axis=axis,
+        neighbours=neighbours,
+        min_distance_km=min_distance_km,
+        cropping_distance=cropping_distance,
+        plot=plot,
+    )._subset(**kwargs)

anemoi/datasets/data/interpolate.py ADDED Viewed

@@ -0,0 +1,133 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+import logging
+from functools import cached_property
+import numpy as np
+from anemoi.utils.dates import frequency_to_timedelta
+from .debug import Node
+from .debug import debug_indexing
+from .forwards import Forwards
+from .indexing import apply_index_to_slices_changes
+from .indexing import expand_list_indexing
+from .indexing import index_to_slices
+from .indexing import update_tuple
+LOG = logging.getLogger(__name__)
+class InterpolateFrequency(Forwards):
+    def __init__(self, dataset, frequency):
+        super().__init__(dataset)
+        self._frequency = frequency_to_timedelta(frequency)
+        self.seconds = self._frequency.total_seconds()
+        other_seconds = dataset.frequency.total_seconds()
+        self.seconds = int(self.seconds)
+        assert self.seconds == self._frequency.total_seconds()
+        other_seconds = int(other_seconds)
+        assert other_seconds == dataset.frequency.total_seconds()
+        if self.seconds >= other_seconds:
+            raise ValueError(
+                f"Interpolate frequency {self._frequency} must be more frequent than dataset frequency {dataset.frequency}"
+            )
+        if other_seconds % self.seconds != 0:
+            raise ValueError(
+                f"Interpolate frequency {self._frequency}  must be a multiple of the dataset frequency {dataset.frequency}"
+            )
+        self.ratio = other_seconds // self.seconds
+        self.alphas = np.linspace(0, 1, self.ratio + 1)
+        self.other_len = len(dataset)
+    @debug_indexing
+    @expand_list_indexing
+    def _get_tuple(self, index):
+        index, changes = index_to_slices(index, self.shape)
+        index, previous = update_tuple(index, 0, slice(None))
+        result = self._get_slice(previous)
+        return apply_index_to_slices_changes(result[index], changes)
+    def _get_slice(self, s):
+        return np.stack([self[i] for i in range(*s.indices(self._len))])
+    @debug_indexing
+    def __getitem__(self, n):
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
+        if isinstance(n, slice):
+            return self._get_slice(n)
+        if n < 0:
+            n += self._len
+        if n == self._len - 1:
+            # Special case for the last element
+            return self.forward[-1]
+        i = n // self.ratio
+        x = n % self.ratio
+        if x == 0:
+            # No interpolation needed
+            return self.forward[i]
+        alpha = self.alphas[x]
+        assert 0 < alpha < 1, alpha
+        return self.forward[i] * (1 - alpha) + self.forward[i + 1] * alpha
+    def __len__(self):
+        return (self.other_len - 1) * self.ratio + 1
+    @property
+    def frequency(self):
+        return self._frequency
+    @cached_property
+    def dates(self):
+        result = []
+        deltas = [np.timedelta64(self.seconds * i, "s") for i in range(self.ratio)]
+        for d in self.forward.dates[:-1]:
+            for i in deltas:
+                result.append(d + i)
+        result.append(self.forward.dates[-1])
+        return np.array(result)
+    @property
+    def shape(self):
+        return (self._len,) + self.forward.shape[1:]
+    def tree(self):
+        return Node(self, [self.forward.tree()], frequency=self.frequency)
+    @cached_property
+    def missing(self):
+        result = []
+        j = 0
+        for i in range(self.other_len):
+            missing = i in self.forward.missing
+            for _ in range(self.ratio):
+                if missing:
+                    result.append(j)
+                j += 1
+        result = set(x for x in result if x < self._len)
+        return result
+    def subclass_metadata_specific(self):
+        return {
+            # "frequency": frequency_to_string(self._frequency),
+        }

anemoi/datasets/data/misc.py CHANGED Viewed

@@ -8,7 +8,6 @@
 import calendar
 import datetime
 import logging
-import re
 from pathlib import PurePath
 import numpy as np
@@ -39,26 +38,21 @@ def add_dataset_path(path):
         config["datasets"]["path"].append(path)
-def _frequency_to_hours(frequency):
-    if isinstance(frequency, int):
-        return frequency
-    if isinstance(frequency, float):
-        assert int(frequency) == frequency
-        return int(frequency)
-    m = re.match(r"(\d+)([dh])?", frequency)
-    if m is None:
-        raise ValueError("Invalid frequency: " + frequency)
-    frequency = int(m.group(1))
-    if m.group(2) == "h":
-        return frequency
-    if m.group(2) == "d":
-        return frequency * 24
+def round_datetime(d, dates, up):
+    """Round up (or down) a datetime to the nearest date in a list of dates"""
+    if dates is None or len(dates) == 0:
+        return d
-    raise NotImplementedError()
+    for i, date in enumerate(dates):
+        if date == d:
+            return date
+        if date > d:
+            if up:
+                return date
+            if i > 0:
+                return dates[i - 1]
+            return date
+    return dates[-1]
 def _as_date(d, dates, last):
@@ -67,7 +61,8 @@ def _as_date(d, dates, last):
     # so we need to check for datetime.datetime first
     if isinstance(d, (np.datetime64, datetime.datetime)):
-        return d
+        d = round_datetime(d, dates, up=not last)
+        return np.datetime64(d)
     if isinstance(d, datetime.date):
         d = d.year * 10_000 + d.month * 100 + d.day
@@ -81,27 +76,27 @@ def _as_date(d, dates, last):
         if len(str(d)) == 4:
             year = d
             if last:
-                return np.datetime64(f"{year:04}-12-31T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-12-31T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-01-01T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-01-01T00:00:00"), dates, last)
         if len(str(d)) == 6:
             year = d // 100
             month = d % 100
             if last:
                 _, last_day = calendar.monthrange(year, month)
-                return np.datetime64(f"{year:04}-{month:02}-{last_day:02}T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{last_day:02}T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-{month:02}-01T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-01T00:00:00"), dates, last)
         if len(str(d)) == 8:
             year = d // 10000
             month = (d % 10000) // 100
             day = d % 100
             if last:
-                return np.datetime64(f"{year:04}-{month:02}-{day:02}T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00"), dates, last)
     if isinstance(d, str):
@@ -109,7 +104,11 @@ def _as_date(d, dates, last):
             date, time = d.replace(" ", "T").split("T")
             year, month, day = [int(_) for _ in date.split("-")]
             hour, minute, second = [int(_) for _ in time.split(":")]
-            return np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}")
+            return _as_date(
+                np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}"),
+                dates,
+                last,
+            )
         if "-" in d:
             assert ":" not in d
@@ -121,11 +120,8 @@ def _as_date(d, dates, last):
                 return _as_date(int(bits[0]) * 100 + int(bits[1]), dates, last)
             if len(bits) == 3:
-                return _as_date(
-                    int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]),
-                    dates,
-                    last,
-                )
+                return _as_date(int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]), dates, last)
         if ":" in d:
             assert len(d) == 5
             hour, minute = d.split(":")
@@ -136,7 +132,7 @@ def _as_date(d, dates, last):
             month = first.month
             day = first.day
-            return np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00")
+            return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00"), dates, last)
     raise NotImplementedError(f"Unsupported date: {d} ({type(d)})")
@@ -163,28 +159,10 @@ def _concat_or_join(datasets, kwargs):
         return Join(datasets)._overlay(), kwargs
-    # Make sure the dates are disjoint
-    for i in range(len(ranges)):
-        r = ranges[i]
-        for j in range(i + 1, len(ranges)):
-            s = ranges[j]
-            if r[0] <= s[0] <= r[1] or r[0] <= s[1] <= r[1]:
-                raise ValueError(f"Overlapping dates: {r} and {s} ({datasets[i]} {datasets[j]})")
-    # For now we should have the datasets in order with no gaps
-    frequency = _frequency_to_hours(datasets[0].frequency)
-    for i in range(len(ranges) - 1):
-        r = ranges[i]
-        s = ranges[i + 1]
-        if r[1] + datetime.timedelta(hours=frequency) != s[0]:
-            raise ValueError(
-                "Datasets must be sorted by dates, with no gaps: " f"{r} and {s} ({datasets[i]} {datasets[i+1]})"
-            )
     from .concat import Concat
+    Concat.check_dataset_compatibility(datasets)
     return Concat(datasets), kwargs
@@ -193,7 +171,7 @@ def _open(a):
     from .stores import zarr_lookup
     if isinstance(a, Dataset):
-        return a
+        return a.mutate()
     if isinstance(a, zarr.hierarchy.Group):
         return Zarr(a).mutate()
@@ -202,13 +180,13 @@ def _open(a):
         return Zarr(zarr_lookup(a)).mutate()
     if isinstance(a, PurePath):
-        return _open(str(a))
+        return _open(str(a)).mutate()
     if isinstance(a, dict):
-        return _open_dataset(**a)
+        return _open_dataset(**a).mutate()
     if isinstance(a, (list, tuple)):
-        return _open_dataset(*a)
+        return _open_dataset(*a).mutate()
     raise NotImplementedError(f"Unsupported argument: {type(a)}")
@@ -288,47 +266,59 @@ def _open_dataset(*args, **kwargs):
     for a in args:
         sets.append(_open(a))
+    if "xy" in kwargs:
+        from .xy import xy_factory
+        assert not sets, sets
+        return xy_factory(args, kwargs).mutate()
+    if "x" in kwargs and "y" in kwargs:
+        from .xy import xy_factory
+        assert not sets, sets
+        return xy_factory(args, kwargs).mutate()
     if "zip" in kwargs:
-        from .unchecked import zip_factory
+        from .xy import zip_factory
         assert not sets, sets
-        return zip_factory(args, kwargs)
+        return zip_factory(args, kwargs).mutate()
     if "chain" in kwargs:
         from .unchecked import chain_factory
         assert not sets, sets
-        return chain_factory(args, kwargs)
+        return chain_factory(args, kwargs).mutate()
     if "join" in kwargs:
         from .join import join_factory
         assert not sets, sets
-        return join_factory(args, kwargs)
+        return join_factory(args, kwargs).mutate()
     if "concat" in kwargs:
         from .concat import concat_factory
         assert not sets, sets
-        return concat_factory(args, kwargs)
+        return concat_factory(args, kwargs).mutate()
     if "ensemble" in kwargs:
         from .ensemble import ensemble_factory
         assert not sets, sets
-        return ensemble_factory(args, kwargs)
+        return ensemble_factory(args, kwargs).mutate()
     if "grids" in kwargs:
         from .grids import grids_factory
         assert not sets, sets
-        return grids_factory(args, kwargs)
+        return grids_factory(args, kwargs).mutate()
     if "cutout" in kwargs:
         from .grids import cutout_factory
         assert not sets, sets
-        return cutout_factory(args, kwargs)
+        return cutout_factory(args, kwargs).mutate()
     for name in ("datasets", "dataset"):
         if name in kwargs:

anemoi-datasets 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

anemoi-datasets 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl