PyPI - anemoi-datasets - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

anemoi-datasets 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (64)

anemoi/datasets/_version.py +2 -2
anemoi/datasets/commands/cleanup.py +44 -0
anemoi/datasets/commands/create.py +52 -21
anemoi/datasets/commands/finalise-additions.py +45 -0
anemoi/datasets/commands/finalise.py +39 -0
anemoi/datasets/commands/init-additions.py +45 -0
anemoi/datasets/commands/init.py +67 -0
anemoi/datasets/commands/inspect.py +1 -1
anemoi/datasets/commands/load-additions.py +47 -0
anemoi/datasets/commands/load.py +47 -0
anemoi/datasets/commands/patch.py +39 -0
anemoi/datasets/create/__init__.py +959 -146
anemoi/datasets/create/check.py +5 -3
anemoi/datasets/create/config.py +54 -2
anemoi/datasets/create/functions/filters/pressure_level_relative_humidity_to_specific_humidity.py +57 -0
anemoi/datasets/create/functions/filters/pressure_level_specific_humidity_to_relative_humidity.py +57 -0
anemoi/datasets/create/functions/filters/single_level_dewpoint_to_relative_humidity.py +54 -0
anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_dewpoint.py +59 -0
anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_specific_humidity.py +115 -0
anemoi/datasets/create/functions/filters/single_level_specific_humidity_to_relative_humidity.py +390 -0
anemoi/datasets/create/functions/filters/speeddir_to_uv.py +77 -0
anemoi/datasets/create/functions/filters/uv_to_speeddir.py +55 -0
anemoi/datasets/create/functions/sources/grib.py +86 -1
anemoi/datasets/create/functions/sources/hindcasts.py +14 -73
anemoi/datasets/create/functions/sources/mars.py +9 -3
anemoi/datasets/create/functions/sources/xarray/__init__.py +12 -2
anemoi/datasets/create/functions/sources/xarray/coordinates.py +7 -0
anemoi/datasets/create/functions/sources/xarray/field.py +8 -2
anemoi/datasets/create/functions/sources/xarray/fieldlist.py +0 -2
anemoi/datasets/create/functions/sources/xarray/flavour.py +21 -1
anemoi/datasets/create/functions/sources/xarray/metadata.py +40 -40
anemoi/datasets/create/functions/sources/xarray/time.py +63 -30
anemoi/datasets/create/functions/sources/xarray/variable.py +15 -38
anemoi/datasets/create/input.py +62 -39
anemoi/datasets/create/persistent.py +1 -1
anemoi/datasets/create/statistics/__init__.py +39 -23
anemoi/datasets/create/utils.py +6 -2
anemoi/datasets/data/__init__.py +1 -0
anemoi/datasets/data/concat.py +46 -2
anemoi/datasets/data/dataset.py +119 -34
anemoi/datasets/data/debug.py +5 -1
anemoi/datasets/data/forwards.py +17 -8
anemoi/datasets/data/grids.py +17 -3
anemoi/datasets/data/interpolate.py +133 -0
anemoi/datasets/data/masked.py +2 -2
anemoi/datasets/data/misc.py +56 -66
anemoi/datasets/data/missing.py +240 -0
anemoi/datasets/data/rescale.py +147 -0
anemoi/datasets/data/select.py +7 -1
anemoi/datasets/data/stores.py +23 -10
anemoi/datasets/data/subset.py +47 -5
anemoi/datasets/data/unchecked.py +20 -22
anemoi/datasets/data/xy.py +125 -0
anemoi/datasets/dates/__init__.py +124 -95
anemoi/datasets/dates/groups.py +85 -20
anemoi/datasets/grids.py +66 -48
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.5.0.dist-info}/METADATA +8 -17
anemoi_datasets-0.5.0.dist-info/RECORD +105 -0
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.5.0.dist-info}/WHEEL +1 -1
anemoi/datasets/create/loaders.py +0 -936
anemoi_datasets-0.4.4.dist-info/RECORD +0 -86
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.5.0.dist-info}/LICENSE +0 -0
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.5.0.dist-info}/entry_points.txt +0 -0
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.5.0.dist-info}/top_level.txt +0 -0

anemoi/datasets/data/misc.py CHANGED Viewed

@@ -8,7 +8,6 @@
 import calendar
 import datetime
 import logging
-import re
 from pathlib import PurePath
 import numpy as np
@@ -39,26 +38,21 @@ def add_dataset_path(path):
         config["datasets"]["path"].append(path)
-def _frequency_to_hours(frequency):
-    if isinstance(frequency, int):
-        return frequency
-    if isinstance(frequency, float):
-        assert int(frequency) == frequency
-        return int(frequency)
-    m = re.match(r"(\d+)([dh])?", frequency)
-    if m is None:
-        raise ValueError("Invalid frequency: " + frequency)
-    frequency = int(m.group(1))
-    if m.group(2) == "h":
-        return frequency
-    if m.group(2) == "d":
-        return frequency * 24
+def round_datetime(d, dates, up):
+    """Round up (or down) a datetime to the nearest date in a list of dates"""
+    if dates is None or len(dates) == 0:
+        return d
-    raise NotImplementedError()
+    for i, date in enumerate(dates):
+        if date == d:
+            return date
+        if date > d:
+            if up:
+                return date
+            if i > 0:
+                return dates[i - 1]
+            return date
+    return dates[-1]
 def _as_date(d, dates, last):
@@ -67,7 +61,8 @@ def _as_date(d, dates, last):
     # so we need to check for datetime.datetime first
     if isinstance(d, (np.datetime64, datetime.datetime)):
-        return d
+        d = round_datetime(d, dates, up=not last)
+        return np.datetime64(d)
     if isinstance(d, datetime.date):
         d = d.year * 10_000 + d.month * 100 + d.day
@@ -81,27 +76,27 @@ def _as_date(d, dates, last):
         if len(str(d)) == 4:
             year = d
             if last:
-                return np.datetime64(f"{year:04}-12-31T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-12-31T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-01-01T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-01-01T00:00:00"), dates, last)
         if len(str(d)) == 6:
             year = d // 100
             month = d % 100
             if last:
                 _, last_day = calendar.monthrange(year, month)
-                return np.datetime64(f"{year:04}-{month:02}-{last_day:02}T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{last_day:02}T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-{month:02}-01T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-01T00:00:00"), dates, last)
         if len(str(d)) == 8:
             year = d // 10000
             month = (d % 10000) // 100
             day = d % 100
             if last:
-                return np.datetime64(f"{year:04}-{month:02}-{day:02}T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00"), dates, last)
     if isinstance(d, str):
@@ -109,7 +104,11 @@ def _as_date(d, dates, last):
             date, time = d.replace(" ", "T").split("T")
             year, month, day = [int(_) for _ in date.split("-")]
             hour, minute, second = [int(_) for _ in time.split(":")]
-            return np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}")
+            return _as_date(
+                np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}"),
+                dates,
+                last,
+            )
         if "-" in d:
             assert ":" not in d
@@ -121,11 +120,8 @@ def _as_date(d, dates, last):
                 return _as_date(int(bits[0]) * 100 + int(bits[1]), dates, last)
             if len(bits) == 3:
-                return _as_date(
-                    int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]),
-                    dates,
-                    last,
-                )
+                return _as_date(int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]), dates, last)
         if ":" in d:
             assert len(d) == 5
             hour, minute = d.split(":")
@@ -136,7 +132,7 @@ def _as_date(d, dates, last):
             month = first.month
             day = first.day
-            return np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00")
+            return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00"), dates, last)
     raise NotImplementedError(f"Unsupported date: {d} ({type(d)})")
@@ -163,28 +159,10 @@ def _concat_or_join(datasets, kwargs):
         return Join(datasets)._overlay(), kwargs
-    # Make sure the dates are disjoint
-    for i in range(len(ranges)):
-        r = ranges[i]
-        for j in range(i + 1, len(ranges)):
-            s = ranges[j]
-            if r[0] <= s[0] <= r[1] or r[0] <= s[1] <= r[1]:
-                raise ValueError(f"Overlapping dates: {r} and {s} ({datasets[i]} {datasets[j]})")
-    # For now we should have the datasets in order with no gaps
-    frequency = _frequency_to_hours(datasets[0].frequency)
-    for i in range(len(ranges) - 1):
-        r = ranges[i]
-        s = ranges[i + 1]
-        if r[1] + datetime.timedelta(hours=frequency) != s[0]:
-            raise ValueError(
-                "Datasets must be sorted by dates, with no gaps: " f"{r} and {s} ({datasets[i]} {datasets[i+1]})"
-            )
     from .concat import Concat
+    Concat.check_dataset_compatibility(datasets)
     return Concat(datasets), kwargs
@@ -193,7 +171,7 @@ def _open(a):
     from .stores import zarr_lookup
     if isinstance(a, Dataset):
-        return a
+        return a.mutate()
     if isinstance(a, zarr.hierarchy.Group):
         return Zarr(a).mutate()
@@ -202,13 +180,13 @@ def _open(a):
         return Zarr(zarr_lookup(a)).mutate()
     if isinstance(a, PurePath):
-        return _open(str(a))
+        return _open(str(a)).mutate()
     if isinstance(a, dict):
-        return _open_dataset(**a)
+        return _open_dataset(**a).mutate()
     if isinstance(a, (list, tuple)):
-        return _open_dataset(*a)
+        return _open_dataset(*a).mutate()
     raise NotImplementedError(f"Unsupported argument: {type(a)}")
@@ -288,47 +266,59 @@ def _open_dataset(*args, **kwargs):
     for a in args:
         sets.append(_open(a))
+    if "xy" in kwargs:
+        from .xy import xy_factory
+        assert not sets, sets
+        return xy_factory(args, kwargs).mutate()
+    if "x" in kwargs and "y" in kwargs:
+        from .xy import xy_factory
+        assert not sets, sets
+        return xy_factory(args, kwargs).mutate()
     if "zip" in kwargs:
-        from .unchecked import zip_factory
+        from .xy import zip_factory
         assert not sets, sets
-        return zip_factory(args, kwargs)
+        return zip_factory(args, kwargs).mutate()
     if "chain" in kwargs:
         from .unchecked import chain_factory
         assert not sets, sets
-        return chain_factory(args, kwargs)
+        return chain_factory(args, kwargs).mutate()
     if "join" in kwargs:
         from .join import join_factory
         assert not sets, sets
-        return join_factory(args, kwargs)
+        return join_factory(args, kwargs).mutate()
     if "concat" in kwargs:
         from .concat import concat_factory
         assert not sets, sets
-        return concat_factory(args, kwargs)
+        return concat_factory(args, kwargs).mutate()
     if "ensemble" in kwargs:
         from .ensemble import ensemble_factory
         assert not sets, sets
-        return ensemble_factory(args, kwargs)
+        return ensemble_factory(args, kwargs).mutate()
     if "grids" in kwargs:
         from .grids import grids_factory
         assert not sets, sets
-        return grids_factory(args, kwargs)
+        return grids_factory(args, kwargs).mutate()
     if "cutout" in kwargs:
         from .grids import cutout_factory
         assert not sets, sets
-        return cutout_factory(args, kwargs)
+        return cutout_factory(args, kwargs).mutate()
     for name in ("datasets", "dataset"):
         if name in kwargs:

anemoi/datasets/data/missing.py ADDED Viewed

@@ -0,0 +1,240 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+import logging
+from functools import cached_property
+import numpy as np
+from anemoi.datasets.create.utils import to_datetime
+from anemoi.datasets.data import MissingDateError
+from .debug import Node
+from .debug import debug_indexing
+from .forwards import Forwards
+from .indexing import expand_list_indexing
+from .indexing import update_tuple
+LOG = logging.getLogger(__name__)
+class MissingDates(Forwards):
+    # TODO: Use that class instead of ZarrMissing
+    def __init__(self, dataset, missing_dates):
+        super().__init__(dataset)
+        self.missing_dates = []
+        self._missing = set()
+        other = []
+        for date in missing_dates:
+            if isinstance(date, int):
+                self._missing.add(date)
+                self.missing_dates.append(dataset.dates[date])
+            else:
+                date = to_datetime(date)
+                other.append(date)
+        if other:
+            for i, date in enumerate(dataset.dates):
+                if date in other:
+                    self._missing.add(i)
+                    self.missing_dates.append(date)
+        n = self.forward._len
+        self._missing = set(i for i in self._missing if 0 <= i < n)
+        self.missing_dates = sorted(to_datetime(x) for x in self.missing_dates)
+        assert len(self._missing), "No dates to force missing"
+    @cached_property
+    def missing(self):
+        return self._missing.union(self.forward.missing)
+    @debug_indexing
+    @expand_list_indexing
+    def __getitem__(self, n):
+        if isinstance(n, int):
+            if n in self.missing:
+                self._report_missing(n)
+            return self.forward[n]
+        if isinstance(n, slice):
+            common = set(range(*n.indices(len(self)))) & self.missing
+            if common:
+                self._report_missing(list(common)[0])
+            return self.forward[n]
+        if isinstance(n, tuple):
+            first = n[0]
+            if isinstance(first, int):
+                if first in self.missing:
+                    self._report_missing(first)
+                return self.forward[n]
+            if isinstance(first, slice):
+                common = set(range(*first.indices(len(self)))) & self.missing
+                if common:
+                    self._report_missing(list(common)[0])
+                return self.forward[n]
+            if isinstance(first, (list, tuple)):
+                common = set(first) & self.missing
+                if common:
+                    self._report_missing(list(common)[0])
+                return self.forward[n]
+        raise TypeError(f"Unsupported index {n} {type(n)}")
+    def _report_missing(self, n):
+        raise MissingDateError(f"Date {self.forward.dates[n]} is missing (index={n})")
+    @property
+    def reason(self):
+        return {"missing_dates": self.missing_dates}
+    def tree(self):
+        return Node(self, [self.forward.tree()], **self.reason)
+    def subclass_metadata_specific(self):
+        return {"missing_dates": self.missing_dates}
+class SkipMissingDates(Forwards):
+    def __init__(self, dataset, expected_access):
+        super().__init__(dataset)
+        # if isinstance(expected_access, (tuple, list)):
+        #     expected_access = slice(*expected_access)
+        if isinstance(expected_access, int):
+            expected_access = slice(0, expected_access)
+        assert isinstance(expected_access, slice), f"Expected access must be a slice, got {expected_access}"
+        expected_access = slice(*expected_access.indices(dataset._len))
+        missing = dataset.missing.copy()
+        size = (expected_access.stop - expected_access.start) // expected_access.step
+        indices = []
+        for i in range(dataset._len):
+            s = slice(expected_access.start + i, expected_access.stop + i, expected_access.step)
+            p = set(range(*s.indices(dataset._len)))
+            if p.intersection(missing):
+                continue
+            if len(p) != size:
+                continue
+            indices.append(tuple(sorted(p)))
+        self.expected_access = expected_access
+        self.indices = indices
+    def __len__(self):
+        return len(self.indices)
+    @property
+    def start_date(self):
+        return self.forward.start_date
+    @property
+    def end_date(self):
+        return self.forward.end_date
+    @property
+    def dates(self):
+        raise NotImplementedError("SkipMissingDates.dates")
+    @debug_indexing
+    @expand_list_indexing
+    def _get_tuple(self, index):
+        def _get_one(n):
+            result = []
+            for i in self.indices[n]:
+                s, _ = update_tuple(index, 0, i)
+                result.append(self.forward[s])
+            return tuple(result)
+        first = index[0]
+        if isinstance(first, int):
+            return _get_one(first)
+        assert isinstance(first, slice), f"SkipMissingDates._get_tuple {index}"
+        values = [_get_one(i) for i in range(*first.indices(self._len))]
+        result = [_ for _ in zip(*values)]
+        return tuple(np.stack(_) for _ in result)
+    @debug_indexing
+    def _get_slice(self, s):
+        values = [self[i] for i in range(*s.indices(self._len))]
+        result = [_ for _ in zip(*values)]
+        return tuple(np.stack(_) for _ in result)
+    @debug_indexing
+    def __getitem__(self, n):
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
+        if isinstance(n, slice):
+            return self._get_slice(n)
+        return tuple(self.forward[i] for i in self.indices[n])
+    @property
+    def frequency(self):
+        return self.forward.frequency
+    def tree(self):
+        return Node(self, [self.forward.tree()], expected_access=self.expected_access)
+    def subclass_metadata_specific(self):
+        return {"expected_access": self.expected_access}
+class MissingDataset(Forwards):
+    def __init__(self, dataset, start, end):
+        super().__init__(dataset)
+        self.start = start
+        self.end = end
+        dates = []
+        date = start
+        while date <= end:
+            dates.append(date)
+            date += dataset.frequency
+        self._dates = np.array(dates, dtype="datetime64")
+        self._missing = set(range(len(dates)))
+    def __len__(self):
+        return len(self._dates)
+    @property
+    def dates(self):
+        return self._dates
+    @property
+    def missing(self):
+        return self._missing
+    def __getitem__(self, n):
+        raise MissingDateError(f"Date {self.dates[n]} is missing (index={n})")
+    def tree(self):
+        return Node(self, [self.forward.tree()], start=self.start, end=self.end)
+    def subclass_metadata_specific(self):
+        return {"start": self.start, "end": self.end}

anemoi/datasets/data/rescale.py ADDED Viewed

@@ -0,0 +1,147 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+import logging
+from functools import cached_property
+import numpy as np
+from .debug import Node
+from .debug import debug_indexing
+from .forwards import Forwards
+from .indexing import apply_index_to_slices_changes
+from .indexing import expand_list_indexing
+from .indexing import index_to_slices
+from .indexing import update_tuple
+LOG = logging.getLogger(__name__)
+def make_rescale(variable, rescale):
+    if isinstance(rescale, (tuple, list)):
+        assert len(rescale) == 2, rescale
+        if isinstance(rescale[0], (int, float)):
+            return rescale
+        from cfunits import Units
+        u0 = Units(rescale[0])
+        u1 = Units(rescale[1])
+        x1, x2 = 0.0, 1.0
+        y1, y2 = Units.conform([x1, x2], u0, u1)
+        a = (y2 - y1) / (x2 - x1)
+        b = y1 - a * x1
+        return a, b
+        return rescale
+    if isinstance(rescale, dict):
+        assert "scale" in rescale, rescale
+        assert "offset" in rescale, rescale
+        return rescale["scale"], rescale["offset"]
+    assert False
+class Rescale(Forwards):
+    def __init__(self, dataset, rescale):
+        super().__init__(dataset)
+        for n in rescale:
+            assert n in dataset.variables, n
+        variables = dataset.variables
+        self._a = np.ones(len(variables))
+        self._b = np.zeros(len(variables))
+        self.rescale = {}
+        for i, v in enumerate(variables):
+            if v in rescale:
+                a, b = make_rescale(v, rescale[v])
+                self.rescale[v] = a, b
+                self._a[i], self._b[i] = a, b
+        self._a = self._a[np.newaxis, :, np.newaxis, np.newaxis]
+        self._b = self._b[np.newaxis, :, np.newaxis, np.newaxis]
+        self._a = self._a.astype(self.forward.dtype)
+        self._b = self._b.astype(self.forward.dtype)
+    def tree(self):
+        return Node(self, [self.forward.tree()], rescale=self.rescale)
+    def subclass_metadata_specific(self):
+        return dict(rescale=self.rescale)
+    @debug_indexing
+    @expand_list_indexing
+    def _get_tuple(self, index):
+        index, changes = index_to_slices(index, self.shape)
+        index, previous = update_tuple(index, 1, slice(None))
+        result = self.forward[index]
+        result = result * self._a + self._b
+        result = result[:, previous]
+        result = apply_index_to_slices_changes(result, changes)
+        return result
+    @debug_indexing
+    def __get_slice_(self, n):
+        data = self.forward[n]
+        return data * self._a + self._b
+    @debug_indexing
+    def __getitem__(self, n):
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
+        if isinstance(n, slice):
+            return self.__get_slice_(n)
+        data = self.forward[n]
+        return data * self._a[0] + self._b[0]
+    @cached_property
+    def statistics(self):
+        result = {}
+        a = self._a.squeeze()
+        assert np.all(a >= 0)
+        b = self._b.squeeze()
+        for k, v in self.forward.statistics.items():
+            if k in ("maximum", "minimum", "mean"):
+                result[k] = v * a + b
+                continue
+            if k in ("stdev",):
+                result[k] = v * a
+                continue
+            raise NotImplementedError("rescale statistics", k)
+        return result
+    def statistics_tendencies(self, delta=None):
+        result = {}
+        a = self._a.squeeze()
+        assert np.all(a >= 0)
+        for k, v in self.forward.statistics_tendencies(delta).items():
+            if k in ("maximum", "minimum", "mean", "stdev"):
+                result[k] = v * a
+                continue
+            raise NotImplementedError("rescale tendencies statistics", k)
+        return result

anemoi/datasets/data/select.py CHANGED Viewed

@@ -40,6 +40,12 @@ class Select(Forwards):
         # Forward other properties to the main dataset
         super().__init__(dataset)
+    def clone(self, dataset):
+        return self.__class__(dataset, self.indices, self.reason).mutate()
+    def mutate(self):
+        return self.forward.swap_with_parent(parent=self)
     @debug_indexing
     @expand_list_indexing
     def _get_tuple(self, index):
@@ -101,7 +107,7 @@ class Rename(Forwards):
     def __init__(self, dataset, rename):
         super().__init__(dataset)
         for n in rename:
-            assert n in dataset.variables
+            assert n in dataset.variables, n
         self._variables = [rename.get(v, v) for v in dataset.variables]
         self.rename = rename

anemoi-datasets 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

anemoi-datasets 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl