PyPI - anemoi-datasets - Versions diffs - 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl - Mend

anemoi-datasets 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

anemoi/datasets/_version.py +2 -2
anemoi/datasets/commands/cleanup.py +44 -0
anemoi/datasets/commands/create.py +50 -20
anemoi/datasets/commands/finalise-additions.py +45 -0
anemoi/datasets/commands/finalise.py +39 -0
anemoi/datasets/commands/init-additions.py +45 -0
anemoi/datasets/commands/init.py +67 -0
anemoi/datasets/commands/inspect.py +1 -1
anemoi/datasets/commands/load-additions.py +47 -0
anemoi/datasets/commands/load.py +47 -0
anemoi/datasets/commands/patch.py +39 -0
anemoi/datasets/create/__init__.py +961 -146
anemoi/datasets/create/check.py +5 -3
anemoi/datasets/create/config.py +53 -2
anemoi/datasets/create/functions/sources/xarray/__init__.py +12 -2
anemoi/datasets/create/functions/sources/xarray/coordinates.py +7 -0
anemoi/datasets/create/functions/sources/xarray/field.py +1 -1
anemoi/datasets/create/functions/sources/xarray/fieldlist.py +0 -2
anemoi/datasets/create/functions/sources/xarray/flavour.py +21 -1
anemoi/datasets/create/functions/sources/xarray/metadata.py +27 -29
anemoi/datasets/create/functions/sources/xarray/time.py +63 -30
anemoi/datasets/create/functions/sources/xarray/variable.py +15 -38
anemoi/datasets/create/input.py +23 -22
anemoi/datasets/create/statistics/__init__.py +39 -23
anemoi/datasets/create/utils.py +3 -2
anemoi/datasets/data/__init__.py +1 -0
anemoi/datasets/data/concat.py +46 -2
anemoi/datasets/data/dataset.py +109 -34
anemoi/datasets/data/forwards.py +17 -8
anemoi/datasets/data/grids.py +17 -3
anemoi/datasets/data/interpolate.py +133 -0
anemoi/datasets/data/misc.py +56 -66
anemoi/datasets/data/missing.py +240 -0
anemoi/datasets/data/select.py +7 -1
anemoi/datasets/data/stores.py +3 -3
anemoi/datasets/data/subset.py +47 -5
anemoi/datasets/data/unchecked.py +20 -22
anemoi/datasets/data/xy.py +125 -0
anemoi/datasets/dates/__init__.py +13 -66
anemoi/datasets/dates/groups.py +2 -2
anemoi/datasets/grids.py +66 -48
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.4.5.dist-info}/METADATA +5 -5
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.4.5.dist-info}/RECORD +47 -37
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.4.5.dist-info}/WHEEL +1 -1
anemoi/datasets/create/loaders.py +0 -936
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.4.5.dist-info}/LICENSE +0 -0
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.4.5.dist-info}/entry_points.txt +0 -0
{anemoi_datasets-0.4.4.dist-info → anemoi_datasets-0.4.5.dist-info}/top_level.txt +0 -0

anemoi/datasets/data/misc.py CHANGED Viewed

@@ -8,7 +8,6 @@
 import calendar
 import datetime
 import logging
-import re
 from pathlib import PurePath
 import numpy as np
@@ -39,26 +38,21 @@ def add_dataset_path(path):
         config["datasets"]["path"].append(path)
-def _frequency_to_hours(frequency):
-    if isinstance(frequency, int):
-        return frequency
-    if isinstance(frequency, float):
-        assert int(frequency) == frequency
-        return int(frequency)
-    m = re.match(r"(\d+)([dh])?", frequency)
-    if m is None:
-        raise ValueError("Invalid frequency: " + frequency)
-    frequency = int(m.group(1))
-    if m.group(2) == "h":
-        return frequency
-    if m.group(2) == "d":
-        return frequency * 24
+def round_datetime(d, dates, up):
+    """Round up (or down) a datetime to the nearest date in a list of dates"""
+    if dates is None or len(dates) == 0:
+        return d
-    raise NotImplementedError()
+    for i, date in enumerate(dates):
+        if date == d:
+            return date
+        if date > d:
+            if up:
+                return date
+            if i > 0:
+                return dates[i - 1]
+            return date
+    return dates[-1]
 def _as_date(d, dates, last):
@@ -67,7 +61,8 @@ def _as_date(d, dates, last):
     # so we need to check for datetime.datetime first
     if isinstance(d, (np.datetime64, datetime.datetime)):
-        return d
+        d = round_datetime(d, dates, up=not last)
+        return np.datetime64(d)
     if isinstance(d, datetime.date):
         d = d.year * 10_000 + d.month * 100 + d.day
@@ -81,27 +76,27 @@ def _as_date(d, dates, last):
         if len(str(d)) == 4:
             year = d
             if last:
-                return np.datetime64(f"{year:04}-12-31T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-12-31T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-01-01T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-01-01T00:00:00"), dates, last)
         if len(str(d)) == 6:
             year = d // 100
             month = d % 100
             if last:
                 _, last_day = calendar.monthrange(year, month)
-                return np.datetime64(f"{year:04}-{month:02}-{last_day:02}T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{last_day:02}T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-{month:02}-01T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-01T00:00:00"), dates, last)
         if len(str(d)) == 8:
             year = d // 10000
             month = (d % 10000) // 100
             day = d % 100
             if last:
-                return np.datetime64(f"{year:04}-{month:02}-{day:02}T23:59:59")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T23:59:59"), dates, last)
             else:
-                return np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00")
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00"), dates, last)
     if isinstance(d, str):
@@ -109,7 +104,11 @@ def _as_date(d, dates, last):
             date, time = d.replace(" ", "T").split("T")
             year, month, day = [int(_) for _ in date.split("-")]
             hour, minute, second = [int(_) for _ in time.split(":")]
-            return np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}")
+            return _as_date(
+                np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}"),
+                dates,
+                last,
+            )
         if "-" in d:
             assert ":" not in d
@@ -121,11 +120,8 @@ def _as_date(d, dates, last):
                 return _as_date(int(bits[0]) * 100 + int(bits[1]), dates, last)
             if len(bits) == 3:
-                return _as_date(
-                    int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]),
-                    dates,
-                    last,
-                )
+                return _as_date(int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]), dates, last)
         if ":" in d:
             assert len(d) == 5
             hour, minute = d.split(":")
@@ -136,7 +132,7 @@ def _as_date(d, dates, last):
             month = first.month
             day = first.day
-            return np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00")
+            return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00"), dates, last)
     raise NotImplementedError(f"Unsupported date: {d} ({type(d)})")
@@ -163,28 +159,10 @@ def _concat_or_join(datasets, kwargs):
         return Join(datasets)._overlay(), kwargs
-    # Make sure the dates are disjoint
-    for i in range(len(ranges)):
-        r = ranges[i]
-        for j in range(i + 1, len(ranges)):
-            s = ranges[j]
-            if r[0] <= s[0] <= r[1] or r[0] <= s[1] <= r[1]:
-                raise ValueError(f"Overlapping dates: {r} and {s} ({datasets[i]} {datasets[j]})")
-    # For now we should have the datasets in order with no gaps
-    frequency = _frequency_to_hours(datasets[0].frequency)
-    for i in range(len(ranges) - 1):
-        r = ranges[i]
-        s = ranges[i + 1]
-        if r[1] + datetime.timedelta(hours=frequency) != s[0]:
-            raise ValueError(
-                "Datasets must be sorted by dates, with no gaps: " f"{r} and {s} ({datasets[i]} {datasets[i+1]})"
-            )
     from .concat import Concat
+    Concat.check_dataset_compatibility(datasets)
     return Concat(datasets), kwargs
@@ -193,7 +171,7 @@ def _open(a):
     from .stores import zarr_lookup
     if isinstance(a, Dataset):
-        return a
+        return a.mutate()
     if isinstance(a, zarr.hierarchy.Group):
         return Zarr(a).mutate()
@@ -202,13 +180,13 @@ def _open(a):
         return Zarr(zarr_lookup(a)).mutate()
     if isinstance(a, PurePath):
-        return _open(str(a))
+        return _open(str(a)).mutate()
     if isinstance(a, dict):
-        return _open_dataset(**a)
+        return _open_dataset(**a).mutate()
     if isinstance(a, (list, tuple)):
-        return _open_dataset(*a)
+        return _open_dataset(*a).mutate()
     raise NotImplementedError(f"Unsupported argument: {type(a)}")
@@ -288,47 +266,59 @@ def _open_dataset(*args, **kwargs):
     for a in args:
         sets.append(_open(a))
+    if "xy" in kwargs:
+        from .xy import xy_factory
+        assert not sets, sets
+        return xy_factory(args, kwargs).mutate()
+    if "x" in kwargs and "y" in kwargs:
+        from .xy import xy_factory
+        assert not sets, sets
+        return xy_factory(args, kwargs).mutate()
     if "zip" in kwargs:
-        from .unchecked import zip_factory
+        from .xy import zip_factory
         assert not sets, sets
-        return zip_factory(args, kwargs)
+        return zip_factory(args, kwargs).mutate()
     if "chain" in kwargs:
         from .unchecked import chain_factory
         assert not sets, sets
-        return chain_factory(args, kwargs)
+        return chain_factory(args, kwargs).mutate()
     if "join" in kwargs:
         from .join import join_factory
         assert not sets, sets
-        return join_factory(args, kwargs)
+        return join_factory(args, kwargs).mutate()
     if "concat" in kwargs:
         from .concat import concat_factory
         assert not sets, sets
-        return concat_factory(args, kwargs)
+        return concat_factory(args, kwargs).mutate()
     if "ensemble" in kwargs:
         from .ensemble import ensemble_factory
         assert not sets, sets
-        return ensemble_factory(args, kwargs)
+        return ensemble_factory(args, kwargs).mutate()
     if "grids" in kwargs:
         from .grids import grids_factory
         assert not sets, sets
-        return grids_factory(args, kwargs)
+        return grids_factory(args, kwargs).mutate()
     if "cutout" in kwargs:
         from .grids import cutout_factory
         assert not sets, sets
-        return cutout_factory(args, kwargs)
+        return cutout_factory(args, kwargs).mutate()
     for name in ("datasets", "dataset"):
         if name in kwargs:

anemoi/datasets/data/missing.py ADDED Viewed

@@ -0,0 +1,240 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+import logging
+from functools import cached_property
+import numpy as np
+from anemoi.datasets.create.utils import to_datetime
+from anemoi.datasets.data import MissingDateError
+from .debug import Node
+from .debug import debug_indexing
+from .forwards import Forwards
+from .indexing import expand_list_indexing
+from .indexing import update_tuple
+LOG = logging.getLogger(__name__)
+class MissingDates(Forwards):
+    # TODO: Use that class instead of ZarrMissing
+    def __init__(self, dataset, missing_dates):
+        super().__init__(dataset)
+        self.missing_dates = []
+        self._missing = set()
+        other = []
+        for date in missing_dates:
+            if isinstance(date, int):
+                self._missing.add(date)
+                self.missing_dates.append(dataset.dates[date])
+            else:
+                date = to_datetime(date)
+                other.append(date)
+        if other:
+            for i, date in enumerate(dataset.dates):
+                if date in other:
+                    self._missing.add(i)
+                    self.missing_dates.append(date)
+        n = self.forward._len
+        self._missing = set(i for i in self._missing if 0 <= i < n)
+        self.missing_dates = sorted(to_datetime(x) for x in self.missing_dates)
+        assert len(self._missing), "No dates to force missing"
+    @cached_property
+    def missing(self):
+        return self._missing.union(self.forward.missing)
+    @debug_indexing
+    @expand_list_indexing
+    def __getitem__(self, n):
+        if isinstance(n, int):
+            if n in self.missing:
+                self._report_missing(n)
+            return self.forward[n]
+        if isinstance(n, slice):
+            common = set(range(*n.indices(len(self)))) & self.missing
+            if common:
+                self._report_missing(list(common)[0])
+            return self.forward[n]
+        if isinstance(n, tuple):
+            first = n[0]
+            if isinstance(first, int):
+                if first in self.missing:
+                    self._report_missing(first)
+                return self.forward[n]
+            if isinstance(first, slice):
+                common = set(range(*first.indices(len(self)))) & self.missing
+                if common:
+                    self._report_missing(list(common)[0])
+                return self.forward[n]
+            if isinstance(first, (list, tuple)):
+                common = set(first) & self.missing
+                if common:
+                    self._report_missing(list(common)[0])
+                return self.forward[n]
+        raise TypeError(f"Unsupported index {n} {type(n)}")
+    def _report_missing(self, n):
+        raise MissingDateError(f"Date {self.forward.dates[n]} is missing (index={n})")
+    @property
+    def reason(self):
+        return {"missing_dates": self.missing_dates}
+    def tree(self):
+        return Node(self, [self.forward.tree()], **self.reason)
+    def subclass_metadata_specific(self):
+        return {"missing_dates": self.missing_dates}
+class SkipMissingDates(Forwards):
+    def __init__(self, dataset, expected_access):
+        super().__init__(dataset)
+        # if isinstance(expected_access, (tuple, list)):
+        #     expected_access = slice(*expected_access)
+        if isinstance(expected_access, int):
+            expected_access = slice(0, expected_access)
+        assert isinstance(expected_access, slice), f"Expected access must be a slice, got {expected_access}"
+        expected_access = slice(*expected_access.indices(dataset._len))
+        missing = dataset.missing.copy()
+        size = (expected_access.stop - expected_access.start) // expected_access.step
+        indices = []
+        for i in range(dataset._len):
+            s = slice(expected_access.start + i, expected_access.stop + i, expected_access.step)
+            p = set(range(*s.indices(dataset._len)))
+            if p.intersection(missing):
+                continue
+            if len(p) != size:
+                continue
+            indices.append(tuple(sorted(p)))
+        self.expected_access = expected_access
+        self.indices = indices
+    def __len__(self):
+        return len(self.indices)
+    @property
+    def start_date(self):
+        return self.forward.start_date
+    @property
+    def end_date(self):
+        return self.forward.end_date
+    @property
+    def dates(self):
+        raise NotImplementedError("SkipMissingDates.dates")
+    @debug_indexing
+    @expand_list_indexing
+    def _get_tuple(self, index):
+        def _get_one(n):
+            result = []
+            for i in self.indices[n]:
+                s, _ = update_tuple(index, 0, i)
+                result.append(self.forward[s])
+            return tuple(result)
+        first = index[0]
+        if isinstance(first, int):
+            return _get_one(first)
+        assert isinstance(first, slice), f"SkipMissingDates._get_tuple {index}"
+        values = [_get_one(i) for i in range(*first.indices(self._len))]
+        result = [_ for _ in zip(*values)]
+        return tuple(np.stack(_) for _ in result)
+    @debug_indexing
+    def _get_slice(self, s):
+        values = [self[i] for i in range(*s.indices(self._len))]
+        result = [_ for _ in zip(*values)]
+        return tuple(np.stack(_) for _ in result)
+    @debug_indexing
+    def __getitem__(self, n):
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
+        if isinstance(n, slice):
+            return self._get_slice(n)
+        return tuple(self.forward[i] for i in self.indices[n])
+    @property
+    def frequency(self):
+        return self.forward.frequency
+    def tree(self):
+        return Node(self, [self.forward.tree()], expected_access=self.expected_access)
+    def subclass_metadata_specific(self):
+        return {"expected_access": self.expected_access}
+class MissingDataset(Forwards):
+    def __init__(self, dataset, start, end):
+        super().__init__(dataset)
+        self.start = start
+        self.end = end
+        dates = []
+        date = start
+        while date <= end:
+            dates.append(date)
+            date += dataset.frequency
+        self._dates = np.array(dates, dtype="datetime64")
+        self._missing = set(range(len(dates)))
+    def __len__(self):
+        return len(self._dates)
+    @property
+    def dates(self):
+        return self._dates
+    @property
+    def missing(self):
+        return self._missing
+    def __getitem__(self, n):
+        raise MissingDateError(f"Date {self.dates[n]} is missing (index={n})")
+    def tree(self):
+        return Node(self, [self.forward.tree()], start=self.start, end=self.end)
+    def subclass_metadata_specific(self):
+        return {"start": self.start, "end": self.end}

anemoi/datasets/data/select.py CHANGED Viewed

@@ -40,6 +40,12 @@ class Select(Forwards):
         # Forward other properties to the main dataset
         super().__init__(dataset)
+    def clone(self, dataset):
+        return self.__class__(dataset, self.indices, self.reason).mutate()
+    def mutate(self):
+        return self.forward.swap_with_parent(parent=self)
     @debug_indexing
     @expand_list_indexing
     def _get_tuple(self, index):
@@ -101,7 +107,7 @@ class Rename(Forwards):
     def __init__(self, dataset, rename):
         super().__init__(dataset)
         for n in rename:
-            assert n in dataset.variables
+            assert n in dataset.variables, n
         self._variables = [rename.get(v, v) for v in dataset.variables]
         self.rename = rename

anemoi/datasets/data/stores.py CHANGED Viewed

@@ -13,6 +13,7 @@ from urllib.parse import urlparse
 import numpy as np
 import zarr
+from anemoi.utils.dates import frequency_to_timedelta
 from . import MissingDateError
 from .dataset import Dataset
@@ -268,12 +269,11 @@ class Zarr(Dataset):
     @property
     def frequency(self):
         try:
-            return self.z.attrs["frequency"]
+            return frequency_to_timedelta(self.z.attrs["frequency"])
         except KeyError:
             LOG.warning("No 'frequency' in %r, computing from 'dates'", self)
         dates = self.dates
-        delta = dates[1].astype(object) - dates[0].astype(object)
-        return int(delta.total_seconds() / 3600)
+        return dates[1].astype(object) - dates[0].astype(object)
     @property
     def name_to_index(self):

anemoi/datasets/data/subset.py CHANGED Viewed

@@ -9,6 +9,7 @@ import logging
 from functools import cached_property
 import numpy as np
+from anemoi.utils.dates import frequency_to_timedelta
 from .debug import Node
 from .debug import Source
@@ -23,13 +24,51 @@ from .indexing import update_tuple
 LOG = logging.getLogger(__name__)
+def _default(a, b, dates):
+    return [a, b]
+def _start(a, b, dates):
+    from .misc import as_first_date
+    c = as_first_date(a, dates)
+    d = as_first_date(b, dates)
+    if c < d:
+        return b
+    else:
+        return a
+def _end(a, b, dates):
+    from .misc import as_last_date
+    c = as_last_date(a, dates)
+    d = as_last_date(b, dates)
+    if c < d:
+        return a
+    else:
+        return b
+def _combine_reasons(reason1, reason2, dates):
+    reason = reason1.copy()
+    for k, v in reason2.items():
+        if k not in reason:
+            reason[k] = v
+        else:
+            func = globals().get(f"_{k}", _default)
+            reason[k] = func(reason[k], v, dates)
+    return reason
 class Subset(Forwards):
     """Select a subset of the dates."""
     def __init__(self, dataset, indices, reason):
         while isinstance(dataset, Subset):
             indices = [dataset.indices[i] for i in indices]
-            reason = {**reason, **dataset.reason}
+            reason = _combine_reasons(reason, dataset.reason, dataset.dates)
             dataset = dataset.dataset
         self.dataset = dataset
@@ -39,6 +78,12 @@ class Subset(Forwards):
         # Forward other properties to the super dataset
         super().__init__(dataset)
+    def clone(self, dataset):
+        return self.__class__(dataset, self.indices, self.reason).mutate()
+    def mutate(self):
+        return self.forward.swap_with_parent(parent=self)
     @debug_indexing
     def __getitem__(self, n):
         if isinstance(n, tuple):
@@ -66,10 +111,8 @@ class Subset(Forwards):
     @expand_list_indexing
     def _get_tuple(self, n):
         index, changes = index_to_slices(n, self.shape)
-        # print('INDEX', index, changes)
         indices = [self.indices[i] for i in range(*index[0].indices(self._len))]
         indices = make_slice_or_index_from_list_or_tuple(indices)
-        # print('INDICES', indices)
         index, _ = update_tuple(index, 0, indices)
         result = self.dataset[index]
         result = apply_index_to_slices_changes(result, changes)
@@ -89,8 +132,7 @@ class Subset(Forwards):
     @cached_property
     def frequency(self):
         dates = self.dates
-        delta = dates[1].astype(object) - dates[0].astype(object)
-        return int(delta.total_seconds() / 3600)
+        return frequency_to_timedelta(dates[1].astype(object) - dates[0].astype(object))
     def source(self, index):
         return Source(self, index, self.forward.source(index))

anemoi/datasets/data/unchecked.py CHANGED Viewed

@@ -104,22 +104,29 @@ class Unchecked(Combined):
     def shape(self):
         raise NotImplementedError()
-    @property
-    def dtype(self):
-        raise NotImplementedError()
+    # @property
+    # def field_shape(self):
+    #     return tuple(d.shape for d in self.datasets)
-    @property
-    def grids(self):
-        raise NotImplementedError()
+    # @property
+    # def latitudes(self):
+    #     return tuple(d.latitudes for d in self.datasets)
+    # @property
+    # def longitudes(self):
+    #     return tuple(d.longitudes for d in self.datasets)
-class Zip(Unchecked):
+    # @property
+    # def statistics(self):
+    #     return tuple(d.statistics for d in self.datasets)
-    def __len__(self):
-        return min(len(d) for d in self.datasets)
+    # @property
+    # def resolution(self):
+    #     return tuple(d.resolution for d in self.datasets)
-    def __getitem__(self, n):
-        return tuple(d[n] for d in self.datasets)
+    # @property
+    # def name_to_index(self):
+    #     return tuple(d.name_to_index for d in self.datasets)
     @cached_property
     def missing(self):
@@ -142,17 +149,8 @@ class Chain(ConcatMixin, Unchecked):
     def dates(self):
         raise NotImplementedError()
-def zip_factory(args, kwargs):
-    zip = kwargs.pop("zip")
-    assert len(args) == 0
-    assert isinstance(zip, (list, tuple))
-    datasets = [_open(e) for e in zip]
-    datasets, kwargs = _auto_adjust(datasets, kwargs)
-    return Zip(datasets)._subset(**kwargs)
+    def dataset_metadata(self):
+        return {"multiple": [d.dataset_metadata() for d in self.datasets]}
 def chain_factory(args, kwargs):

anemoi-datasets 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

anemoi-datasets 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl