PyPI - anemoi-datasets - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

anemoi-datasets 0.5.0__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

anemoi/datasets/_version.py +2 -2
anemoi/datasets/commands/inspect.py +1 -1
anemoi/datasets/commands/publish.py +30 -0
anemoi/datasets/create/__init__.py +42 -3
anemoi/datasets/create/check.py +6 -0
anemoi/datasets/create/functions/filters/rename.py +2 -3
anemoi/datasets/create/functions/sources/__init__.py +7 -1
anemoi/datasets/create/functions/sources/accumulations.py +2 -0
anemoi/datasets/create/functions/sources/grib.py +1 -1
anemoi/datasets/create/functions/sources/xarray/__init__.py +7 -2
anemoi/datasets/create/functions/sources/xarray/coordinates.py +12 -1
anemoi/datasets/create/functions/sources/xarray/field.py +13 -4
anemoi/datasets/create/functions/sources/xarray/fieldlist.py +16 -16
anemoi/datasets/create/functions/sources/xarray/flavour.py +130 -13
anemoi/datasets/create/functions/sources/xarray/grid.py +106 -17
anemoi/datasets/create/functions/sources/xarray/metadata.py +3 -11
anemoi/datasets/create/functions/sources/xarray/time.py +1 -5
anemoi/datasets/create/functions/sources/xarray/variable.py +10 -10
anemoi/datasets/create/input/__init__.py +69 -0
anemoi/datasets/create/input/action.py +123 -0
anemoi/datasets/create/input/concat.py +92 -0
anemoi/datasets/create/input/context.py +59 -0
anemoi/datasets/create/input/data_sources.py +71 -0
anemoi/datasets/create/input/empty.py +42 -0
anemoi/datasets/create/input/filter.py +76 -0
anemoi/datasets/create/input/function.py +122 -0
anemoi/datasets/create/input/join.py +57 -0
anemoi/datasets/create/input/misc.py +85 -0
anemoi/datasets/create/input/pipe.py +33 -0
anemoi/datasets/create/input/repeated_dates.py +217 -0
anemoi/datasets/create/input/result.py +413 -0
anemoi/datasets/create/input/step.py +99 -0
anemoi/datasets/create/{template.py → input/template.py} +0 -42
anemoi/datasets/create/statistics/__init__.py +1 -1
anemoi/datasets/create/zarr.py +4 -2
anemoi/datasets/dates/__init__.py +1 -0
anemoi/datasets/dates/groups.py +12 -4
anemoi/datasets/fields.py +66 -0
anemoi/datasets/utils/fields.py +47 -0
{anemoi_datasets-0.5.0.dist-info → anemoi_datasets-0.5.6.dist-info}/METADATA +1 -1
{anemoi_datasets-0.5.0.dist-info → anemoi_datasets-0.5.6.dist-info}/RECORD +46 -30
anemoi/datasets/create/input.py +0 -1087
anemoi/datasets/create/{trace.py → input/trace.py} +0 -0
{anemoi_datasets-0.5.0.dist-info → anemoi_datasets-0.5.6.dist-info}/LICENSE +0 -0
{anemoi_datasets-0.5.0.dist-info → anemoi_datasets-0.5.6.dist-info}/WHEEL +0 -0
{anemoi_datasets-0.5.0.dist-info → anemoi_datasets-0.5.6.dist-info}/entry_points.txt +0 -0
{anemoi_datasets-0.5.0.dist-info → anemoi_datasets-0.5.6.dist-info}/top_level.txt +0 -0

anemoi/datasets/create/input/result.py ADDED Viewed

@@ -0,0 +1,413 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+import itertools
+import logging
+import math
+import time
+from collections import defaultdict
+from functools import cached_property
+import numpy as np
+from anemoi.utils.dates import as_datetime as as_datetime
+from anemoi.utils.dates import frequency_to_timedelta as frequency_to_timedelta
+from anemoi.utils.humanize import seconds_to_human
+from anemoi.utils.humanize import shorten_list
+from earthkit.data.core.order import build_remapping
+from anemoi.datasets.dates import DatesProvider as DatesProvider
+from anemoi.datasets.fields import FieldArray as FieldArray
+from anemoi.datasets.fields import NewValidDateTimeField as NewValidDateTimeField
+from .trace import trace
+from .trace import trace_datasource
+LOG = logging.getLogger(__name__)
+def _data_request(data):
+    date = None
+    params_levels = defaultdict(set)
+    params_steps = defaultdict(set)
+    area = grid = None
+    for field in data:
+        try:
+            if date is None:
+                date = field.metadata("valid_datetime")
+            if field.metadata("valid_datetime") != date:
+                continue
+            as_mars = field.metadata(namespace="mars")
+            if not as_mars:
+                continue
+            step = as_mars.get("step")
+            levtype = as_mars.get("levtype", "sfc")
+            param = as_mars["param"]
+            levelist = as_mars.get("levelist", None)
+            area = field.mars_area
+            grid = field.mars_grid
+            if levelist is None:
+                params_levels[levtype].add(param)
+            else:
+                params_levels[levtype].add((param, levelist))
+            if step:
+                params_steps[levtype].add((param, step))
+        except Exception:
+            LOG.error(f"Error in retrieving metadata (cannot build data request info) for {field}", exc_info=True)
+    def sort(old_dic):
+        new_dic = {}
+        for k, v in old_dic.items():
+            new_dic[k] = sorted(list(v))
+        return new_dic
+    params_steps = sort(params_steps)
+    params_levels = sort(params_levels)
+    return dict(param_level=params_levels, param_step=params_steps, area=area, grid=grid)
+class Result:
+    empty = False
+    _coords_already_built = False
+    def __init__(self, context, action_path, dates):
+        """Store the context, action path and group of dates this result is built for.
+
+        ``dates`` must be a ``GroupOfDates``, ``context`` an ``ActionContext`` and
+        ``action_path`` a list; each of these is enforced with an assertion.
+        """
+        # NOTE(review): imported locally, presumably to avoid circular imports - confirm.
+        from anemoi.datasets.dates.groups import GroupOfDates
+        from .action import ActionContext
+        assert isinstance(dates, GroupOfDates), dates
+        assert isinstance(context, ActionContext), type(context)
+        assert isinstance(action_path, list), action_path
+        self.context = context
+        self.group_of_dates = dates
+        self.action_path = action_path
+    @property
+    @trace_datasource
+    def datasource(self):
+        """Data source of this result; this base implementation always raises ``NotImplementedError``."""
+        self._raise_not_implemented()
+    @property
+    def data_request(self):
+        """Return a dictionary with the parameters needed to retrieve the data.
+
+        The dictionary is computed by ``_data_request`` from ``self.datasource``
+        each time the property is read (it is not cached here).
+        """
+        return _data_request(self.datasource)
+    def get_cube(self):
+        trace("🧊", f"getting cube from {self.__class__.__name__}")
+        ds = self.datasource
+        remapping = self.context.remapping
+        order_by = self.context.order_by
+        flatten_grid = self.context.flatten_grid
+        start = time.time()
+        LOG.debug("Sorting dataset %s %s", dict(order_by), remapping)
+        assert order_by, order_by
+        patches = {"number": {None: 0}}
+        try:
+            cube = ds.cube(
+                order_by,
+                remapping=remapping,
+                flatten_values=flatten_grid,
+                patches=patches,
+            )
+            cube = cube.squeeze()
+            LOG.debug(f"Sorting done in {seconds_to_human(time.time()-start)}.")
+        except ValueError:
+            self.explain(ds, order_by, remapping=remapping, patches=patches)
+            # raise ValueError(f"Error in {self}")
+            exit(1)
+        if LOG.isEnabledFor(logging.DEBUG):
+            LOG.debug("Cube shape: %s", cube)
+            for k, v in cube.user_coords.items():
+                LOG.debug("  %s %s", k, shorten_list(v, max_length=10))
+        return cube
+    def explain(self, ds, *args, remapping, patches):
+        """Print a diagnostic explaining why ``ds`` does not form a full hypercube, then exit.
+
+        ``args`` are the ordering keys (strings and/or dicts; a single list or tuple
+        is unpacked). The report compares the number of fields in ``ds`` with the
+        product of the number of unique values found for each key, and lists either
+        the missing/extra key combinations or the duplicated fields.
+
+        This method never returns: it ends by calling ``exit(1)``.
+        """
+        # Metadata keys shown next to each duplicated field in the report.
+        METADATA = (
+            "date",
+            "time",
+            "step",
+            "hdate",
+            "valid_datetime",
+            "levtype",
+            "levelist",
+            "number",
+            "level",
+            "shortName",
+            "paramId",
+            "variable",
+        )
+        # We redo the logic here
+        print()
+        print("❌" * 40)
+        print()
+        if len(args) == 1 and isinstance(args[0], (list, tuple)):
+            args = args[0]
+        # print("Executing", self.action_path)
+        # print("Dates:", compress_dates(self.dates))
+        # Collect the coordinate names: plain strings, or the keys of dict arguments.
+        names = []
+        for a in args:
+            if isinstance(a, str):
+                names.append(a)
+            elif isinstance(a, dict):
+                names += list(a.keys())
+        print(f"Building a {len(names)}D hypercube using", names)
+        ds = ds.order_by(*args, remapping=remapping, patches=patches)
+        user_coords = ds.unique_values(*names, remapping=remapping, patches=patches, progress_bar=False)
+        print()
+        print("Number of unique values found for each coordinate:")
+        for k, v in user_coords.items():
+            print(f"  {k:20}:", len(v), shorten_list(v, max_length=10))
+        print()
+        user_shape = tuple(len(v) for k, v in user_coords.items())
+        print("Shape of the hypercube           :", user_shape)
+        print(
+            "Number of expected fields        :", math.prod(user_shape), "=", " x ".join([str(i) for i in user_shape])
+        )
+        print("Number of fields in the dataset  :", len(ds))
+        print("Difference                       :", abs(len(ds) - math.prod(user_shape)))
+        print()
+        remapping = build_remapping(remapping, patches)
+        # Every combination of coordinate values a full hypercube would contain.
+        expected = set(itertools.product(*user_coords.values()))
+        extra = set()
+        if math.prod(user_shape) > len(ds):
+            # Fewer fields than combinations: some combinations are missing.
+            print(f"This means that all the fields in the datasets do not exists for all combinations of {names}.")
+            for f in ds:
+                metadata = remapping(f.metadata)
+                key = tuple(metadata(n, default=None) for n in names)
+                if key in expected:
+                    # Seen once: no longer missing. A second occurrence lands in `extra`.
+                    expected.remove(key)
+                else:
+                    extra.add(key)
+            print("Missing fields:")
+            print()
+            # Only the first ten entries are listed; the rest is summarised.
+            for i, f in enumerate(sorted(expected)):
+                print(" ", f)
+                if i >= 9 and len(expected) > 10:
+                    print("...", len(expected) - i - 1, "more")
+                    break
+            print("Extra fields:")
+            print()
+            for i, f in enumerate(sorted(extra)):
+                print(" ", f)
+                if i >= 9 and len(extra) > 10:
+                    print("...", len(extra) - i - 1, "more")
+                    break
+            print()
+            print("Missing values:")
+            # Group the missing combinations per coordinate name.
+            per_name = defaultdict(set)
+            for e in expected:
+                for n, v in zip(names, e):
+                    per_name[n].add(v)
+            for n, v in per_name.items():
+                print(" ", n, len(v), shorten_list(sorted(v), max_length=10))
+            print()
+            print("Extra values:")
+            per_name = defaultdict(set)
+            for e in extra:
+                for n, v in zip(names, e):
+                    per_name[n].add(v)
+            for n, v in per_name.items():
+                print(" ", n, len(v), shorten_list(sorted(v), max_length=10))
+            print()
+            print("To solve this issue, you can:")
+            print(
+                "  - Provide a better selection, like 'step: 0' or 'level: 1000' to "
+                "reduce the number of selected fields."
+            )
+            print(
+                "  - Split the 'input' part in smaller sections using 'join', "
+                "making sure that each section represent a full hypercube."
+            )
+        else:
+            # At least as many fields as combinations: report the duplicated ones.
+            print(f"More fields in dataset that expected for {names}. " "This means that some fields are duplicated.")
+            duplicated = defaultdict(list)
+            for f in ds:
+                # print(f.metadata(namespace="default"))
+                metadata = remapping(f.metadata)
+                key = tuple(metadata(n, default=None) for n in names)
+                duplicated[key].append(f)
+            print("Duplicated fields:")
+            print()
+            # Keep only the keys shared by more than one field.
+            duplicated = {k: v for k, v in duplicated.items() if len(v) > 1}
+            for i, (k, v) in enumerate(sorted(duplicated.items())):
+                print(" ", k)
+                for f in v:
+                    x = {k: f.metadata(k, default=None) for k in METADATA if f.metadata(k, default=None) is not None}
+                    print("   ", f, x)
+                if i >= 9 and len(duplicated) > 10:
+                    print("...", len(duplicated) - i - 1, "more")
+                    break
+            print()
+            print("To solve this issue, you can:")
+            print("  - Provide a better selection, like 'step: 0' or 'level: 1000'")
+            print("  - Change the way 'param' is computed using 'variable_naming' " "in the 'build' section.")
+        print()
+        print("❌" * 40)
+        print()
+        exit(1)
+    def __repr__(self, *args, _indent_="\n", **kwargs):
+        more = ",".join([str(a)[:5000] for a in args])
+        more += ",".join([f"{k}={v}"[:5000] for k, v in kwargs.items()])
+        dates = " no-dates"
+        if self.group_of_dates is not None:
+            dates = f" {len(self.group_of_dates)} dates"
+            dates += " ("
+            dates += "/".join(d.strftime("%Y-%m-%d:%H") for d in self.group_of_dates)
+            if len(dates) > 100:
+                dates = dates[:100] + "..."
+            dates += ")"
+        more = more[:5000]
+        txt = f"{self.__class__.__name__}:{dates}{_indent_}{more}"
+        if _indent_:
+            txt = txt.replace("\n", "\n  ")
+        return txt
+    def _raise_not_implemented(self):
+        """Raise ``NotImplementedError`` naming the concrete class of ``self``."""
+        raise NotImplementedError(f"Not implemented in {self.__class__.__name__}")
+    def _trace_datasource(self, *args, **kwargs):
+        """Return a ``ClassName(group_of_dates)`` label; ``args`` and ``kwargs`` are ignored.
+
+        NOTE(review): presumably consumed by the ``trace_datasource`` decorator - confirm in ``.trace``.
+        """
+        return f"{self.__class__.__name__}({self.group_of_dates})"
+    def build_coords(self):
+        if self._coords_already_built:
+            return
+        from_data = self.get_cube().user_coords
+        from_config = self.context.order_by
+        keys_from_config = list(from_config.keys())
+        keys_from_data = list(from_data.keys())
+        assert keys_from_data == keys_from_config, f"Critical error: {keys_from_data=} != {keys_from_config=}. {self=}"
+        variables_key = list(from_config.keys())[1]
+        ensembles_key = list(from_config.keys())[2]
+        if isinstance(from_config[variables_key], (list, tuple)):
+            assert all([v == w for v, w in zip(from_data[variables_key], from_config[variables_key])]), (
+                from_data[variables_key],
+                from_config[variables_key],
+            )
+        self._variables = from_data[variables_key]  # "param_level"
+        self._ensembles = from_data[ensembles_key]  # "number"
+        first_field = self.datasource[0]
+        grid_points = first_field.grid_points()
+        lats, lons = grid_points
+        assert len(lats) == len(lons), (len(lats), len(lons), first_field)
+        assert len(lats) == math.prod(first_field.shape), (len(lats), first_field.shape, first_field)
+        north = np.amax(lats)
+        south = np.amin(lats)
+        east = np.amax(lons)
+        west = np.amin(lons)
+        assert -90 <= south <= north <= 90, (south, north, first_field)
+        assert (-180 <= west <= east <= 180) or (0 <= west <= east <= 360), (
+            west,
+            east,
+            first_field,
+        )
+        grid_values = list(range(len(grid_points[0])))
+        self._grid_points = grid_points
+        self._resolution = first_field.resolution
+        self._grid_values = grid_values
+        self._field_shape = first_field.shape
+        self._proj_string = first_field.proj_string if hasattr(first_field, "proj_string") else None
+    @property
+    def variables(self):
+        """Variable coordinate values of the cube, as stored by ``build_coords``."""
+        self.build_coords()
+        return self._variables
+    @property
+    def ensembles(self):
+        """Ensemble coordinate values of the cube, as stored by ``build_coords``."""
+        self.build_coords()
+        return self._ensembles
+    @property
+    def resolution(self):
+        """``resolution`` attribute of the first field of the data source."""
+        self.build_coords()
+        return self._resolution
+    @property
+    def grid_values(self):
+        """List of integer indices ``0..n-1``, one per grid point of the first field."""
+        self.build_coords()
+        return self._grid_values
+    @property
+    def grid_points(self):
+        """The ``(lats, lons)`` pair returned by ``grid_points()`` on the first field."""
+        self.build_coords()
+        return self._grid_points
+    @property
+    def field_shape(self):
+        """``shape`` attribute of the first field of the data source."""
+        self.build_coords()
+        return self._field_shape
+    @property
+    def proj_string(self):
+        """``proj_string`` of the first field, or ``None`` when the field has no such attribute."""
+        self.build_coords()
+        return self._proj_string
+    @cached_property
+    def shape(self):
+        """Sizes as a list: number of dates, variables, ensembles and grid values (computed once)."""
+        return [
+            len(self.group_of_dates),
+            len(self.variables),
+            len(self.ensembles),
+            len(self.grid_values),
+        ]
+    @cached_property
+    def coords(self):
+        """Coordinate values keyed by ``dates``, ``variables``, ``ensembles`` and ``values`` (computed once)."""
+        return {
+            "dates": list(self.group_of_dates),
+            "variables": self.variables,
+            "ensembles": self.ensembles,
+            "values": self.grid_values,
+        }

anemoi/datasets/create/input/step.py ADDED Viewed

@@ -0,0 +1,99 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+import logging
+from copy import deepcopy
+from anemoi.utils.dates import as_datetime as as_datetime
+from anemoi.utils.dates import frequency_to_timedelta as frequency_to_timedelta
+from anemoi.datasets.dates import DatesProvider as DatesProvider
+from anemoi.datasets.fields import FieldArray as FieldArray
+from anemoi.datasets.fields import NewValidDateTimeField as NewValidDateTimeField
+from .action import Action
+from .context import Context
+from .misc import is_function
+from .result import Result
+from .template import notify_result
+from .trace import trace_datasource
+from .trace import trace_select
+LOG = logging.getLogger(__name__)
+class StepResult(Result):
+    """Result of a step applied on top of an upstream ``Result``."""
+    def __init__(self, context, action_path, group_of_dates, action, upstream_result):
+        """Keep the ``action`` that produced this result and the ``upstream_result`` it consumes.
+
+        ``upstream_result`` must be a ``Result`` (enforced with an assertion).
+        """
+        super().__init__(context, action_path, group_of_dates)
+        assert isinstance(upstream_result, Result), type(upstream_result)
+        self.upstream_result = upstream_result
+        self.action = action
+    @property
+    @notify_result
+    @trace_datasource
+    def datasource(self):
+        """Not provided at this level: always raises ``NotImplementedError``."""
+        raise NotImplementedError(f"Not implemented in {self.__class__.__name__}")
+class StepAction(Action):
+    result_class = None
+    def __init__(self, context, action_path, previous_step, *args, **kwargs):
+        super().__init__(context, action_path, *args, **kwargs)
+        self.previous_step = previous_step
+    @trace_select
+    def select(self, group_of_dates):
+        return self.result_class(
+            self.context,
+            self.action_path,
+            group_of_dates,
+            self,
+            self.previous_step.select(group_of_dates),
+        )
+    def __repr__(self):
+        return super().__repr__(self.previous_step, _inline_=str(self.kwargs))
+def step_factory(config, context, action_path, previous_step):
+    from .filter import FilterStepAction
+    from .filter import FunctionStepAction
+    assert isinstance(context, Context), (type, context)
+    if not isinstance(config, dict):
+        raise ValueError(f"Invalid input config {config}")
+    config = deepcopy(config)
+    assert len(config) == 1, config
+    key = list(config.keys())[0]
+    cls = dict(
+        filter=FilterStepAction,
+        # rename=RenameAction,
+        # remapping=RemappingAction,
+    ).get(key)
+    if isinstance(config[key], list):
+        args, kwargs = config[key], {}
+    if isinstance(config[key], dict):
+        args, kwargs = [], config[key]
+    if isinstance(config[key], str):
+        args, kwargs = [config[key]], {}
+    if cls is None:
+        if not is_function(key, "filters"):
+            raise ValueError(f"Unknown step {key}")
+        cls = FunctionStepAction
+        args = [key] + args
+    return cls(context, action_path, previous_step, *args, **kwargs)

anemoi/datasets/create/{template.py → input/template.py} RENAMED Viewed

@@ -9,14 +9,8 @@
 import logging
 import re
-import textwrap
 from functools import wraps
-from anemoi.utils.humanize import plural
-from .trace import step
-from .trace import trace
 LOG = logging.getLogger(__name__)
@@ -30,42 +24,6 @@ def notify_result(method):
     return wrapper
-class Context:
-    def __init__(self):
-        # used_references is a set of reference paths that will be needed
-        self.used_references = set()
-        # results is a dictionary of reference path -> obj
-        self.results = {}
-    def will_need_reference(self, key):
-        assert isinstance(key, (list, tuple)), key
-        key = tuple(key)
-        self.used_references.add(key)
-    def notify_result(self, key, result):
-        trace(
-            "🎯",
-            step(key),
-            "notify result",
-            textwrap.shorten(repr(result).replace(",", ", "), width=40),
-            plural(len(result), "field"),
-        )
-        assert isinstance(key, (list, tuple)), key
-        key = tuple(key)
-        if key in self.used_references:
-            if key in self.results:
-                raise ValueError(f"Duplicate result {key}")
-            self.results[key] = result
-    def get_result(self, key):
-        assert isinstance(key, (list, tuple)), key
-        key = tuple(key)
-        if key in self.results:
-            return self.results[key]
-        all_keys = sorted(list(self.results.keys()))
-        raise ValueError(f"Cannot find result {key} in {all_keys}")
 class Substitution:
     pass

anemoi/datasets/create/statistics/__init__.py CHANGED Viewed

@@ -155,7 +155,7 @@ def compute_statistics(array, check_variables_names=None, allow_nans=False):
             check_data_values(values[j, :], name=name, allow_nans=allow_nans)
             if np.isnan(values[j, :]).all():
                 # LOG.warning(f"All NaN values for {name} ({j}) for date {i}")
-                raise ValueError(f"All NaN values for {name} ({j}) for date {i}")
+                LOG.warning(f"All NaN values for {name} ({j}) for date {i}")
         # Ignore NaN values
         minimum[i] = np.nanmin(values, axis=1)

anemoi/datasets/create/zarr.py CHANGED Viewed

@@ -128,7 +128,7 @@ class ZarrBuiltRegistry:
     def add_to_history(self, action, **kwargs):
         new = dict(
             action=action,
-            timestamp=datetime.datetime.utcnow().isoformat(),
+            timestamp=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat(),
         )
         new.update(kwargs)
@@ -151,7 +151,9 @@ class ZarrBuiltRegistry:
     def set_flag(self, i, value=True):
         z = self._open_write()
-        z.attrs["latest_write_timestamp"] = datetime.datetime.utcnow().isoformat()
+        z.attrs["latest_write_timestamp"] = (
+            datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()
+        )
         z["_build"][self.name_flags][i] = value
     def ready(self):

anemoi/datasets/dates/__init__.py CHANGED Viewed

@@ -12,6 +12,7 @@ import warnings
 # from anemoi.utils.dates import as_datetime
 from anemoi.utils.dates import DateTimes
 from anemoi.utils.dates import as_datetime
+from anemoi.utils.dates import frequency_to_string
 from anemoi.utils.dates import frequency_to_timedelta
 from anemoi.utils.hindcasts import HindcastDatesTimes
 from anemoi.utils.humanize import print_dates

anemoi/datasets/dates/groups.py CHANGED Viewed

@@ -9,18 +9,26 @@
 import itertools
 from functools import cached_property
-from anemoi.datasets.create.input import shorten
 from anemoi.datasets.dates import DatesProvider
 from anemoi.datasets.dates import as_datetime
+def _shorten(dates):
+    if isinstance(dates, (list, tuple)):
+        dates = [d.isoformat() for d in dates]
+        if len(dates) > 5:
+            return f"{dates[0]}...{dates[-1]}"
+    return dates
 class GroupOfDates:
-    def __init__(self, dates, provider):
+    def __init__(self, dates, provider, partial_ok=False):
         assert isinstance(provider, DatesProvider), type(provider)
         assert isinstance(dates, list)
         self.dates = dates
         self.provider = provider
+        self.partial_ok = partial_ok
     def __len__(self):
         return len(self.dates)
@@ -29,7 +37,7 @@ class GroupOfDates:
         return iter(self.dates)
     def __repr__(self) -> str:
-        return f"GroupOfDates(dates={shorten(self.dates)})"
+        return f"GroupOfDates(dates={_shorten(self.dates)})"
     def __eq__(self, other: object) -> bool:
         return isinstance(other, GroupOfDates) and self.dates == other.dates
@@ -93,7 +101,7 @@ class Groups:
         return n
     def __repr__(self):
-        return f"{self.__class__.__name__}(dates={len(self)},{shorten(self._dates)})"
+        return f"{self.__class__.__name__}(dates={len(self)},{_shorten(self._dates)})"
     def describe(self):
         return self.dates.summary

anemoi-datasets 0.5.0__py3-none-any.whl → 0.5.6__py3-none-any.whl

anemoi-datasets 0.5.0__py3-none-any.whl → 0.5.6__py3-none-any.whl