anemoi-datasets 0.4.5__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- anemoi/datasets/_version.py +2 -2
- anemoi/datasets/commands/create.py +3 -2
- anemoi/datasets/commands/inspect.py +1 -1
- anemoi/datasets/commands/publish.py +30 -0
- anemoi/datasets/create/__init__.py +72 -35
- anemoi/datasets/create/check.py +6 -0
- anemoi/datasets/create/config.py +4 -3
- anemoi/datasets/create/functions/filters/pressure_level_relative_humidity_to_specific_humidity.py +57 -0
- anemoi/datasets/create/functions/filters/pressure_level_specific_humidity_to_relative_humidity.py +57 -0
- anemoi/datasets/create/functions/filters/rename.py +2 -3
- anemoi/datasets/create/functions/filters/single_level_dewpoint_to_relative_humidity.py +54 -0
- anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_dewpoint.py +59 -0
- anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_specific_humidity.py +115 -0
- anemoi/datasets/create/functions/filters/single_level_specific_humidity_to_relative_humidity.py +390 -0
- anemoi/datasets/create/functions/filters/speeddir_to_uv.py +77 -0
- anemoi/datasets/create/functions/filters/uv_to_speeddir.py +55 -0
- anemoi/datasets/create/functions/sources/__init__.py +7 -1
- anemoi/datasets/create/functions/sources/accumulations.py +2 -0
- anemoi/datasets/create/functions/sources/grib.py +87 -2
- anemoi/datasets/create/functions/sources/hindcasts.py +14 -73
- anemoi/datasets/create/functions/sources/mars.py +9 -3
- anemoi/datasets/create/functions/sources/xarray/__init__.py +6 -1
- anemoi/datasets/create/functions/sources/xarray/coordinates.py +6 -1
- anemoi/datasets/create/functions/sources/xarray/field.py +20 -5
- anemoi/datasets/create/functions/sources/xarray/fieldlist.py +16 -16
- anemoi/datasets/create/functions/sources/xarray/flavour.py +126 -12
- anemoi/datasets/create/functions/sources/xarray/grid.py +106 -17
- anemoi/datasets/create/functions/sources/xarray/metadata.py +6 -12
- anemoi/datasets/create/functions/sources/xarray/time.py +1 -5
- anemoi/datasets/create/functions/sources/xarray/variable.py +10 -10
- anemoi/datasets/create/input/__init__.py +69 -0
- anemoi/datasets/create/input/action.py +123 -0
- anemoi/datasets/create/input/concat.py +92 -0
- anemoi/datasets/create/input/context.py +59 -0
- anemoi/datasets/create/input/data_sources.py +71 -0
- anemoi/datasets/create/input/empty.py +42 -0
- anemoi/datasets/create/input/filter.py +76 -0
- anemoi/datasets/create/input/function.py +122 -0
- anemoi/datasets/create/input/join.py +57 -0
- anemoi/datasets/create/input/misc.py +85 -0
- anemoi/datasets/create/input/pipe.py +33 -0
- anemoi/datasets/create/input/repeated_dates.py +217 -0
- anemoi/datasets/create/input/result.py +413 -0
- anemoi/datasets/create/input/step.py +99 -0
- anemoi/datasets/create/{template.py → input/template.py} +0 -42
- anemoi/datasets/create/persistent.py +1 -1
- anemoi/datasets/create/statistics/__init__.py +1 -1
- anemoi/datasets/create/utils.py +3 -0
- anemoi/datasets/create/zarr.py +4 -2
- anemoi/datasets/data/dataset.py +11 -1
- anemoi/datasets/data/debug.py +5 -1
- anemoi/datasets/data/masked.py +2 -2
- anemoi/datasets/data/rescale.py +147 -0
- anemoi/datasets/data/stores.py +20 -7
- anemoi/datasets/dates/__init__.py +113 -30
- anemoi/datasets/dates/groups.py +92 -19
- anemoi/datasets/fields.py +66 -0
- anemoi/datasets/utils/fields.py +47 -0
- {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/METADATA +10 -19
- anemoi_datasets-0.5.5.dist-info/RECORD +121 -0
- {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/WHEEL +1 -1
- anemoi/datasets/create/input.py +0 -1065
- anemoi_datasets-0.4.5.dist-info/RECORD +0 -96
- /anemoi/datasets/create/{trace.py → input/trace.py} +0 -0
- {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/LICENSE +0 -0
- {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/entry_points.txt +0 -0
- {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/top_level.txt +0 -0
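
Among the additions above are paired meteorological filters (for example `speeddir_to_uv.py` and its inverse `uv_to_speeddir.py`). Their source is not shown in this diff, but as an illustration of the kind of transformation such a filter applies, here is a minimal sketch of the standard speed/direction to u/v wind conversion; the function name and signature are illustrative, not the package's API:

```python
# Minimal sketch of a speed/direction -> u/v wind conversion, assuming the
# usual meteorological convention (direction = where the wind blows FROM).
# The helper below is hypothetical, not anemoi-datasets code.
import numpy as np

def speeddir_to_uv(speed: np.ndarray, direction_deg: np.ndarray):
    """Return (u, v) components from wind speed and direction in degrees."""
    rad = np.deg2rad(direction_deg)
    u = -speed * np.sin(rad)  # eastward component
    v = -speed * np.cos(rad)  # northward component
    return u, v

# Example: a 10 m/s wind blowing from the north gives u ~ 0, v = -10.
print(speeddir_to_uv(np.array([10.0]), np.array([0.0])))
```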
anemoi/datasets/create/input/repeated_dates.py
@@ -0,0 +1,217 @@
+# (C) Copyright 2023 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+
+import logging
+from collections import defaultdict
+
+import numpy as np
+from anemoi.utils.dates import as_datetime
+from anemoi.utils.dates import frequency_to_timedelta
+
+from anemoi.datasets.fields import FieldArray
+from anemoi.datasets.fields import NewValidDateTimeField
+
+from .action import Action
+from .action import action_factory
+from .join import JoinResult
+from .result import Result
+from .trace import trace_select
+
+LOG = logging.getLogger(__name__)
+
+
+class DateMapper:
+
+    @staticmethod
+    def from_mode(mode, source, config):
+
+        MODES = dict(
+            closest=DateMapperClosest,
+            climatology=DateMapperClimatology,
+            constant=DateMapperConstant,
+        )
+
+        if mode not in MODES:
+            raise ValueError(f"Invalid mode for DateMapper: {mode}")
+
+        return MODES[mode](source, **config)
+
+
+class DateMapperClosest(DateMapper):
+    def __init__(self, source, frequency="1h", maximum="30d", skip_all_nans=False):
+        self.source = source
+        self.maximum = frequency_to_timedelta(maximum)
+        self.frequency = frequency_to_timedelta(frequency)
+        self.skip_all_nans = skip_all_nans
+        self.tried = set()
+        self.found = set()
+
+    def transform(self, group_of_dates):
+        from anemoi.datasets.dates.groups import GroupOfDates
+
+        asked_dates = list(group_of_dates)
+        if not asked_dates:
+            return []
+
+        to_try = set()
+        for date in asked_dates:
+            start = date
+            while start >= date - self.maximum:
+                to_try.add(start)
+                start -= self.frequency
+
+            end = date
+            while end <= date + self.maximum:
+                to_try.add(end)
+                end += self.frequency
+
+        to_try = sorted(to_try - self.tried)
+
+        if to_try:
+            result = self.source.select(
+                GroupOfDates(
+                    sorted(to_try),
+                    group_of_dates.provider,
+                    partial_ok=True,
+                )
+            )
+
+            for f in result.datasource:
+                # We could keep the fields in a dictionary, but we don't want to keep the fields in memory
+                date = as_datetime(f.metadata("valid_datetime"))
+
+                if self.skip_all_nans:
+                    if np.isnan(f.to_numpy()).all():
+                        LOG.warning(f"Skipping {date} because all values are NaN")
+                        continue
+
+                self.found.add(date)
+
+            self.tried.update(to_try)
+
+        new_dates = defaultdict(list)
+
+        for date in asked_dates:
+            best = None
+            for found_date in sorted(self.found):
+                delta = abs(date - found_date)
+                # With < we prefer the first date
+                # With <= we prefer the last date
+                if best is None or delta <= best[0]:
+                    best = delta, found_date
+            new_dates[best[1]].append(date)
+
+        for date, dates in new_dates.items():
+            yield (
+                GroupOfDates([date], group_of_dates.provider),
+                GroupOfDates(dates, group_of_dates.provider),
+            )
+
+
+class DateMapperClimatology(DateMapper):
+    def __init__(self, source, year, day):
+        self.year = year
+        self.day = day
+
+    def transform(self, group_of_dates):
+        from anemoi.datasets.dates.groups import GroupOfDates
+
+        dates = list(group_of_dates)
+        if not dates:
+            return []
+
+        new_dates = defaultdict(list)
+        for date in dates:
+            new_date = date.replace(year=self.year, day=self.day)
+            new_dates[new_date].append(date)
+
+        for date, dates in new_dates.items():
+            yield (
+                GroupOfDates([date], group_of_dates.provider),
+                GroupOfDates(dates, group_of_dates.provider),
+            )
+
+
+class DateMapperConstant(DateMapper):
+    def __init__(self, source, date=None):
+        self.source = source
+        self.date = date
+
+    def transform(self, group_of_dates):
+        from anemoi.datasets.dates.groups import GroupOfDates
+
+        if self.date is None:
+            return [
+                (
+                    GroupOfDates([], group_of_dates.provider),
+                    group_of_dates,
+                )
+            ]
+
+        return [
+            (
+                GroupOfDates([self.date], group_of_dates.provider),
+                group_of_dates,
+            )
+        ]
+
+
+class DateMapperResult(Result):
+    def __init__(
+        self,
+        context,
+        action_path,
+        group_of_dates,
+        source_result,
+        mapper,
+        original_group_of_dates,
+    ):
+        super().__init__(context, action_path, group_of_dates)
+
+        self.source_results = source_result
+        self.mapper = mapper
+        self.original_group_of_dates = original_group_of_dates
+
+    @property
+    def datasource(self):
+        result = []
+
+        for field in self.source_results.datasource:
+            for date in self.original_group_of_dates:
+                result.append(NewValidDateTimeField(field, date))
+
+        return FieldArray(result)
+
+
+class RepeatedDatesAction(Action):
+    def __init__(self, context, action_path, source, mode, **kwargs):
+        super().__init__(context, action_path, source, mode, **kwargs)
+
+        self.source = action_factory(source, context, action_path + ["source"])
+        self.mapper = DateMapper.from_mode(mode, self.source, kwargs)
+
+    @trace_select
+    def select(self, group_of_dates):
+        results = []
+        for one_date_group, many_dates_group in self.mapper.transform(group_of_dates):
+            results.append(
+                DateMapperResult(
+                    self.context,
+                    self.action_path,
+                    one_date_group,
+                    self.source.select(one_date_group),
+                    self.mapper,
+                    many_dates_group,
+                )
+            )
+
+        return JoinResult(self.context, self.action_path, group_of_dates, results)
+
+    def __repr__(self):
+        return f"MultiDateMatchAction({self.source}, {self.mapper})"
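
The `DateMapper` subclasses above all re-key the requested dates onto a (usually smaller) set of source dates, so that one source request can serve many output dates. A self-contained sketch of the grouping idea behind `DateMapperClimatology.transform`, using plain `datetime` objects in place of `GroupOfDates` (illustration only, not the package's API):

```python
# Standalone illustration of the climatology date mapping: every requested
# date is remapped to a fixed (year, day) and the original dates are grouped
# under that single "source" date.
import datetime
from collections import defaultdict

def climatology_mapping(dates, year, day):
    new_dates = defaultdict(list)
    for date in dates:
        new_dates[date.replace(year=year, day=day)].append(date)
    # Each source date is paired with the requested dates it will serve.
    return list(new_dates.items())

asked = [datetime.datetime(2021, m, 15, 12) for m in (1, 2, 3)]
for source_date, served in climatology_mapping(asked, year=2000, day=1):
    print(source_date, "->", served)
```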
anemoi/datasets/create/input/result.py
@@ -0,0 +1,413 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+import itertools
+import logging
+import math
+import time
+from collections import defaultdict
+from functools import cached_property
+
+import numpy as np
+from anemoi.utils.dates import as_datetime as as_datetime
+from anemoi.utils.dates import frequency_to_timedelta as frequency_to_timedelta
+from anemoi.utils.humanize import seconds_to_human
+from anemoi.utils.humanize import shorten_list
+from earthkit.data.core.order import build_remapping
+
+from anemoi.datasets.dates import DatesProvider as DatesProvider
+from anemoi.datasets.fields import FieldArray as FieldArray
+from anemoi.datasets.fields import NewValidDateTimeField as NewValidDateTimeField
+
+from .trace import trace
+from .trace import trace_datasource
+
+LOG = logging.getLogger(__name__)
+
+
+def _data_request(data):
+    date = None
+    params_levels = defaultdict(set)
+    params_steps = defaultdict(set)
+
+    area = grid = None
+
+    for field in data:
+        try:
+            if date is None:
+                date = field.metadata("valid_datetime")
+
+            if field.metadata("valid_datetime") != date:
+                continue
+
+            as_mars = field.metadata(namespace="mars")
+            if not as_mars:
+                continue
+            step = as_mars.get("step")
+            levtype = as_mars.get("levtype", "sfc")
+            param = as_mars["param"]
+            levelist = as_mars.get("levelist", None)
+            area = field.mars_area
+            grid = field.mars_grid
+
+            if levelist is None:
+                params_levels[levtype].add(param)
+            else:
+                params_levels[levtype].add((param, levelist))
+
+            if step:
+                params_steps[levtype].add((param, step))
+        except Exception:
+            LOG.error(f"Error in retrieving metadata (cannot build data request info) for {field}", exc_info=True)
+
+    def sort(old_dic):
+        new_dic = {}
+        for k, v in old_dic.items():
+            new_dic[k] = sorted(list(v))
+        return new_dic
+
+    params_steps = sort(params_steps)
+    params_levels = sort(params_levels)
+
+    return dict(param_level=params_levels, param_step=params_steps, area=area, grid=grid)
+
+
+class Result:
+    empty = False
+    _coords_already_built = False
+
+    def __init__(self, context, action_path, dates):
+        from anemoi.datasets.dates.groups import GroupOfDates
+
+        from .action import ActionContext
+
+        assert isinstance(dates, GroupOfDates), dates
+
+        assert isinstance(context, ActionContext), type(context)
+        assert isinstance(action_path, list), action_path
+
+        self.context = context
+        self.group_of_dates = dates
+        self.action_path = action_path
+
+    @property
+    @trace_datasource
+    def datasource(self):
+        self._raise_not_implemented()
+
+    @property
+    def data_request(self):
+        """Returns a dictionary with the parameters needed to retrieve the data."""
+        return _data_request(self.datasource)
+
+    def get_cube(self):
+        trace("🧊", f"getting cube from {self.__class__.__name__}")
+        ds = self.datasource
+
+        remapping = self.context.remapping
+        order_by = self.context.order_by
+        flatten_grid = self.context.flatten_grid
+        start = time.time()
+        LOG.debug("Sorting dataset %s %s", dict(order_by), remapping)
+        assert order_by, order_by
+
+        patches = {"number": {None: 0}}
+
+        try:
+            cube = ds.cube(
+                order_by,
+                remapping=remapping,
+                flatten_values=flatten_grid,
+                patches=patches,
+            )
+            cube = cube.squeeze()
+            LOG.debug(f"Sorting done in {seconds_to_human(time.time()-start)}.")
+        except ValueError:
+            self.explain(ds, order_by, remapping=remapping, patches=patches)
+            # raise ValueError(f"Error in {self}")
+            exit(1)
+
+        if LOG.isEnabledFor(logging.DEBUG):
+            LOG.debug("Cube shape: %s", cube)
+            for k, v in cube.user_coords.items():
+                LOG.debug(" %s %s", k, shorten_list(v, max_length=10))
+
+        return cube
+
+    def explain(self, ds, *args, remapping, patches):
+
+        METADATA = (
+            "date",
+            "time",
+            "step",
+            "hdate",
+            "valid_datetime",
+            "levtype",
+            "levelist",
+            "number",
+            "level",
+            "shortName",
+            "paramId",
+            "variable",
+        )
+
+        # We redo the logic here
+        print()
+        print("❌" * 40)
+        print()
+        if len(args) == 1 and isinstance(args[0], (list, tuple)):
+            args = args[0]
+
+        # print("Executing", self.action_path)
+        # print("Dates:", compress_dates(self.dates))
+
+        names = []
+        for a in args:
+            if isinstance(a, str):
+                names.append(a)
+            elif isinstance(a, dict):
+                names += list(a.keys())
+
+        print(f"Building a {len(names)}D hypercube using", names)
+        ds = ds.order_by(*args, remapping=remapping, patches=patches)
+        user_coords = ds.unique_values(*names, remapping=remapping, patches=patches, progress_bar=False)
+
+        print()
+        print("Number of unique values found for each coordinate:")
+        for k, v in user_coords.items():
+            print(f" {k:20}:", len(v), shorten_list(v, max_length=10))
+        print()
+        user_shape = tuple(len(v) for k, v in user_coords.items())
+        print("Shape of the hypercube :", user_shape)
+        print(
+            "Number of expected fields :", math.prod(user_shape), "=", " x ".join([str(i) for i in user_shape])
+        )
+        print("Number of fields in the dataset :", len(ds))
+        print("Difference :", abs(len(ds) - math.prod(user_shape)))
+        print()
+
+        remapping = build_remapping(remapping, patches)
+        expected = set(itertools.product(*user_coords.values()))
+        extra = set()
+
+        if math.prod(user_shape) > len(ds):
+            print(f"This means that all the fields in the datasets do not exists for all combinations of {names}.")
+
+            for f in ds:
+                metadata = remapping(f.metadata)
+                key = tuple(metadata(n, default=None) for n in names)
+                if key in expected:
+                    expected.remove(key)
+                else:
+                    extra.add(key)
+
+            print("Missing fields:")
+            print()
+            for i, f in enumerate(sorted(expected)):
+                print(" ", f)
+                if i >= 9 and len(expected) > 10:
+                    print("...", len(expected) - i - 1, "more")
+                    break
+
+            print("Extra fields:")
+            print()
+            for i, f in enumerate(sorted(extra)):
+                print(" ", f)
+                if i >= 9 and len(extra) > 10:
+                    print("...", len(extra) - i - 1, "more")
+                    break
+
+            print()
+            print("Missing values:")
+            per_name = defaultdict(set)
+            for e in expected:
+                for n, v in zip(names, e):
+                    per_name[n].add(v)
+
+            for n, v in per_name.items():
+                print(" ", n, len(v), shorten_list(sorted(v), max_length=10))
+            print()
+
+            print("Extra values:")
+            per_name = defaultdict(set)
+            for e in extra:
+                for n, v in zip(names, e):
+                    per_name[n].add(v)
+
+            for n, v in per_name.items():
+                print(" ", n, len(v), shorten_list(sorted(v), max_length=10))
+            print()
+
+            print("To solve this issue, you can:")
+            print(
+                " - Provide a better selection, like 'step: 0' or 'level: 1000' to "
+                "reduce the number of selected fields."
+            )
+            print(
+                " - Split the 'input' part in smaller sections using 'join', "
+                "making sure that each section represent a full hypercube."
+            )
+
+        else:
+            print(f"More fields in dataset that expected for {names}. " "This means that some fields are duplicated.")
+            duplicated = defaultdict(list)
+            for f in ds:
+                # print(f.metadata(namespace="default"))
+                metadata = remapping(f.metadata)
+                key = tuple(metadata(n, default=None) for n in names)
+                duplicated[key].append(f)
+
+            print("Duplicated fields:")
+            print()
+            duplicated = {k: v for k, v in duplicated.items() if len(v) > 1}
+            for i, (k, v) in enumerate(sorted(duplicated.items())):
+                print(" ", k)
+                for f in v:
+                    x = {k: f.metadata(k, default=None) for k in METADATA if f.metadata(k, default=None) is not None}
+                    print(" ", f, x)
+                if i >= 9 and len(duplicated) > 10:
+                    print("...", len(duplicated) - i - 1, "more")
+                    break
+
+            print()
+            print("To solve this issue, you can:")
+            print(" - Provide a better selection, like 'step: 0' or 'level: 1000'")
+            print(" - Change the way 'param' is computed using 'variable_naming' " "in the 'build' section.")
+
+        print()
+        print("❌" * 40)
+        print()
+        exit(1)
+
+    def __repr__(self, *args, _indent_="\n", **kwargs):
+        more = ",".join([str(a)[:5000] for a in args])
+        more += ",".join([f"{k}={v}"[:5000] for k, v in kwargs.items()])
+
+        dates = " no-dates"
+        if self.group_of_dates is not None:
+            dates = f" {len(self.group_of_dates)} dates"
+            dates += " ("
+            dates += "/".join(d.strftime("%Y-%m-%d:%H") for d in self.group_of_dates)
+            if len(dates) > 100:
+                dates = dates[:100] + "..."
+            dates += ")"
+
+        more = more[:5000]
+        txt = f"{self.__class__.__name__}:{dates}{_indent_}{more}"
+        if _indent_:
+            txt = txt.replace("\n", "\n ")
+        return txt
+
+    def _raise_not_implemented(self):
+        raise NotImplementedError(f"Not implemented in {self.__class__.__name__}")
+
+    def _trace_datasource(self, *args, **kwargs):
+        return f"{self.__class__.__name__}({self.group_of_dates})"
+
+    def build_coords(self):
+        if self._coords_already_built:
+            return
+        from_data = self.get_cube().user_coords
+        from_config = self.context.order_by
+
+        keys_from_config = list(from_config.keys())
+        keys_from_data = list(from_data.keys())
+        assert keys_from_data == keys_from_config, f"Critical error: {keys_from_data=} != {keys_from_config=}. {self=}"
+
+        variables_key = list(from_config.keys())[1]
+        ensembles_key = list(from_config.keys())[2]
+
+        if isinstance(from_config[variables_key], (list, tuple)):
+            assert all([v == w for v, w in zip(from_data[variables_key], from_config[variables_key])]), (
+                from_data[variables_key],
+                from_config[variables_key],
+            )
+
+        self._variables = from_data[variables_key]  # "param_level"
+        self._ensembles = from_data[ensembles_key]  # "number"
+
+        first_field = self.datasource[0]
+        grid_points = first_field.grid_points()
+
+        lats, lons = grid_points
+
+        assert len(lats) == len(lons), (len(lats), len(lons), first_field)
+        assert len(lats) == math.prod(first_field.shape), (len(lats), first_field.shape, first_field)
+
+        north = np.amax(lats)
+        south = np.amin(lats)
+        east = np.amax(lons)
+        west = np.amin(lons)
+
+        assert -90 <= south <= north <= 90, (south, north, first_field)
+        assert (-180 <= west <= east <= 180) or (0 <= west <= east <= 360), (
+            west,
+            east,
+            first_field,
+        )
+
+        grid_values = list(range(len(grid_points[0])))
+
+        self._grid_points = grid_points
+        self._resolution = first_field.resolution
+        self._grid_values = grid_values
+        self._field_shape = first_field.shape
+        self._proj_string = first_field.proj_string if hasattr(first_field, "proj_string") else None
+
+    @property
+    def variables(self):
+        self.build_coords()
+        return self._variables
+
+    @property
+    def ensembles(self):
+        self.build_coords()
+        return self._ensembles
+
+    @property
+    def resolution(self):
+        self.build_coords()
+        return self._resolution
+
+    @property
+    def grid_values(self):
+        self.build_coords()
+        return self._grid_values
+
+    @property
+    def grid_points(self):
+        self.build_coords()
+        return self._grid_points
+
+    @property
+    def field_shape(self):
+        self.build_coords()
+        return self._field_shape
+
+    @property
+    def proj_string(self):
+        self.build_coords()
+        return self._proj_string
+
+    @cached_property
+    def shape(self):
+        return [
+            len(self.group_of_dates),
+            len(self.variables),
+            len(self.ensembles),
+            len(self.grid_values),
+        ]
+
+    @cached_property
+    def coords(self):
+        return {
+            "dates": list(self.group_of_dates),
+            "variables": self.variables,
+            "ensembles": self.ensembles,
+            "values": self.grid_values,
+        }