anemoi-datasets 0.5.28__py3-none-any.whl → 0.5.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anemoi/datasets/_version.py +2 -2
- anemoi/datasets/create/__init__.py +4 -12
- anemoi/datasets/create/config.py +50 -53
- anemoi/datasets/create/input/result/field.py +1 -3
- anemoi/datasets/create/sources/accumulate.py +517 -0
- anemoi/datasets/create/sources/accumulate_utils/__init__.py +8 -0
- anemoi/datasets/create/sources/accumulate_utils/covering_intervals.py +221 -0
- anemoi/datasets/create/sources/accumulate_utils/field_to_interval.py +149 -0
- anemoi/datasets/create/sources/accumulate_utils/interval_generators.py +321 -0
- anemoi/datasets/create/sources/grib_index.py +64 -20
- anemoi/datasets/create/sources/mars.py +56 -27
- anemoi/datasets/create/sources/xarray_support/__init__.py +1 -0
- anemoi/datasets/create/sources/xarray_support/coordinates.py +1 -4
- anemoi/datasets/create/sources/xarray_support/flavour.py +2 -2
- anemoi/datasets/create/sources/xarray_support/patch.py +178 -5
- anemoi/datasets/data/complement.py +26 -17
- anemoi/datasets/data/dataset.py +6 -0
- anemoi/datasets/data/masked.py +74 -13
- anemoi/datasets/data/missing.py +5 -0
- {anemoi_datasets-0.5.28.dist-info → anemoi_datasets-0.5.29.dist-info}/METADATA +7 -7
- {anemoi_datasets-0.5.28.dist-info → anemoi_datasets-0.5.29.dist-info}/RECORD +25 -23
- {anemoi_datasets-0.5.28.dist-info → anemoi_datasets-0.5.29.dist-info}/WHEEL +1 -1
- anemoi/datasets/create/sources/accumulations.py +0 -1042
- anemoi/datasets/create/sources/accumulations2.py +0 -618
- anemoi/datasets/create/sources/tendencies.py +0 -171
- {anemoi_datasets-0.5.28.dist-info → anemoi_datasets-0.5.29.dist-info}/entry_points.txt +0 -0
- {anemoi_datasets-0.5.28.dist-info → anemoi_datasets-0.5.29.dist-info}/licenses/LICENSE +0 -0
- {anemoi_datasets-0.5.28.dist-info → anemoi_datasets-0.5.29.dist-info}/top_level.txt +0 -0
--- a/anemoi/datasets/create/sources/grib_index.py
+++ b/anemoi/datasets/create/sources/grib_index.py
@@ -7,9 +7,12 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+import hashlib
+import json
 import logging
 import os
 import sqlite3
+from collections import defaultdict
 from collections.abc import Iterator
 from typing import Any
 
@@ -46,8 +49,8 @@ class GribIndex:
     ----------
     database : str
         Path to the SQLite database file.
-    keys : Optional[
-
+    keys : Optional[list[str] | str], optional
+        list of keys or a string of keys to use for indexing, by default None.
     flavour : Optional[str], optional
         Flavour configuration for mapping fields, by default None.
     update : bool, optional
@@ -161,7 +164,7 @@ class GribIndex:
 
         Returns
         -------
-
+        list[str]
             A list of metadata keys stored in the database.
         """
         self.cursor.execute("SELECT key FROM metadata_keys")
@@ -229,7 +232,7 @@ class GribIndex:
 
         Returns
         -------
-
+        list[str]
             A list of column names.
         """
         if self._columns is not None:
@@ -245,8 +248,8 @@ class GribIndex:
 
         Parameters
         ----------
-        columns :
-
+        columns : list[str]
+            list of column names to ensure in the table.
         """
         assert self.update
 
@@ -364,7 +367,7 @@ class GribIndex:
 
         Returns
         -------
-
+        list[dict]
             A list of GRIB2 parameter information.
         """
         if ("grib2", paramId) in self.cache:
@@ -524,8 +527,8 @@ class GribIndex:
 
         Parameters
         ----------
-        dates :
-
+        dates : list[Any]
+            list of dates to retrieve data for.
         **kwargs : Any
             Additional filtering criteria.
 
@@ -545,6 +548,9 @@ class GribIndex:
         params = dates
 
         for k, v in kwargs.items():
+            if k not in self._columns:
+                LOG.warning(f"Warning : {k} not in database columns, key discarded")
+                continue
             if isinstance(v, list):
                 query += f" AND {k} IN ({', '.join('?' for _ in v)})"
                 params.extend([str(_) for _ in v])
@@ -552,11 +558,14 @@ class GribIndex:
                 query += f" AND {k} = ?"
                 params.append(str(v))
 
-        print("SELECT", query)
-        print("SELECT", params)
+        print("SELECT (query)", query)
+        print("SELECT (params)", params)
 
         self.cursor.execute(query, params)
-
+
+        fetch = self.cursor.fetchall()
+
+        for path_id, offset, length in fetch:
             if path_id in self.cache:
                 file = self.cache[path_id]
             else:
@@ -570,9 +579,8 @@ class GribIndex:
             yield data
 
 
-@source_registry.register("
+@source_registry.register("grib-index")
 class GribIndexSource(LegacySource):
-
     @staticmethod
     def _execute(
         context: Any,
@@ -602,15 +610,51 @@ class GribIndexSource(LegacySource):
             An array of retrieved GRIB fields.
         """
         index = GribIndex(indexdb)
-        result = []
 
         if flavour is not None:
             flavour = RuleBasedFlavour(flavour)
 
-
-
-
-
-
+        if hasattr(dates, "date_to_intervals"):
+            # When using accumulate source
+            full_requests = []
+            for d, interval in dates.intervals:
+                context.trace("🌧️", "interval:", interval)
+                valid_date, request, _ = dates._adjust_request_to_interval(interval, kwargs)
+                context.trace("🌧️", " request =", request)
+                full_requests.append(([valid_date], request))
+        else:
+            # Normal case, without accumulate source
+            full_requests = [(dates, kwargs)]
+
+        full_requests = factorise(full_requests)
+        context.trace("🌧️", f"number of (factorised) requests: {len(full_requests)}")
+        for valid_dates, request in full_requests:
+            context.trace("🌧️", f" dates: {valid_dates}, request: {request}")
+
+        result = []
+        for valid_dates, request in full_requests:
+            for grib in index.retrieve(valid_dates, **request):
+                field = ekd.from_source("memory", grib)[0]
+                if flavour:
+                    field = flavour.apply(field)
+                result.append(field)
 
         return FieldArray(result)
+
+
+def factorise(lst):
+    """Factorise a list of (dates, request) tuples by merging dates with identical requests."""
+    content = dict()
+
+    d = defaultdict(list)
+    for dates, request in lst:
+        assert isinstance(request, dict), type(request)
+        key = hashlib.md5(json.dumps(request, sort_keys=True).encode()).hexdigest()
+        content[key] = request
+        d[key] += dates
+
+    res = []
+    for key, dates in d.items():
+        dates = list(sorted(set(dates)))
+        res.append((dates, content[key]))
+    return res
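
The `grib-index` source now batches its retrievals: per-interval requests produced by the new `accumulate` source (detected through the `date_to_intervals` attribute) are merged by the module-level `factorise` helper added above, which groups `(dates, request)` pairs on an md5 hash of the request. A minimal sketch of that behaviour, assuming the 0.5.29 wheel and its dependencies are installed (the import path follows the file list above):

```python
# Illustration only: `factorise` is the helper added to grib_index.py in this
# release; identical requests are merged and their dates are deduplicated and sorted.
import datetime

from anemoi.datasets.create.sources.grib_index import factorise

d1 = datetime.datetime(2020, 1, 1, 0)
d2 = datetime.datetime(2020, 1, 1, 6)

pairs = [
    ([d1], {"param": "tp", "levtype": "sfc"}),
    ([d2], {"param": "tp", "levtype": "sfc"}),  # same request, later date
    ([d1], {"param": "2t", "levtype": "sfc"}),  # different request
]

for dates, request in factorise(pairs):
    print(dates, request)
# The two "tp" entries collapse into a single entry carrying both dates,
# while the "2t" entry stays separate.
```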
--- a/anemoi/datasets/create/sources/mars.py
+++ b/anemoi/datasets/create/sources/mars.py
@@ -17,6 +17,7 @@ from earthkit.data import from_source
 from earthkit.data.utils.availability import Availability
 
 from anemoi.datasets.create.sources import source_registry
+from anemoi.datasets.create.sources.accumulate import IntervalsDatesProvider
 
 from .legacy import LegacySource
 
@@ -145,7 +146,7 @@ def _expand_mars_request(
 
     Parameters
     ----------
-    request :
+    request : dict[str, Any]
        The input MARS request.
    date : datetime.datetime
        The date to be used in the request.
@@ -156,7 +157,7 @@ def _expand_mars_request(
 
    Returns
    -------
-    List[
+    List[dict[str, Any]]
        A list of expanded MARS requests.
    """
    requests = []
@@ -164,23 +165,26 @@ def _expand_mars_request(
    user_step = to_list(expand_to_by(request.get("step", [0])))
    user_time = None
    user_date = None
-
    if not request_already_using_valid_datetime:
-        user_time = request.get("
+        user_time = request.get("user_time")
        if user_time is not None:
            user_time = to_list(user_time)
            user_time = [_normalise_time(t) for t in user_time]
 
        user_date = request.get(date_key)
        if user_date is not None:
-
+            if isinstance(user_date, int):
+                user_date = str(user_date)
+            elif isinstance(user_date, datetime.datetime):
+                user_date = user_date.strftime("%Y%m%d")
+            else:
+                raise ValueError(f"Invalid type for {user_date}")
            user_date = re.compile("^{}$".format(user_date.replace("-", "").replace("?", ".")))
 
    for step in user_step:
        r = request.copy()
 
        if not request_already_using_valid_datetime:
-
            if isinstance(step, str) and "-" in step:
                assert step.count("-") == 1, step
 
@@ -190,30 +194,27 @@ def _expand_mars_request(
            base = date - datetime.timedelta(hours=hours)
            r.update(
                {
-
+                    "date": base.strftime("%Y%m%d"),
                    "time": base.strftime("%H%M"),
                    "step": step,
                }
            )
-
        for pproc in ("grid", "rotation", "frame", "area", "bitmap", "resol"):
            if pproc in r:
                if isinstance(r[pproc], (list, tuple)):
                    r[pproc] = "/".join(str(x) for x in r[pproc])
 
        if user_date is not None:
-            if not user_date.match(r[
+            if not user_date.match(r["date"]):
                continue
 
        if user_time is not None:
-            #
+            # If time is provided by the user, we only keep the requests that match the time
            if r["time"] not in user_time:
                continue
 
        requests.append(r)
 
-    # assert requests, requests
-
    return requests
 
 
@@ -222,6 +223,7 @@ def factorise_requests(
    *requests: dict[str, Any],
    request_already_using_valid_datetime: bool = False,
    date_key: str = "date",
+    no_date_here: bool = False,
 ) -> Generator[dict[str, Any], None, None]:
    """Factorizes the requests based on the given dates.
 
@@ -229,33 +231,42 @@ def factorise_requests(
    ----------
    dates : List[datetime.datetime]
        The list of dates to be used in the requests.
-    requests :
+    requests : List[dict[str, Any]]
        The input requests to be factorized.
    request_already_using_valid_datetime : bool, optional
        Flag indicating if the requests already use valid datetime.
    date_key : str, optional
        The key for the date in the requests.
+    no_date_here : bool, optional
+        Flag indicating if there is no date in the "dates" list.
 
    Returns
    -------
-    Generator[
+    Generator[dict[str, Any], None, None]
        Factorized requests.
    """
-
-
-    # req = normalise_request(req)
+    if isinstance(requests, tuple) and len(requests) == 1 and "requests" in requests[0]:
+        requests = requests[0]["requests"]
 
-
-
+    updates = []
+    for d in sorted(dates):
+        for req in requests:
+            if not no_date_here and (
+                ("date" in req)
+                and ("time" in req)
+                and d.strftime("%Y%m%d%H%M") != (str(req["date"]) + str(req["time"]).zfill(4))
+            ):
+                continue
+            new_req = _expand_mars_request(
                req,
                date=d,
                request_already_using_valid_datetime=request_already_using_valid_datetime,
-                date_key=
+                date_key="user_date",
            )
+            updates += new_req
 
    if not updates:
        return
-
    compressed = Availability(updates)
    for r in compressed.iterate():
        for k, v in r.items():
@@ -269,12 +280,12 @@ def use_grib_paramid(r: dict[str, Any]) -> dict[str, Any]:
 
    Parameters
    ----------
-    r :
+    r : dict[str, Any]
        The input request containing parameter short names.
 
    Returns
    -------
-
+    dict[str, Any]
        The request with parameter IDs.
    """
    from anemoi.utils.grib import shortname_to_paramid
@@ -379,7 +390,7 @@ class MarsSource(LegacySource):
            The context for the requests.
        dates : List[datetime.datetime]
            The list of dates to be used in the requests.
-        requests :
+        requests : dict[str, Any]
            The input requests to be executed.
        request_already_using_valid_datetime : bool, optional
            Flag indicating if the requests already use valid datetime.
@@ -395,7 +406,6 @@ class MarsSource(LegacySource):
        Any
            The resulting dataset.
        """
-
        if not requests:
            requests = [kwargs]
 
@@ -418,7 +428,26 @@ class MarsSource(LegacySource):
                "'param' cannot be 'True'. If you wrote 'param: on' in yaml, you may want to use quotes?"
            )
 
-        if
+        if isinstance(dates, IntervalsDatesProvider):
+            # When using accumulate source
+            requests_ = []
+            for request in requests:
+                for d, interval in dates.intervals:
+                    context.trace("🌧️", "interval:", interval)
+                    _, r, _ = dates._adjust_request_to_interval(interval, request)
+                    context.trace("🌧️", " adjusted request =", r)
+                    requests_.append(r)
+            requests = requests_
+            context.trace("🌧️", f"Total requests: {len(requests)}")
+            requests = factorise_requests(
+                ["no_date_here"],
+                *requests,
+                request_already_using_valid_datetime=True,
+                date_key=date_key,
+                no_date_here=True,
+            )
+
+        elif len(dates) == 0:  # When using `repeated_dates`
            assert len(requests) == 1, requests
            assert "date" in requests[0], requests[0]
            if isinstance(requests[0]["date"], datetime.date):
@@ -434,7 +463,7 @@ class MarsSource(LegacySource):
        requests = list(requests)
 
        ds = from_source("empty")
-        context.trace("✅", f"{[str(d) for d in dates]}")
+        context.trace("✅", f"{[str(d) for d in dates]}, {len(dates)}")
        context.trace("✅", f"Will run {len(requests)} requests")
        for r in requests:
            r = {k: v for k, v in r.items() if v != ("-",)}
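
`factorise_requests` also gains a `no_date_here` flag and, when dates are supplied, now skips any request whose own `date`/`time` fields do not correspond to the date being expanded. The comparison is plain string formatting; a standalone sketch of the matching rule used above (the request values are illustrative only):

```python
# Standalone illustration of the date/time filter added to factorise_requests:
# a request is only expanded for a date when str(date) + zero-padded str(time)
# equals the date rendered as YYYYMMDDHHMM.
import datetime

req = {"date": "20200101", "time": "600"}  # hypothetical MARS-style request fields
d = datetime.datetime(2020, 1, 1, 6, 0)

key = str(req["date"]) + str(req["time"]).zfill(4)  # "202001010600"
print(d.strftime("%Y%m%d%H%M") == key)  # True -> this request is kept for d
```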
--- a/anemoi/datasets/create/sources/xarray_support/__init__.py
+++ b/anemoi/datasets/create/sources/xarray_support/__init__.py
@@ -97,6 +97,7 @@ def load_one(
     if isinstance(dataset, xr.Dataset):
         data = dataset
     else:
+        print(f"Opening dataset {dataset} with options {options}")
         data = xr.open_dataset(dataset, **options)
 
     fs = XarrayFieldList.from_xarray(data, flavour=flavour, patch=patch)
--- a/anemoi/datasets/create/sources/xarray_support/coordinates.py
+++ b/anemoi/datasets/create/sources/xarray_support/coordinates.py
@@ -223,13 +223,10 @@ class Coordinate:
         # Assume the array is sorted
 
         index = np.searchsorted(values, value)
-
-
-        if np.all(values[index] == value):
+        if np.all(index < len(values)) and np.all(values[index] == value):
             return index
 
         # If not found, we need to check if the value is in the array
-
         index = np.where(np.isin(values, value))[0]
 
         # We could also return incomplete matches
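
The change to `Coordinate` guards against `np.searchsorted` returning an out-of-range index, which happens whenever the value sorts after every element of the array:

```python
# Why the new bounds check is needed: searchsorted returns len(values) for a
# value larger than everything in the array, so values[index] would raise
# IndexError before the np.isin fallback was ever reached.
import numpy as np

values = np.array([10, 20, 30])
index = np.searchsorted(values, 40)
print(index)                # 3 == len(values)
print(index < len(values))  # False -> short-circuits the equality test above
```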
--- a/anemoi/datasets/create/sources/xarray_support/flavour.py
+++ b/anemoi/datasets/create/sources/xarray_support/flavour.py
@@ -557,10 +557,10 @@ class DefaultCoordinateGuesser(CoordinateGuesser):
         super().__init__(ds)
 
     def _is_point(self, c: xr.DataArray, attributes: CoordinateAttributes) -> PointCoordinate | None:
-        if attributes.standard_name in ["cell", "station", "poi", "point"]:
+        if attributes.standard_name in ["location", "cell", "id", "station", "poi", "point"]:
             return PointCoordinate(c)
 
-        if attributes.name in ["cell", "station", "poi", "point"]:  # WeatherBench
+        if attributes.name in ["location", "cell", "id", "station", "poi", "point"]:  # WeatherBench
             return PointCoordinate(c)
 
         return None
--- a/anemoi/datasets/create/sources/xarray_support/patch.py
+++ b/anemoi/datasets/create/sources/xarray_support/patch.py
@@ -10,13 +10,14 @@
 
 import logging
 from typing import Any
+from typing import Literal
 
 import xarray as xr
 
 LOG = logging.getLogger(__name__)
 
 
-def patch_attributes(ds: xr.Dataset, attributes: dict[str, dict[str, Any]]) ->
+def patch_attributes(ds: xr.Dataset, attributes: dict[str, dict[str, Any]]) -> xr.Dataset:
     """Patch the attributes of the dataset.
 
     Parameters
@@ -38,7 +39,7 @@ def patch_attributes(ds: xr.Dataset, attributes: dict[str, dict[str, Any]]) -> A
     return ds
 
 
-def patch_coordinates(ds: xr.Dataset, coordinates: list[str]) ->
+def patch_coordinates(ds: xr.Dataset, coordinates: list[str]) -> xr.Dataset:
     """Patch the coordinates of the dataset.
 
     Parameters
@@ -59,7 +60,7 @@ def patch_coordinates(ds: xr.Dataset, coordinates: list[str]) -> Any:
     return ds
 
 
-def patch_rename(ds: xr.Dataset, renames: dict[str, str]) ->
+def patch_rename(ds: xr.Dataset, renames: dict[str, str]) -> xr.Dataset:
     """Rename variables in the dataset.
 
     Parameters
@@ -77,7 +78,7 @@ def patch_rename(ds: xr.Dataset, renames: dict[str, str]) -> Any:
     return ds.rename(renames)
 
 
-def patch_sort_coordinate(ds: xr.Dataset, sort_coordinates: list[str]) ->
+def patch_sort_coordinate(ds: xr.Dataset, sort_coordinates: list[str]) -> xr.Dataset:
     """Sort the coordinates of the dataset.
 
     Parameters
@@ -98,11 +99,175 @@ def patch_sort_coordinate(ds: xr.Dataset, sort_coordinates: list[str]) -> Any:
     return ds
 
 
+def patch_subset_dataset(ds: xr.Dataset, selection: dict[str, Any]) -> xr.Dataset:
+    """Select a subset of the dataset using xarray's sel method.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The dataset to patch.
+    selection : dict[str, Any]
+        Dictionary mapping dimension names to selection criteria.
+        Keys must be existing dimension names in the dataset.
+        Values can be any type accepted by xarray's sel method, including:
+        - Single values (int, float, str, datetime)
+        - Lists or arrays of values
+        - Slices (using slice() objects)
+        - Boolean arrays
+
+    Returns
+    -------
+    xr.Dataset
+        The patched dataset containing only the selected subset.
+
+    Examples
+    --------
+    >>> # Select specific time and pressure level
+    >>> patch_subset_dataset(ds, {
+    ...     'time': '2020-01-01',
+    ...     'pressure': 500
+    ... })
+
+    >>> # Select a range using slice
+    >>> patch_subset_dataset(ds, {
+    ...     'lat': slice(-90, 90),
+    ...     'lon': slice(0, 180)
+    ... })
+    """
+
+    ds = ds.sel(selection)
+
+    return ds
+
+
+def patch_analysis_lead_to_valid_time(
+    ds: xr.Dataset,
+    time_coord_names: dict[Literal["analysis_time_coordinate", "lead_time_coordinate", "valid_time_coordinate"], str],
+) -> xr.Dataset:
+    """Convert analysis time and lead time coordinates to valid time.
+
+    This function creates a new valid time coordinate by adding the analysis time
+    and lead time coordinates, then stacks and reorganizes the dataset to use
+    valid time as the primary time dimension.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The dataset to patch.
+    time_coord_names : dict[str, str]
+        Dictionary mapping required keys to coordinate names in the dataset:
+
+        - 'analysis_time_coordinate' : str
+            Name of the analysis/initialization time coordinate.
+        - 'lead_time_coordinate' : str
+            Name of the forecast lead time coordinate.
+        - 'valid_time_coordinate' : str
+            Name for the new valid time coordinate to create.
+
+    Returns
+    -------
+    xr.Dataset
+        The patched dataset with valid time as the primary time coordinate.
+        The analysis and lead time coordinates are removed.
+
+    Examples
+    --------
+    >>> patch_analysis_lead_to_valid_time(ds, {
+    ...     'analysis_time_coordinate': 'forecast_reference_time',
+    ...     'lead_time_coordinate': 'step',
+    ...     'valid_time_coordinate': 'time'
+    ... })
+    """
+
+    assert time_coord_names.keys() == {
+        "analysis_time_coordinate",
+        "lead_time_coordinate",
+        "valid_time_coordinate",
+    }, "time_coord_names must contain exactly keys 'analysis_time_coordinate', 'lead_time_coordinate', and 'valid_time_coordinate'"
+
+    analysis_time_coordinate = time_coord_names["analysis_time_coordinate"]
+    lead_time_coordinate = time_coord_names["lead_time_coordinate"]
+    valid_time_coordinate = time_coord_names["valid_time_coordinate"]
+
+    valid_time = ds[analysis_time_coordinate] + ds[lead_time_coordinate]
+
+    ds = (
+        ds.assign_coords({valid_time_coordinate: valid_time})
+        .stack(time_index=[analysis_time_coordinate, lead_time_coordinate])
+        .set_index(time_index=valid_time_coordinate)
+        .rename(time_index=valid_time_coordinate)
+        .drop_vars([analysis_time_coordinate, lead_time_coordinate])
+    )
+
+    return ds
+
+
+def patch_rolling_operation(
+    ds: xr.Dataset, vars_operation_config: dict[Literal["dim", "steps", "vars", "operation"], str | int | list[str]]
+) -> xr.Dataset:
+    """Apply a rolling operation to specified variables in the dataset.
+
+    This function calculates a rolling operation over a specified dimension for selected
+    variables. The rolling window requires all periods to be present (min_periods=steps).
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The dataset to patch.
+    vars_operation_config: dict
+        Configuration for the rolling operation with the following keys:
+
+        - 'dim' : str
+            The dimension along which to apply the rolling operation (e.g., 'time').
+        - 'steps' : int
+            The number of steps in the rolling window.
+        - 'vars' : list[str]
+            List of variable names to apply the rolling operation to.
+        - 'operation' : str
+            The operation to apply ('sum', 'mean', 'min', 'max', 'std', etc.).
+
+    Returns
+    -------
+    xr.Dataset
+        The patched dataset with rolling operations applied to the specified variables.
+
+    Examples
+    --------
+    >>> patch_rolling_operation(ds, {
+    ...     'dim': 'time',
+    ...     'steps': 3,
+    ...     'vars': ['precipitation', 'radiation'],
+    ...     'operation': 'sum'
+    ... })
+    """
+
+    assert vars_operation_config.keys() == {
+        "dim",
+        "steps",
+        "vars",
+        "operation",
+    }, "vars_operation_config must contain exactly keys 'dim', 'steps', 'vars', and 'operation'"
+
+    dim = vars_operation_config["dim"]
+    steps = vars_operation_config["steps"]
+    vars = vars_operation_config["vars"]
+    operation = vars_operation_config["operation"]
+
+    for var in vars:
+        rolling = ds[var].rolling(dim={dim: steps}, min_periods=steps)
+        ds[var] = getattr(rolling, operation)()
+
+    return ds
+
+
 PATCHES = {
     "attributes": patch_attributes,
     "coordinates": patch_coordinates,
     "rename": patch_rename,
     "sort_coordinates": patch_sort_coordinate,
+    "analysis_lead_to_valid_time": patch_analysis_lead_to_valid_time,
+    "rolling_operation": patch_rolling_operation,
+    "subset_dataset": patch_subset_dataset,
 }
 
 
@@ -122,7 +287,15 @@ def patch_dataset(ds: xr.Dataset, patch: dict[str, dict[str, Any]]) -> Any:
         The patched dataset.
     """
 
-    ORDER = [
+    ORDER = [
+        "coordinates",
+        "attributes",
+        "rename",
+        "sort_coordinates",
+        "subset_dataset",
+        "analysis_lead_to_valid_time",
+        "rolling_operation",
+    ]
     for what, values in sorted(patch.items(), key=lambda x: ORDER.index(x[0])):
         if what not in PATCHES:
             raise ValueError(f"Unknown patch type {what!r}")