PyPI - anemoi-datasets - Versions diffs - 0.5.28__py3-none-any.whl → 0.5.30__py3-none-any.whl - Mend

anemoi-datasets 0.5.28__py3-none-any.whl → 0.5.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

anemoi/datasets/create/sources/grib_index.py CHANGED Viewed

@@ -7,9 +7,12 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
+import hashlib
+import json
 import logging
 import os
 import sqlite3
+from collections import defaultdict
 from collections.abc import Iterator
 from typing import Any
@@ -46,8 +49,8 @@ class GribIndex:
         ----------
         database : str
             Path to the SQLite database file.
-        keys : Optional[List[str] | str], optional
-            List of keys or a string of keys to use for indexing, by default None.
+        keys : Optional[list[str] | str], optional
+            list of keys or a string of keys to use for indexing, by default None.
         flavour : Optional[str], optional
             Flavour configuration for mapping fields, by default None.
         update : bool, optional
@@ -103,21 +106,18 @@ class GribIndex:
         """Create the necessary tables in the database."""
         assert self.update
-        self.cursor.execute(
-            """
+        self.cursor.execute("""
         CREATE TABLE IF NOT EXISTS paths (
             id INTEGER PRIMARY KEY,
             path TEXT not null
         )
-        """
-        )
+        """)
         columns = ("valid_datetime",)
         # We don't use NULL as a default because NULL is considered a different value
         # in UNIQUE INDEX constraints (https://www.sqlite.org/lang_createindex.html)
-        self.cursor.execute(
-            f"""
+        self.cursor.execute(f"""
         CREATE TABLE IF NOT EXISTS grib_index (
             _id INTEGER PRIMARY KEY,
             _path_id INTEGER not null,
@@ -125,30 +125,23 @@ class GribIndex:
             _length INTEGER not null,
             {', '.join(f"{key} TEXT not null default ''" for key in columns)},
             FOREIGN KEY(_path_id) REFERENCES paths(id))
-        """
-        )  # ,
+        """)  # ,
-        self.cursor.execute(
-            """
+        self.cursor.execute("""
         CREATE UNIQUE INDEX IF NOT EXISTS idx_grib_index_path_offset
         ON grib_index (_path_id, _offset)
-        """
-        )
+        """)
-        self.cursor.execute(
-            f"""
+        self.cursor.execute(f"""
         CREATE UNIQUE INDEX IF NOT EXISTS idx_grib_index_all_keys
         ON grib_index ({', '.join(columns)})
-        """
-        )
+        """)
         for key in columns:
-            self.cursor.execute(
-                f"""
+            self.cursor.execute(f"""
             CREATE INDEX IF NOT EXISTS idx_grib_index_{key}
             ON grib_index ({key})
-            """
-            )
+            """)
         self._commit()
@@ -161,7 +154,7 @@ class GribIndex:
         Returns
         -------
-        List[str]
+        list[str]
             A list of metadata keys stored in the database.
         """
         self.cursor.execute("SELECT key FROM metadata_keys")
@@ -229,7 +222,7 @@ class GribIndex:
         Returns
         -------
-        List[str]
+        list[str]
             A list of column names.
         """
         if self._columns is not None:
@@ -245,8 +238,8 @@ class GribIndex:
         Parameters
         ----------
-        columns : List[str]
-            List of column names to ensure in the table.
+        columns : list[str]
+            list of column names to ensure in the table.
         """
         assert self.update
@@ -264,20 +257,16 @@ class GribIndex:
         self.cursor.execute("""DROP INDEX IF EXISTS idx_grib_index_all_keys""")
         all_columns = self._all_columns()
-        self.cursor.execute(
-            f"""
+        self.cursor.execute(f"""
         CREATE UNIQUE INDEX IF NOT EXISTS idx_grib_index_all_keys
         ON grib_index ({', '.join(all_columns)})
-        """
-        )
+        """)
         for key in all_columns:
-            self.cursor.execute(
-                f"""
+            self.cursor.execute(f"""
             CREATE INDEX IF NOT EXISTS idx_grib_index_{key}
             ON grib_index ({key})
-            """
-            )
+            """)
     def add_grib_file(self, path: str) -> None:
         """Add a GRIB file to the database.
@@ -364,7 +353,7 @@ class GribIndex:
         Returns
         -------
-        List[dict]
+        list[dict]
             A list of GRIB2 parameter information.
         """
         if ("grib2", paramId) in self.cache:
@@ -524,8 +513,8 @@ class GribIndex:
         Parameters
         ----------
-        dates : List[Any]
-            List of dates to retrieve data for.
+        dates : list[Any]
+            list of dates to retrieve data for.
         **kwargs : Any
             Additional filtering criteria.
@@ -539,12 +528,13 @@ class GribIndex:
         dates = [d.isoformat() for d in dates]
         query = """SELECT _path_id, _offset, _length
-                   FROM grib_index WHERE valid_datetime IN ({})""".format(
-            ", ".join("?" for _ in dates)
-        )
+                   FROM grib_index WHERE valid_datetime IN ({})""".format(", ".join("?" for _ in dates))
         params = dates
         for k, v in kwargs.items():
+            if k not in self._columns:
+                LOG.warning(f"Warning : {k} not in database columns, key discarded")
+                continue
             if isinstance(v, list):
                 query += f" AND {k} IN ({', '.join('?' for _ in v)})"
                 params.extend([str(_) for _ in v])
@@ -552,11 +542,14 @@ class GribIndex:
                 query += f" AND {k} = ?"
                 params.append(str(v))
-        print("SELECT", query)
-        print("SELECT", params)
+        print("SELECT (query)", query)
+        print("SELECT (params)", params)
         self.cursor.execute(query, params)
-        for path_id, offset, length in self.cursor.fetchall():
+        fetch = self.cursor.fetchall()
+        for path_id, offset, length in fetch:
             if path_id in self.cache:
                 file = self.cache[path_id]
             else:
@@ -570,9 +563,8 @@ class GribIndex:
             yield data
-@source_registry.register("grib_index")
+@source_registry.register("grib-index")
 class GribIndexSource(LegacySource):
     @staticmethod
     def _execute(
         context: Any,
@@ -602,15 +594,51 @@ class GribIndexSource(LegacySource):
             An array of retrieved GRIB fields.
         """
         index = GribIndex(indexdb)
-        result = []
         if flavour is not None:
             flavour = RuleBasedFlavour(flavour)
-        for grib in index.retrieve(dates, **kwargs):
-            field = ekd.from_source("memory", grib)[0]
-            if flavour:
-                field = flavour.apply(field)
-            result.append(field)
+        if hasattr(dates, "date_to_intervals"):
+            # When using accumulate source
+            full_requests = []
+            for d, interval in dates.intervals:
+                context.trace("🌧️", "interval:", interval)
+                valid_date, request, _ = dates._adjust_request_to_interval(interval, kwargs)
+                context.trace("🌧️", "  request =", request)
+                full_requests.append(([valid_date], request))
+        else:
+            # Normal case, without accumulate source
+            full_requests = [(dates, kwargs)]
+        full_requests = factorise(full_requests)
+        context.trace("🌧️", f"number of (factorised) requests: {len(full_requests)}")
+        for valid_dates, request in full_requests:
+            context.trace("🌧️", f"  dates: {valid_dates}, request: {request}")
+        result = []
+        for valid_dates, request in full_requests:
+            for grib in index.retrieve(valid_dates, **request):
+                field = ekd.from_source("memory", grib)[0]
+                if flavour:
+                    field = flavour.apply(field)
+                result.append(field)
         return FieldArray(result)
+def factorise(lst):
+    """Factorise a list of (dates, request) tuples by merging dates with identical requests.
+
+    Parameters
+    ----------
+    lst : list
+        Iterable of ``(dates, request)`` pairs. Each ``request`` must be a
+        ``dict`` and is passed to ``json.dumps``, so it has to be
+        JSON-serialisable. Each ``dates`` is appended to a list with ``+=``.
+
+    Returns
+    -------
+    list
+        One ``(dates, request)`` tuple per distinct request, where ``dates``
+        holds the de-duplicated, sorted dates of every input pair that
+        carried that request.
+    """
+    content = dict()  # digest -> request (the last request seen with that digest)
+    d = defaultdict(list)  # digest -> every date seen for that request
+    for dates, request in lst:
+        assert isinstance(request, dict), type(request)
+        key = hashlib.md5(json.dumps(request, sort_keys=True).encode()).hexdigest()  # sort_keys makes the digest independent of key order
+        content[key] = request
+        d[key] += dates
+    res = []
+    for key, dates in d.items():
+        dates = list(sorted(set(dates)))  # drop duplicate dates, then order them
+        res.append((dates, content[key]))
+    return res

anemoi/datasets/create/sources/mars.py CHANGED Viewed

@@ -17,6 +17,7 @@ from earthkit.data import from_source
 from earthkit.data.utils.availability import Availability
 from anemoi.datasets.create.sources import source_registry
+from anemoi.datasets.create.sources.accumulate import IntervalsDatesProvider
 from .legacy import LegacySource
@@ -145,7 +146,7 @@ def _expand_mars_request(
     Parameters
     ----------
-    request : Dict[str, Any]
+    request : dict[str, Any]
         The input MARS request.
     date : datetime.datetime
         The date to be used in the request.
@@ -156,7 +157,7 @@ def _expand_mars_request(
     Returns
     -------
-    List[Dict[str, Any]]
+    List[dict[str, Any]]
         A list of expanded MARS requests.
     """
     requests = []
@@ -164,23 +165,26 @@ def _expand_mars_request(
     user_step = to_list(expand_to_by(request.get("step", [0])))
     user_time = None
     user_date = None
     if not request_already_using_valid_datetime:
-        user_time = request.get("time")
+        user_time = request.get("user_time")
         if user_time is not None:
             user_time = to_list(user_time)
             user_time = [_normalise_time(t) for t in user_time]
         user_date = request.get(date_key)
         if user_date is not None:
-            assert isinstance(user_date, str), user_date
+            if isinstance(user_date, int):
+                user_date = str(user_date)
+            elif isinstance(user_date, datetime.datetime):
+                user_date = user_date.strftime("%Y%m%d")
+            else:
+                raise ValueError(f"Invalid type for {user_date}")
             user_date = re.compile("^{}$".format(user_date.replace("-", "").replace("?", ".")))
     for step in user_step:
         r = request.copy()
         if not request_already_using_valid_datetime:
             if isinstance(step, str) and "-" in step:
                 assert step.count("-") == 1, step
@@ -190,30 +194,27 @@ def _expand_mars_request(
             base = date - datetime.timedelta(hours=hours)
             r.update(
                 {
-                    date_key: base.strftime("%Y%m%d"),
+                    "date": base.strftime("%Y%m%d"),
                     "time": base.strftime("%H%M"),
                     "step": step,
                 }
             )
         for pproc in ("grid", "rotation", "frame", "area", "bitmap", "resol"):
             if pproc in r:
                 if isinstance(r[pproc], (list, tuple)):
                     r[pproc] = "/".join(str(x) for x in r[pproc])
         if user_date is not None:
-            if not user_date.match(r[date_key]):
+            if not user_date.match(r["date"]):
                 continue
         if user_time is not None:
-            # It time is provided by the user, we only keep the requests that match the time
+            # If time is provided by the user, we only keep the requests that match the time
             if r["time"] not in user_time:
                 continue
         requests.append(r)
-    # assert requests, requests
     return requests
@@ -222,6 +223,7 @@ def factorise_requests(
     *requests: dict[str, Any],
     request_already_using_valid_datetime: bool = False,
     date_key: str = "date",
+    no_date_here: bool = False,
 ) -> Generator[dict[str, Any], None, None]:
     """Factorizes the requests based on the given dates.
@@ -229,33 +231,42 @@ def factorise_requests(
     ----------
     dates : List[datetime.datetime]
         The list of dates to be used in the requests.
-    requests : Dict[str, Any]
+    requests : List[dict[str, Any]]
         The input requests to be factorized.
     request_already_using_valid_datetime : bool, optional
         Flag indicating if the requests already use valid datetime.
     date_key : str, optional
         The key for the date in the requests.
+    no_date_here : bool, optional
+        Flag indicating if there is no date in the "dates" list.
     Returns
     -------
-    Generator[Dict[str, Any], None, None]
+    Generator[dict[str, Any], None, None]
         Factorized requests.
     """
-    updates = []
-    for req in requests:
-        # req = normalise_request(req)
+    if isinstance(requests, tuple) and len(requests) == 1 and "requests" in requests[0]:
+        requests = requests[0]["requests"]
-        for d in dates:
-            updates += _expand_mars_request(
+    updates = []
+    for d in sorted(dates):
+        for req in requests:
+            if not no_date_here and (
+                ("date" in req)
+                and ("time" in req)
+                and d.strftime("%Y%m%d%H%M") != (str(req["date"]) + str(req["time"]).zfill(4))
+            ):
+                continue
+            new_req = _expand_mars_request(
                 req,
                 date=d,
                 request_already_using_valid_datetime=request_already_using_valid_datetime,
-                date_key=date_key,
+                date_key="user_date",
             )
+            updates += new_req
     if not updates:
         return
     compressed = Availability(updates)
     for r in compressed.iterate():
         for k, v in r.items():
@@ -269,12 +280,12 @@ def use_grib_paramid(r: dict[str, Any]) -> dict[str, Any]:
     Parameters
     ----------
-    r : Dict[str, Any]
+    r : dict[str, Any]
         The input request containing parameter short names.
     Returns
     -------
-    Dict[str, Any]
+    dict[str, Any]
         The request with parameter IDs.
     """
     from anemoi.utils.grib import shortname_to_paramid
@@ -379,7 +390,7 @@ class MarsSource(LegacySource):
             The context for the requests.
         dates : List[datetime.datetime]
             The list of dates to be used in the requests.
-        requests : Dict[str, Any]
+        requests : dict[str, Any]
             The input requests to be executed.
         request_already_using_valid_datetime : bool, optional
             Flag indicating if the requests already use valid datetime.
@@ -395,7 +406,6 @@ class MarsSource(LegacySource):
         Any
             The resulting dataset.
         """
         if not requests:
             requests = [kwargs]
@@ -418,7 +428,26 @@ class MarsSource(LegacySource):
                         "'param' cannot be 'True'. If you wrote 'param: on' in yaml, you may want to use quotes?"
                     )
-        if len(dates) == 0:  # When using `repeated_dates`
+        if isinstance(dates, IntervalsDatesProvider):
+            # When using accumulate source
+            requests_ = []
+            for request in requests:
+                for d, interval in dates.intervals:
+                    context.trace("🌧️", "interval:", interval)
+                    _, r, _ = dates._adjust_request_to_interval(interval, request)
+                    context.trace("🌧️", "  adjusted request =", r)
+                    requests_.append(r)
+            requests = requests_
+            context.trace("🌧️", f"Total requests: {len(requests)}")
+            requests = factorise_requests(
+                ["no_date_here"],
+                *requests,
+                request_already_using_valid_datetime=True,
+                date_key=date_key,
+                no_date_here=True,
+            )
+        elif len(dates) == 0:  # When using `repeated_dates`
             assert len(requests) == 1, requests
             assert "date" in requests[0], requests[0]
             if isinstance(requests[0]["date"], datetime.date):
@@ -434,7 +463,7 @@ class MarsSource(LegacySource):
         requests = list(requests)
         ds = from_source("empty")
-        context.trace("✅", f"{[str(d) for d in dates]}")
+        context.trace("✅", f"{[str(d) for d in dates]}, {len(dates)}")
         context.trace("✅", f"Will run {len(requests)} requests")
         for r in requests:
             r = {k: v for k, v in r.items() if v != ("-",)}

anemoi/datasets/create/sources/xarray_support/__init__.py CHANGED Viewed

@@ -97,6 +97,7 @@ def load_one(
     if isinstance(dataset, xr.Dataset):
         data = dataset
     else:
+        print(f"Opening dataset {dataset} with options {options}")
         data = xr.open_dataset(dataset, **options)
     fs = XarrayFieldList.from_xarray(data, flavour=flavour, patch=patch)

anemoi/datasets/create/sources/xarray_support/coordinates.py CHANGED Viewed

@@ -223,13 +223,10 @@ class Coordinate:
         # Assume the array is sorted
         index = np.searchsorted(values, value)
-        index = index[index < len(values)]
-        if np.all(values[index] == value):
+        if np.all(index < len(values)) and np.all(values[index] == value):
             return index
         # If not found, we need to check if the value is in the array
         index = np.where(np.isin(values, value))[0]
         # We could also return incomplete matches

anemoi/datasets/create/sources/xarray_support/flavour.py CHANGED Viewed

@@ -557,10 +557,10 @@ class DefaultCoordinateGuesser(CoordinateGuesser):
         super().__init__(ds)
     def _is_point(self, c: xr.DataArray, attributes: CoordinateAttributes) -> PointCoordinate | None:
-        if attributes.standard_name in ["cell", "station", "poi", "point"]:
+        if attributes.standard_name in ["location", "cell", "id", "station", "poi", "point"]:
             return PointCoordinate(c)
-        if attributes.name in ["cell", "station", "poi", "point"]:  # WeatherBench
+        if attributes.name in ["location", "cell", "id", "station", "poi", "point"]:  # WeatherBench
             return PointCoordinate(c)
         return None

anemoi-datasets 0.5.28__py3-none-any.whl → 0.5.30__py3-none-any.whl

anemoi-datasets 0.5.28__py3-none-any.whl → 0.5.30__py3-none-any.whl