anemoi-datasets 0.5.27__py3-none-any.whl → 0.5.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/recipe/__init__.py +93 -0
  3. anemoi/datasets/commands/recipe/format.py +55 -0
  4. anemoi/datasets/commands/recipe/migrate.py +555 -0
  5. anemoi/datasets/create/__init__.py +46 -13
  6. anemoi/datasets/create/config.py +52 -53
  7. anemoi/datasets/create/input/__init__.py +43 -63
  8. anemoi/datasets/create/input/action.py +296 -236
  9. anemoi/datasets/create/input/context/__init__.py +71 -0
  10. anemoi/datasets/create/input/context/field.py +54 -0
  11. anemoi/datasets/create/input/data_sources.py +2 -1
  12. anemoi/datasets/create/input/misc.py +0 -71
  13. anemoi/datasets/create/input/repeated_dates.py +0 -114
  14. anemoi/datasets/create/input/result/__init__.py +17 -0
  15. anemoi/datasets/create/input/{result.py → result/field.py} +10 -92
  16. anemoi/datasets/create/sources/accumulate.py +517 -0
  17. anemoi/datasets/create/sources/accumulate_utils/__init__.py +8 -0
  18. anemoi/datasets/create/sources/accumulate_utils/covering_intervals.py +221 -0
  19. anemoi/datasets/create/sources/accumulate_utils/field_to_interval.py +149 -0
  20. anemoi/datasets/create/sources/accumulate_utils/interval_generators.py +321 -0
  21. anemoi/datasets/create/sources/anemoi_dataset.py +46 -42
  22. anemoi/datasets/create/sources/constants.py +39 -38
  23. anemoi/datasets/create/sources/empty.py +26 -22
  24. anemoi/datasets/create/sources/forcings.py +29 -28
  25. anemoi/datasets/create/sources/grib.py +92 -72
  26. anemoi/datasets/create/sources/grib_index.py +102 -54
  27. anemoi/datasets/create/sources/hindcasts.py +56 -55
  28. anemoi/datasets/create/sources/legacy.py +10 -62
  29. anemoi/datasets/create/sources/mars.py +159 -154
  30. anemoi/datasets/create/sources/netcdf.py +28 -24
  31. anemoi/datasets/create/sources/opendap.py +28 -24
  32. anemoi/datasets/create/sources/recentre.py +42 -41
  33. anemoi/datasets/create/sources/repeated_dates.py +44 -0
  34. anemoi/datasets/create/sources/source.py +26 -48
  35. anemoi/datasets/create/sources/xarray_support/__init__.py +30 -24
  36. anemoi/datasets/create/sources/xarray_support/coordinates.py +1 -4
  37. anemoi/datasets/create/sources/xarray_support/field.py +4 -4
  38. anemoi/datasets/create/sources/xarray_support/flavour.py +2 -2
  39. anemoi/datasets/create/sources/xarray_support/patch.py +178 -5
  40. anemoi/datasets/create/sources/xarray_zarr.py +28 -24
  41. anemoi/datasets/create/sources/zenodo.py +43 -39
  42. anemoi/datasets/create/utils.py +0 -42
  43. anemoi/datasets/data/complement.py +26 -17
  44. anemoi/datasets/data/dataset.py +12 -0
  45. anemoi/datasets/data/grids.py +0 -152
  46. anemoi/datasets/data/masked.py +74 -13
  47. anemoi/datasets/data/missing.py +5 -0
  48. anemoi/datasets/data/rolling_average.py +141 -0
  49. anemoi/datasets/data/stores.py +7 -9
  50. anemoi/datasets/dates/__init__.py +2 -0
  51. anemoi/datasets/dumper.py +76 -0
  52. anemoi/datasets/grids.py +1 -178
  53. anemoi/datasets/schemas/recipe.json +131 -0
  54. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/METADATA +9 -6
  55. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/RECORD +59 -57
  56. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/WHEEL +1 -1
  57. anemoi/datasets/create/filter.py +0 -47
  58. anemoi/datasets/create/input/concat.py +0 -161
  59. anemoi/datasets/create/input/context.py +0 -86
  60. anemoi/datasets/create/input/empty.py +0 -53
  61. anemoi/datasets/create/input/filter.py +0 -117
  62. anemoi/datasets/create/input/function.py +0 -232
  63. anemoi/datasets/create/input/join.py +0 -129
  64. anemoi/datasets/create/input/pipe.py +0 -66
  65. anemoi/datasets/create/input/step.py +0 -173
  66. anemoi/datasets/create/input/template.py +0 -161
  67. anemoi/datasets/create/sources/accumulations.py +0 -1062
  68. anemoi/datasets/create/sources/accumulations2.py +0 -647
  69. anemoi/datasets/create/sources/tendencies.py +0 -198
  70. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/entry_points.txt +0 -0
  71. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/licenses/LICENSE +0 -0
  72. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/top_level.txt +0 -0
@@ -7,9 +7,12 @@
7
7
  # granted to it by virtue of its status as an intergovernmental organisation
8
8
  # nor does it submit to any jurisdiction.
9
9
 
10
+ import hashlib
11
+ import json
10
12
  import logging
11
13
  import os
12
14
  import sqlite3
15
+ from collections import defaultdict
13
16
  from collections.abc import Iterator
14
17
  from typing import Any
15
18
 
@@ -19,7 +22,8 @@ from anemoi.transform.flavour import RuleBasedFlavour
19
22
  from cachetools import LRUCache
20
23
  from earthkit.data.indexing.fieldlist import FieldArray
21
24
 
22
- from .legacy import legacy_source
25
+ from . import source_registry
26
+ from .legacy import LegacySource
23
27
 
24
28
  LOG = logging.getLogger(__name__)
25
29
 
@@ -45,8 +49,8 @@ class GribIndex:
45
49
  ----------
46
50
  database : str
47
51
  Path to the SQLite database file.
48
- keys : Optional[List[str] | str], optional
49
- List of keys or a string of keys to use for indexing, by default None.
52
+ keys : Optional[list[str] | str], optional
53
+ list of keys or a string of keys to use for indexing, by default None.
50
54
  flavour : Optional[str], optional
51
55
  Flavour configuration for mapping fields, by default None.
52
56
  update : bool, optional
@@ -160,7 +164,7 @@ class GribIndex:
160
164
 
161
165
  Returns
162
166
  -------
163
- List[str]
167
+ list[str]
164
168
  A list of metadata keys stored in the database.
165
169
  """
166
170
  self.cursor.execute("SELECT key FROM metadata_keys")
@@ -228,7 +232,7 @@ class GribIndex:
228
232
 
229
233
  Returns
230
234
  -------
231
- List[str]
235
+ list[str]
232
236
  A list of column names.
233
237
  """
234
238
  if self._columns is not None:
@@ -244,8 +248,8 @@ class GribIndex:
244
248
 
245
249
  Parameters
246
250
  ----------
247
- columns : List[str]
248
- List of column names to ensure in the table.
251
+ columns : list[str]
252
+ list of column names to ensure in the table.
249
253
  """
250
254
  assert self.update
251
255
 
@@ -363,7 +367,7 @@ class GribIndex:
363
367
 
364
368
  Returns
365
369
  -------
366
- List[dict]
370
+ list[dict]
367
371
  A list of GRIB2 parameter information.
368
372
  """
369
373
  if ("grib2", paramId) in self.cache:
@@ -523,8 +527,8 @@ class GribIndex:
523
527
 
524
528
  Parameters
525
529
  ----------
526
- dates : List[Any]
527
- List of dates to retrieve data for.
530
+ dates : list[Any]
531
+ list of dates to retrieve data for.
528
532
  **kwargs : Any
529
533
  Additional filtering criteria.
530
534
 
@@ -544,6 +548,9 @@ class GribIndex:
544
548
  params = dates
545
549
 
546
550
  for k, v in kwargs.items():
551
+ if k not in self._columns:
552
+ LOG.warning(f"Warning : {k} not in database columns, key discarded")
553
+ continue
547
554
  if isinstance(v, list):
548
555
  query += f" AND {k} IN ({', '.join('?' for _ in v)})"
549
556
  params.extend([str(_) for _ in v])
@@ -551,11 +558,14 @@ class GribIndex:
551
558
  query += f" AND {k} = ?"
552
559
  params.append(str(v))
553
560
 
554
- print("SELECT", query)
555
- print("SELECT", params)
561
+ print("SELECT (query)", query)
562
+ print("SELECT (params)", params)
556
563
 
557
564
  self.cursor.execute(query, params)
558
- for path_id, offset, length in self.cursor.fetchall():
565
+
566
+ fetch = self.cursor.fetchall()
567
+
568
+ for path_id, offset, length in fetch:
559
569
  if path_id in self.cache:
560
570
  file = self.cache[path_id]
561
571
  else:
@@ -569,44 +579,82 @@ class GribIndex:
569
579
  yield data
570
580
 
571
581
 
572
- @legacy_source(__file__)
573
- def execute(
574
- context: Any,
575
- dates: list[Any],
576
- indexdb: str,
577
- flavour: str | None = None,
578
- **kwargs: Any,
579
- ) -> FieldArray:
580
- """Execute the GRIB data retrieval process.
581
-
582
- Parameters
583
- ----------
584
- context : Any
585
- The execution context.
586
- dates : List[Any]
587
- List of dates to retrieve data for.
588
- indexdb : str
589
- Path to the GRIB index database.
590
- flavour : Optional[str], optional
591
- Flavour configuration for mapping fields, by default None.
592
- **kwargs : Any
593
- Additional filtering criteria.
594
-
595
- Returns
596
- -------
597
- FieldArray
598
- An array of retrieved GRIB fields.
599
- """
600
- index = GribIndex(indexdb)
601
- result = []
602
-
603
- if flavour is not None:
604
- flavour = RuleBasedFlavour(flavour)
605
-
606
- for grib in index.retrieve(dates, **kwargs):
607
- field = ekd.from_source("memory", grib)[0]
608
- if flavour:
609
- field = flavour.apply(field)
610
- result.append(field)
611
-
612
- return FieldArray(result)
582
+ @source_registry.register("grib-index")
583
+ class GribIndexSource(LegacySource):
584
+ @staticmethod
585
+ def _execute(
586
+ context: Any,
587
+ dates: list[Any],
588
+ indexdb: str,
589
+ flavour: str | None = None,
590
+ **kwargs: Any,
591
+ ) -> FieldArray:
592
+ """Execute the GRIB data retrieval process.
593
+
594
+ Parameters
595
+ ----------
596
+ context : Any
597
+ The execution context.
598
+ dates : List[Any]
599
+ List of dates to retrieve data for.
600
+ indexdb : str
601
+ Path to the GRIB index database.
602
+ flavour : Optional[str], optional
603
+ Flavour configuration for mapping fields, by default None.
604
+ **kwargs : Any
605
+ Additional filtering criteria.
606
+
607
+ Returns
608
+ -------
609
+ FieldArray
610
+ An array of retrieved GRIB fields.
611
+ """
612
+ index = GribIndex(indexdb)
613
+
614
+ if flavour is not None:
615
+ flavour = RuleBasedFlavour(flavour)
616
+
617
+ if hasattr(dates, "date_to_intervals"):
618
+ # When using accumulate source
619
+ full_requests = []
620
+ for d, interval in dates.intervals:
621
+ context.trace("🌧️", "interval:", interval)
622
+ valid_date, request, _ = dates._adjust_request_to_interval(interval, kwargs)
623
+ context.trace("🌧️", " request =", request)
624
+ full_requests.append(([valid_date], request))
625
+ else:
626
+ # Normal case, without accumulate source
627
+ full_requests = [(dates, kwargs)]
628
+
629
+ full_requests = factorise(full_requests)
630
+ context.trace("🌧️", f"number of (factorised) requests: {len(full_requests)}")
631
+ for valid_dates, request in full_requests:
632
+ context.trace("🌧️", f" dates: {valid_dates}, request: {request}")
633
+
634
+ result = []
635
+ for valid_dates, request in full_requests:
636
+ for grib in index.retrieve(valid_dates, **request):
637
+ field = ekd.from_source("memory", grib)[0]
638
+ if flavour:
639
+ field = flavour.apply(field)
640
+ result.append(field)
641
+
642
+ return FieldArray(result)
643
+
644
+
645
+ def factorise(lst):
646
+ """Factorise a list of (dates, request) tuples by merging dates with identical requests."""
647
+ content = dict()
648
+
649
+ d = defaultdict(list)
650
+ for dates, request in lst:
651
+ assert isinstance(request, dict), type(request)
652
+ key = hashlib.md5(json.dumps(request, sort_keys=True).encode()).hexdigest()
653
+ content[key] = request
654
+ d[key] += dates
655
+
656
+ res = []
657
+ for key, dates in d.items():
658
+ dates = list(sorted(set(dates)))
659
+ res.append((dates, content[key]))
660
+ return res
@@ -14,7 +14,8 @@ from earthkit.data.core.fieldlist import MultiFieldList
14
14
 
15
15
  from anemoi.datasets.create.sources.mars import mars
16
16
 
17
- from .legacy import legacy_source
17
+ from . import source_registry
18
+ from .legacy import LegacySource
18
19
 
19
20
  LOGGER = logging.getLogger(__name__)
20
21
 
@@ -37,57 +38,57 @@ def _to_list(x: list | tuple | Any) -> list[Any]:
37
38
  return [x]
38
39
 
39
40
 
40
- @legacy_source(__file__)
41
- def hindcasts(context: Any, dates: list[Any], **request: dict[str, Any]) -> MultiFieldList:
42
- """Generates hindcast requests based on the provided dates and request parameters.
43
-
44
- Parameters
45
- ----------
46
- context : Any
47
- The context containing the dates provider and trace method.
48
- dates : List[Any]
49
- A list of dates for which to generate hindcast requests.
50
- request : Dict[str, Any]
51
- Additional request parameters.
52
-
53
- Returns
54
- -------
55
- MultiFieldList
56
- A MultiFieldList containing the hindcast data.
57
- """
58
- from anemoi.datasets.dates import HindcastsDates
59
-
60
- provider = context.dates_provider
61
- assert isinstance(provider, HindcastsDates)
62
-
63
- context.trace("H️", f"hindcasts {len(dates)=}")
64
-
65
- request["param"] = _to_list(request["param"])
66
- request["step"] = _to_list(request.get("step", 0))
67
- request["step"] = [int(_) for _ in request["step"]]
68
-
69
- context.trace("H️", f"hindcast {request}")
70
-
71
- requests = []
72
- for d in dates:
73
- r = request.copy()
74
- hindcast = provider.mapping[d]
75
- r["hdate"] = hindcast.hdate.strftime("%Y-%m-%d")
76
- r["date"] = hindcast.refdate.strftime("%Y-%m-%d")
77
- r["time"] = hindcast.refdate.strftime("%H")
78
- r["step"] = hindcast.step
79
- requests.append(r)
80
-
81
- if len(requests) == 0:
82
- return MultiFieldList([])
83
-
84
- return mars(
85
- context,
86
- dates,
87
- *requests,
88
- date_key="hdate",
89
- request_already_using_valid_datetime=True,
90
- )
91
-
92
-
93
- execute = hindcasts
41
+ @source_registry.register("hindcasts")
42
+ class HindcastsSource(LegacySource):
43
+
44
+ @staticmethod
45
+ def _execute(context: Any, dates: list[Any], **request: dict[str, Any]) -> MultiFieldList:
46
+ """Generates hindcast requests based on the provided dates and request parameters.
47
+
48
+ Parameters
49
+ ----------
50
+ context : Any
51
+ The context containing the dates provider and trace method.
52
+ dates : List[Any]
53
+ A list of dates for which to generate hindcast requests.
54
+ request : Dict[str, Any]
55
+ Additional request parameters.
56
+
57
+ Returns
58
+ -------
59
+ MultiFieldList
60
+ A MultiFieldList containing the hindcast data.
61
+ """
62
+ from anemoi.datasets.dates import HindcastsDates
63
+
64
+ provider = context.dates_provider
65
+ assert isinstance(provider, HindcastsDates)
66
+
67
+ context.trace("H️", f"hindcasts {len(dates)=}")
68
+
69
+ request["param"] = _to_list(request["param"])
70
+ request["step"] = _to_list(request.get("step", 0))
71
+ request["step"] = [int(_) for _ in request["step"]]
72
+
73
+ context.trace("H️", f"hindcast {request}")
74
+
75
+ requests = []
76
+ for d in dates:
77
+ r = request.copy()
78
+ hindcast = provider.mapping[d]
79
+ r["hdate"] = hindcast.hdate.strftime("%Y-%m-%d")
80
+ r["date"] = hindcast.refdate.strftime("%Y-%m-%d")
81
+ r["time"] = hindcast.refdate.strftime("%H")
82
+ r["step"] = hindcast.step
83
+ requests.append(r)
84
+
85
+ if len(requests) == 0:
86
+ return MultiFieldList([])
87
+
88
+ return mars(
89
+ context,
90
+ dates,
91
+ *requests,
92
+ date_key="hdate",
93
+ request_already_using_valid_datetime=True,
94
+ )
@@ -8,16 +8,13 @@
8
8
  # nor does it submit to any jurisdiction.
9
9
 
10
10
 
11
- import inspect
12
11
  import logging
13
- import os
14
- from collections.abc import Callable
12
+ from abc import abstractmethod
15
13
  from typing import Any
16
14
 
17
- from anemoi.datasets.create.input.template import resolve
15
+ from anemoi.datasets.create.input.context import Context
18
16
 
19
17
  from ..source import Source
20
- from . import source_registry
21
18
 
22
19
  LOG = logging.getLogger(__name__)
23
20
 
@@ -27,7 +24,7 @@ class LegacySource(Source):
27
24
 
28
25
  Parameters
29
26
  ----------
30
- context : Any
27
+ context : Context
31
28
  The context in which the source is created.
32
29
  *args : tuple
33
30
  Positional arguments.
@@ -35,64 +32,15 @@ class LegacySource(Source):
35
32
  Keyword arguments.
36
33
  """
37
34
 
38
- def __init__(self, context: Any, *args: Any, **kwargs: Any) -> None:
35
+ def __init__(self, context: Context, *args: Any, **kwargs: Any) -> None:
39
36
  super().__init__(context, *args, **kwargs)
40
37
  self.args = args
41
38
  self.kwargs = kwargs
42
39
 
40
+ @staticmethod
41
+ @abstractmethod
42
+ def _execute(context, *args, **kwargs):
43
+ pass
43
44
 
44
- class legacy_source:
45
- """A decorator class for legacy sources.
46
-
47
- Parameters
48
- ----------
49
- name : str
50
- The name of the legacy source.
51
- """
52
-
53
- def __init__(self, name: str) -> None:
54
- name, _ = os.path.splitext(os.path.basename(name))
55
- self.name = name
56
-
57
- def __call__(self, execute: Callable) -> Callable:
58
- """Call method to wrap the execute function.
59
-
60
- Parameters
61
- ----------
62
- execute : function
63
- The execute function to be wrapped.
64
-
65
- Returns
66
- -------
67
- function
68
- The wrapped execute function.
69
- """
70
- this = self
71
- name = f"Legacy{self.name.title()}Source"
72
- source = ".".join([execute.__module__, execute.__name__])
73
-
74
- def execute_wrapper(self, dates) -> Any:
75
- """Wrapper method to call the execute function."""
76
-
77
- args, kwargs = resolve(self.context, (self.args, self.kwargs))
78
-
79
- try:
80
- return execute(self.context, dates, *args, **kwargs)
81
- except TypeError:
82
- LOG.error(f"Error executing source {this.name} from {source}")
83
- LOG.error(f"Function signature is: {inspect.signature(execute)}")
84
- LOG.error(f"Arguments are: {args=}, {kwargs=}")
85
- raise
86
-
87
- klass = type(
88
- name,
89
- (LegacySource,),
90
- {
91
- "execute": execute_wrapper,
92
- "_source": source,
93
- },
94
- )
95
-
96
- source_registry.register(self.name)(klass)
97
-
98
- return execute
45
+ def execute(self, dates: Any) -> Any:
46
+ return self._execute(self.context, dates, *self.args, **self.kwargs)