anemoi-datasets 0.5.27__py3-none-any.whl → 0.5.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72):
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/recipe/__init__.py +93 -0
  3. anemoi/datasets/commands/recipe/format.py +55 -0
  4. anemoi/datasets/commands/recipe/migrate.py +555 -0
  5. anemoi/datasets/create/__init__.py +46 -13
  6. anemoi/datasets/create/config.py +52 -53
  7. anemoi/datasets/create/input/__init__.py +43 -63
  8. anemoi/datasets/create/input/action.py +296 -236
  9. anemoi/datasets/create/input/context/__init__.py +71 -0
  10. anemoi/datasets/create/input/context/field.py +54 -0
  11. anemoi/datasets/create/input/data_sources.py +2 -1
  12. anemoi/datasets/create/input/misc.py +0 -71
  13. anemoi/datasets/create/input/repeated_dates.py +0 -114
  14. anemoi/datasets/create/input/result/__init__.py +17 -0
  15. anemoi/datasets/create/input/{result.py → result/field.py} +10 -92
  16. anemoi/datasets/create/sources/accumulate.py +517 -0
  17. anemoi/datasets/create/sources/accumulate_utils/__init__.py +8 -0
  18. anemoi/datasets/create/sources/accumulate_utils/covering_intervals.py +221 -0
  19. anemoi/datasets/create/sources/accumulate_utils/field_to_interval.py +149 -0
  20. anemoi/datasets/create/sources/accumulate_utils/interval_generators.py +321 -0
  21. anemoi/datasets/create/sources/anemoi_dataset.py +46 -42
  22. anemoi/datasets/create/sources/constants.py +39 -38
  23. anemoi/datasets/create/sources/empty.py +26 -22
  24. anemoi/datasets/create/sources/forcings.py +29 -28
  25. anemoi/datasets/create/sources/grib.py +92 -72
  26. anemoi/datasets/create/sources/grib_index.py +102 -54
  27. anemoi/datasets/create/sources/hindcasts.py +56 -55
  28. anemoi/datasets/create/sources/legacy.py +10 -62
  29. anemoi/datasets/create/sources/mars.py +159 -154
  30. anemoi/datasets/create/sources/netcdf.py +28 -24
  31. anemoi/datasets/create/sources/opendap.py +28 -24
  32. anemoi/datasets/create/sources/recentre.py +42 -41
  33. anemoi/datasets/create/sources/repeated_dates.py +44 -0
  34. anemoi/datasets/create/sources/source.py +26 -48
  35. anemoi/datasets/create/sources/xarray_support/__init__.py +30 -24
  36. anemoi/datasets/create/sources/xarray_support/coordinates.py +1 -4
  37. anemoi/datasets/create/sources/xarray_support/field.py +4 -4
  38. anemoi/datasets/create/sources/xarray_support/flavour.py +2 -2
  39. anemoi/datasets/create/sources/xarray_support/patch.py +178 -5
  40. anemoi/datasets/create/sources/xarray_zarr.py +28 -24
  41. anemoi/datasets/create/sources/zenodo.py +43 -39
  42. anemoi/datasets/create/utils.py +0 -42
  43. anemoi/datasets/data/complement.py +26 -17
  44. anemoi/datasets/data/dataset.py +12 -0
  45. anemoi/datasets/data/grids.py +0 -152
  46. anemoi/datasets/data/masked.py +74 -13
  47. anemoi/datasets/data/missing.py +5 -0
  48. anemoi/datasets/data/rolling_average.py +141 -0
  49. anemoi/datasets/data/stores.py +7 -9
  50. anemoi/datasets/dates/__init__.py +2 -0
  51. anemoi/datasets/dumper.py +76 -0
  52. anemoi/datasets/grids.py +1 -178
  53. anemoi/datasets/schemas/recipe.json +131 -0
  54. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/METADATA +9 -6
  55. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/RECORD +59 -57
  56. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/WHEEL +1 -1
  57. anemoi/datasets/create/filter.py +0 -47
  58. anemoi/datasets/create/input/concat.py +0 -161
  59. anemoi/datasets/create/input/context.py +0 -86
  60. anemoi/datasets/create/input/empty.py +0 -53
  61. anemoi/datasets/create/input/filter.py +0 -117
  62. anemoi/datasets/create/input/function.py +0 -232
  63. anemoi/datasets/create/input/join.py +0 -129
  64. anemoi/datasets/create/input/pipe.py +0 -66
  65. anemoi/datasets/create/input/step.py +0 -173
  66. anemoi/datasets/create/input/template.py +0 -161
  67. anemoi/datasets/create/sources/accumulations.py +0 -1062
  68. anemoi/datasets/create/sources/accumulations2.py +0 -647
  69. anemoi/datasets/create/sources/tendencies.py +0 -198
  70. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/entry_points.txt +0 -0
  71. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/licenses/LICENSE +0 -0
  72. {anemoi_datasets-0.5.27.dist-info → anemoi_datasets-0.5.29.dist-info}/top_level.txt +0 -0
@@ -256,8 +256,7 @@ class Dataset:
256
256
  resolution: str,
257
257
  dates: list[datetime.datetime],
258
258
  frequency: datetime.timedelta,
259
- raise_exception: bool = True,
260
- is_test: bool = False,
259
+ raise_exception: bool = False,
261
260
  ) -> None:
262
261
  """Check the name of the dataset.
263
262
 
@@ -271,15 +270,13 @@ class Dataset:
271
270
  The frequency of the dataset.
272
271
  raise_exception : bool, optional
273
272
  Whether to raise an exception if the name is invalid.
274
- is_test : bool, optional
275
- Whether this is a test.
276
273
  """
277
274
  basename, _ = os.path.splitext(os.path.basename(self.path))
278
275
  try:
279
276
  DatasetName(basename, resolution, dates[0], dates[-1], frequency).raise_if_not_valid()
280
277
  except Exception as e:
281
- if raise_exception and not is_test:
282
- raise e
278
+ if raise_exception:
279
+ raise
283
280
  else:
284
281
  LOG.warning(f"Dataset name error: {e}")
285
282
 
@@ -577,7 +574,6 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
577
574
  use_threads: bool = False,
578
575
  statistics_temp_dir: str | None = None,
579
576
  progress: Any = None,
580
- test: bool = False,
581
577
  cache: str | None = None,
582
578
  **kwargs: Any,
583
579
  ):
@@ -599,8 +595,6 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
599
595
  The directory for temporary statistics.
600
596
  progress : Any, optional
601
597
  The progress indicator.
602
- test : bool, optional
603
- Whether this is a test.
604
598
  cache : Optional[str], optional
605
599
  The cache directory.
606
600
  """
@@ -613,9 +607,8 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
613
607
  self.use_threads = use_threads
614
608
  self.statistics_temp_dir = statistics_temp_dir
615
609
  self.progress = progress
616
- self.test = test
617
610
 
618
- self.main_config = loader_config(config, is_test=test)
611
+ self.main_config = loader_config(config)
619
612
 
620
613
  # self.registry.delete() ??
621
614
  self.tmp_statistics.delete()
@@ -748,7 +741,6 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
748
741
 
749
742
  self.dataset.check_name(
750
743
  raise_exception=self.check_name,
751
- is_test=self.test,
752
744
  resolution=resolution,
753
745
  dates=dates,
754
746
  frequency=frequency,
@@ -865,7 +857,7 @@ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
865
857
  # assert isinstance(group[0], datetime.datetime), type(group[0])
866
858
  LOG.debug(f"Building data for group {igroup}/{self.n_groups}")
867
859
 
868
- result = self.input.select(group_of_dates=group)
860
+ result = self.input.select(argument=group)
869
861
  assert result.group_of_dates == group, (len(result.group_of_dates), len(group), group)
870
862
 
871
863
  # There are several groups.
@@ -1616,3 +1608,44 @@ def creator_factory(name: str, trace: str | None = None, **kwargs: Any) -> Any:
1616
1608
  )[name]
1617
1609
  LOG.debug(f"Creating {cls.__name__} with {kwargs}")
1618
1610
  return cls(**kwargs)
1611
+
1612
+
1613
+ def validate_config(config: Any) -> None:
1614
+
1615
+ import json
1616
+
1617
+ import jsonschema
1618
+
1619
+ def _tidy(d):
1620
+ if isinstance(d, dict):
1621
+ return {k: _tidy(v) for k, v in d.items()}
1622
+
1623
+ if isinstance(d, list):
1624
+ return [_tidy(v) for v in d if v is not None]
1625
+
1626
+ # jsonschema does not support datetime.date
1627
+ if isinstance(d, datetime.datetime):
1628
+ return d.isoformat()
1629
+
1630
+ if isinstance(d, datetime.date):
1631
+ return d.isoformat()
1632
+
1633
+ return d
1634
+
1635
+ # https://json-schema.org
1636
+
1637
+ with open(
1638
+ os.path.join(
1639
+ os.path.dirname(os.path.dirname(__file__)),
1640
+ "schemas",
1641
+ "recipe.json",
1642
+ )
1643
+ ) as f:
1644
+ schema = json.load(f)
1645
+
1646
+ try:
1647
+ jsonschema.validate(instance=_tidy(config), schema=schema)
1648
+ except jsonschema.exceptions.ValidationError as e:
1649
+ LOG.error("❌ Config validation failed (jsonschema):")
1650
+ LOG.error(e.message)
1651
+ raise
@@ -18,8 +18,6 @@ from anemoi.utils.config import DotDict
18
18
  from anemoi.utils.config import load_any_dict_format
19
19
  from earthkit.data.core.order import normalize_order_by
20
20
 
21
- from anemoi.datasets.dates.groups import Groups
22
-
23
21
  LOG = logging.getLogger(__name__)
24
22
 
25
23
 
@@ -279,6 +277,8 @@ class LoadersConfig(Config):
279
277
 
280
278
  self.output.order_by = normalize_order_by(self.output.order_by)
281
279
 
280
+ self.setdefault("dates", Config())
281
+
282
282
  self.dates["group_by"] = self.build.group_by
283
283
 
284
284
  ###########
@@ -338,61 +338,13 @@ def _prepare_serialisation(o: Any) -> Any:
338
338
  return str(o)
339
339
 
340
340
 
341
- def set_to_test_mode(cfg: dict) -> None:
342
- """Modifies the configuration to run in test mode.
343
-
344
- Parameters
345
- ----------
346
- cfg : dict
347
- The configuration dictionary.
348
- """
349
- NUMBER_OF_DATES = 4
350
-
351
- LOG.warning(f"Running in test mode. Changing the list of dates to use only {NUMBER_OF_DATES}.")
352
- groups = Groups(**LoadersConfig(cfg).dates)
353
-
354
- dates = groups.provider.values
355
- cfg["dates"] = dict(
356
- start=dates[0],
357
- end=dates[NUMBER_OF_DATES - 1],
358
- frequency=groups.provider.frequency,
359
- group_by=NUMBER_OF_DATES,
360
- )
361
-
362
- def set_element_to_test(obj):
363
- if isinstance(obj, (list, tuple)):
364
- for v in obj:
365
- set_element_to_test(v)
366
- return
367
- if isinstance(obj, (dict, DotDict)):
368
- if "grid" in obj:
369
- previous = obj["grid"]
370
- obj["grid"] = "20./20."
371
- LOG.warning(f"Running in test mode. Setting grid to {obj['grid']} instead of {previous}")
372
- if "number" in obj:
373
- if isinstance(obj["number"], (list, tuple)):
374
- previous = obj["number"]
375
- obj["number"] = previous[0:3]
376
- LOG.warning(f"Running in test mode. Setting number to {obj['number']} instead of {previous}")
377
- for k, v in obj.items():
378
- set_element_to_test(v)
379
- if "constants" in obj:
380
- constants = obj["constants"]
381
- if "param" in constants and isinstance(constants["param"], list):
382
- constants["param"] = ["cos_latitude"]
383
-
384
- set_element_to_test(cfg)
385
-
386
-
387
- def loader_config(config: dict, is_test: bool = False) -> LoadersConfig:
341
+ def loader_config(config: dict) -> LoadersConfig:
388
342
  """Loads and validates the configuration for dataset loaders.
389
343
 
390
344
  Parameters
391
345
  ----------
392
346
  config : dict
393
347
  The configuration dictionary.
394
- is_test : bool, optional
395
- Whether to run in test mode. Defaults to False.
396
348
 
397
349
  Returns
398
350
  -------
@@ -400,8 +352,6 @@ def loader_config(config: dict, is_test: bool = False) -> LoadersConfig:
400
352
  The validated configuration object.
401
353
  """
402
354
  config = Config(config)
403
- if is_test:
404
- set_to_test_mode(config)
405
355
  obj = LoadersConfig(config)
406
356
 
407
357
  # yaml round trip to check that serialisation works as expected
@@ -422,6 +372,9 @@ def loader_config(config: dict, is_test: bool = False) -> LoadersConfig:
422
372
  LOG.info(f"Setting env variable {k}={v}")
423
373
  os.environ[k] = str(v)
424
374
 
375
+ # Used by pytest only
376
+ # copy.pop('checks', None)
377
+
425
378
  return copy
426
379
 
427
380
 
@@ -441,3 +394,49 @@ def build_output(*args, **kwargs) -> OutputSpecs:
441
394
  The output specifications object.
442
395
  """
443
396
  return OutputSpecs(*args, **kwargs)
397
+
398
+
399
+ def flatten_list_of_sets(list_of_sets: list[set]) -> set:
400
+ return {element for subset in list_of_sets for element in subset}
401
+
402
+
403
+ def mars_str_to_set(s: str) -> set[str]:
404
+ """Mars strings are like 1/to/2 or 1/to/2/by/1
405
+
406
+ Returns a set of strings, e.g. {'1', '2'}
407
+ """
408
+ assert "/" in s, "mars_str_to_set expects a string with '/'"
409
+ lst = s.split("/")
410
+ assert len(lst) in (3, 5), f"mars_str_to_set expects a string like 1/to/2 or 1/to/4/by/1, got {s}"
411
+ if len(lst) == 3:
412
+ assert "to" in lst
413
+ start, _, end = lst
414
+ step = 1
415
+ elif len(lst) == 5:
416
+ assert "by" in lst and "to" in lst
417
+ start, _, end, _, step = lst
418
+ return {str(i) for i in range(int(start), int(end) + 1, int(step))}
419
+
420
+
421
+ def get_ensembles_set(obj):
422
+ """Counts the number of ensembles in the configuration."""
423
+ if isinstance(obj, dict):
424
+ if "number" in obj:
425
+ if isinstance(obj["number"], (list, tuple)):
426
+ return set([str(element) for element in obj["number"]])
427
+ if isinstance(obj["number"], (str, int)):
428
+ if "/" in str(obj["number"]):
429
+ return mars_str_to_set(obj["number"])
430
+ else:
431
+ return {str(obj["number"])}
432
+ if isinstance(obj, (dict)):
433
+ return flatten_list_of_sets([get_ensembles_set(v) for v in obj.values()])
434
+ if isinstance(obj, (list, tuple)):
435
+ return flatten_list_of_sets([get_ensembles_set(v) for v in obj])
436
+ return {}
437
+
438
+
439
+ def count_ensembles(config: Config) -> int:
440
+ """Counts the number of ensembles in the configuration."""
441
+ ensembles = get_ensembles_set(config.input)
442
+ return len(ensembles) if ensembles else 1
@@ -1,4 +1,4 @@
1
- # (C) Copyright 2024 Anemoi contributors.
1
+ # (C) Copyright 2024-2025 Anemoi contributors.
2
2
  #
3
3
  # This software is licensed under the terms of the Apache Licence Version 2.0
4
4
  # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
@@ -7,21 +7,15 @@
7
7
  # granted to it by virtue of its status as an intergovernmental organisation
8
8
  # nor does it submit to any jurisdiction.
9
9
 
10
- import logging
11
10
  from copy import deepcopy
11
+ from functools import cached_property
12
+ from typing import TYPE_CHECKING
12
13
  from typing import Any
13
14
 
14
- from anemoi.datasets.dates.groups import GroupOfDates
15
+ from anemoi.datasets.create.input.context.field import FieldContext
15
16
 
16
- from .trace import trace_select
17
-
18
- LOG = logging.getLogger(__name__)
19
-
20
-
21
- class Context:
22
- """Context for building input data."""
23
-
24
- pass
17
+ if TYPE_CHECKING:
18
+ from anemoi.datasets.create.input.action import Recipe
25
19
 
26
20
 
27
21
  class InputBuilder:
@@ -34,72 +28,58 @@ class InputBuilder:
34
28
  ----------
35
29
  config : dict
36
30
  Configuration dictionary.
37
- data_sources : Union[dict, list]
31
+ data_sources : dict
38
32
  Data sources.
39
33
  **kwargs : Any
40
34
  Additional keyword arguments.
41
35
  """
42
36
  self.kwargs = kwargs
37
+ self.config = deepcopy(config)
38
+ self.data_sources = deepcopy(dict(data_sources=data_sources))
43
39
 
44
- config = deepcopy(config)
45
- if data_sources:
46
- config = dict(
47
- data_sources=dict(
48
- sources=data_sources,
49
- input=config,
50
- )
51
- )
52
- self.config = config
53
- self.action_path = ["input"]
54
-
55
- @trace_select
56
- def select(self, group_of_dates: GroupOfDates) -> Any:
57
- """Select data based on the group of dates.
58
-
59
- Parameters
60
- ----------
61
- group_of_dates : GroupOfDates
62
- Group of dates to select data for.
63
-
64
- Returns
65
- -------
66
- Any
67
- Selected data.
68
- """
69
- from .action import ActionContext
40
+ @cached_property
41
+ def action(self) -> "Recipe":
42
+ """Returns the action object based on the configuration."""
43
+ from .action import Recipe
70
44
  from .action import action_factory
71
45
 
72
- """This changes the context."""
73
- context = ActionContext(**self.kwargs)
74
- action = action_factory(self.config, context, self.action_path)
75
- return action.select(group_of_dates)
76
-
77
- def __repr__(self) -> str:
78
- """Return a string representation of the InputBuilder.
79
-
80
- Returns
81
- -------
82
- str
83
- String representation.
84
- """
85
- from .action import ActionContext
86
- from .action import action_factory
46
+ sources = action_factory(self.data_sources, "data_sources")
47
+ input = action_factory(self.config, "input")
87
48
 
88
- context = ActionContext(**self.kwargs)
89
- a = action_factory(self.config, context, self.action_path)
90
- return repr(a)
49
+ return Recipe(input, sources)
91
50
 
92
- def _trace_select(self, group_of_dates: GroupOfDates) -> str:
93
- """Trace the select operation.
51
+ def select(self, argument) -> Any:
52
+ """Select data based on the group of dates.
94
53
 
95
54
  Parameters
96
55
  ----------
97
- group_of_dates : GroupOfDates
56
+ argument : GroupOfDates
98
57
  Group of dates to select data for.
99
58
 
100
59
  Returns
101
60
  -------
102
- str
103
- Trace string.
61
+ Any
62
+ Selected data.
104
63
  """
105
- return f"InputBuilder({group_of_dates})"
64
+ context = FieldContext(argument, **self.kwargs)
65
+ return context.create_result(self.action(context, argument))
66
+
67
+
68
+ def build_input(config: dict, data_sources: dict | list, **kwargs: Any) -> InputBuilder:
69
+ """Build an InputBuilder instance.
70
+
71
+ Parameters
72
+ ----------
73
+ config : dict
74
+ Configuration dictionary.
75
+ data_sources : Union[dict, list]
76
+ Data sources.
77
+ **kwargs : Any
78
+ Additional keyword arguments.
79
+
80
+ Returns
81
+ -------
82
+ InputBuilder
83
+ An instance of InputBuilder.
84
+ """
85
+ return InputBuilder(config, data_sources, **kwargs)