anemoi-datasets 0.4.5__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/create.py +3 -2
  3. anemoi/datasets/commands/inspect.py +1 -1
  4. anemoi/datasets/commands/publish.py +30 -0
  5. anemoi/datasets/create/__init__.py +72 -35
  6. anemoi/datasets/create/check.py +6 -0
  7. anemoi/datasets/create/config.py +4 -3
  8. anemoi/datasets/create/functions/filters/pressure_level_relative_humidity_to_specific_humidity.py +57 -0
  9. anemoi/datasets/create/functions/filters/pressure_level_specific_humidity_to_relative_humidity.py +57 -0
  10. anemoi/datasets/create/functions/filters/rename.py +2 -3
  11. anemoi/datasets/create/functions/filters/single_level_dewpoint_to_relative_humidity.py +54 -0
  12. anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_dewpoint.py +59 -0
  13. anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_specific_humidity.py +115 -0
  14. anemoi/datasets/create/functions/filters/single_level_specific_humidity_to_relative_humidity.py +390 -0
  15. anemoi/datasets/create/functions/filters/speeddir_to_uv.py +77 -0
  16. anemoi/datasets/create/functions/filters/uv_to_speeddir.py +55 -0
  17. anemoi/datasets/create/functions/sources/__init__.py +7 -1
  18. anemoi/datasets/create/functions/sources/accumulations.py +2 -0
  19. anemoi/datasets/create/functions/sources/grib.py +87 -2
  20. anemoi/datasets/create/functions/sources/hindcasts.py +14 -73
  21. anemoi/datasets/create/functions/sources/mars.py +9 -3
  22. anemoi/datasets/create/functions/sources/xarray/__init__.py +6 -1
  23. anemoi/datasets/create/functions/sources/xarray/coordinates.py +6 -1
  24. anemoi/datasets/create/functions/sources/xarray/field.py +20 -5
  25. anemoi/datasets/create/functions/sources/xarray/fieldlist.py +16 -16
  26. anemoi/datasets/create/functions/sources/xarray/flavour.py +126 -12
  27. anemoi/datasets/create/functions/sources/xarray/grid.py +106 -17
  28. anemoi/datasets/create/functions/sources/xarray/metadata.py +6 -12
  29. anemoi/datasets/create/functions/sources/xarray/time.py +1 -5
  30. anemoi/datasets/create/functions/sources/xarray/variable.py +10 -10
  31. anemoi/datasets/create/input/__init__.py +69 -0
  32. anemoi/datasets/create/input/action.py +123 -0
  33. anemoi/datasets/create/input/concat.py +92 -0
  34. anemoi/datasets/create/input/context.py +59 -0
  35. anemoi/datasets/create/input/data_sources.py +71 -0
  36. anemoi/datasets/create/input/empty.py +42 -0
  37. anemoi/datasets/create/input/filter.py +76 -0
  38. anemoi/datasets/create/input/function.py +122 -0
  39. anemoi/datasets/create/input/join.py +57 -0
  40. anemoi/datasets/create/input/misc.py +85 -0
  41. anemoi/datasets/create/input/pipe.py +33 -0
  42. anemoi/datasets/create/input/repeated_dates.py +217 -0
  43. anemoi/datasets/create/input/result.py +413 -0
  44. anemoi/datasets/create/input/step.py +99 -0
  45. anemoi/datasets/create/{template.py → input/template.py} +0 -42
  46. anemoi/datasets/create/persistent.py +1 -1
  47. anemoi/datasets/create/statistics/__init__.py +1 -1
  48. anemoi/datasets/create/utils.py +3 -0
  49. anemoi/datasets/create/zarr.py +4 -2
  50. anemoi/datasets/data/dataset.py +11 -1
  51. anemoi/datasets/data/debug.py +5 -1
  52. anemoi/datasets/data/masked.py +2 -2
  53. anemoi/datasets/data/rescale.py +147 -0
  54. anemoi/datasets/data/stores.py +20 -7
  55. anemoi/datasets/dates/__init__.py +113 -30
  56. anemoi/datasets/dates/groups.py +92 -19
  57. anemoi/datasets/fields.py +66 -0
  58. anemoi/datasets/utils/fields.py +47 -0
  59. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/METADATA +10 -19
  60. anemoi_datasets-0.5.5.dist-info/RECORD +121 -0
  61. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/WHEEL +1 -1
  62. anemoi/datasets/create/input.py +0 -1065
  63. anemoi_datasets-0.4.5.dist-info/RECORD +0 -96
  64. /anemoi/datasets/create/{trace.py → input/trace.py} +0 -0
  65. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/LICENSE +0 -0
  66. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/entry_points.txt +0 -0
  67. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,99 @@
1
+ # (C) Copyright 2024 ECMWF.
2
+ #
3
+ # This software is licensed under the terms of the Apache Licence Version 2.0
4
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5
+ # In applying this licence, ECMWF does not waive the privileges and immunities
6
+ # granted to it by virtue of its status as an intergovernmental organisation
7
+ # nor does it submit to any jurisdiction.
8
+ #
9
+ import logging
10
+ from copy import deepcopy
11
+
12
+ from anemoi.utils.dates import as_datetime as as_datetime
13
+ from anemoi.utils.dates import frequency_to_timedelta as frequency_to_timedelta
14
+
15
+ from anemoi.datasets.dates import DatesProvider as DatesProvider
16
+ from anemoi.datasets.fields import FieldArray as FieldArray
17
+ from anemoi.datasets.fields import NewValidDateTimeField as NewValidDateTimeField
18
+
19
+ from .action import Action
20
+ from .context import Context
21
+ from .misc import is_function
22
+ from .result import Result
23
+ from .template import notify_result
24
+ from .trace import trace_datasource
25
+ from .trace import trace_select
26
+
27
+ LOG = logging.getLogger(__name__)
28
+
29
+
30
+ class StepResult(Result):
31
+ def __init__(self, context, action_path, group_of_dates, action, upstream_result):
32
+ super().__init__(context, action_path, group_of_dates)
33
+ assert isinstance(upstream_result, Result), type(upstream_result)
34
+ self.upstream_result = upstream_result
35
+ self.action = action
36
+
37
+ @property
38
+ @notify_result
39
+ @trace_datasource
40
+ def datasource(self):
41
+ raise NotImplementedError(f"Not implemented in {self.__class__.__name__}")
42
+
43
+
44
+ class StepAction(Action):
45
+ result_class = None
46
+
47
+ def __init__(self, context, action_path, previous_step, *args, **kwargs):
48
+ super().__init__(context, action_path, *args, **kwargs)
49
+ self.previous_step = previous_step
50
+
51
+ @trace_select
52
+ def select(self, group_of_dates):
53
+ return self.result_class(
54
+ self.context,
55
+ self.action_path,
56
+ group_of_dates,
57
+ self,
58
+ self.previous_step.select(group_of_dates),
59
+ )
60
+
61
+ def __repr__(self):
62
+ return super().__repr__(self.previous_step, _inline_=str(self.kwargs))
63
+
64
+
65
+ def step_factory(config, context, action_path, previous_step):
66
+
67
+ from .filter import FilterStepAction
68
+ from .filter import FunctionStepAction
69
+
70
+ assert isinstance(context, Context), (type, context)
71
+ if not isinstance(config, dict):
72
+ raise ValueError(f"Invalid input config {config}")
73
+
74
+ config = deepcopy(config)
75
+ assert len(config) == 1, config
76
+
77
+ key = list(config.keys())[0]
78
+ cls = dict(
79
+ filter=FilterStepAction,
80
+ # rename=RenameAction,
81
+ # remapping=RemappingAction,
82
+ ).get(key)
83
+
84
+ if isinstance(config[key], list):
85
+ args, kwargs = config[key], {}
86
+
87
+ if isinstance(config[key], dict):
88
+ args, kwargs = [], config[key]
89
+
90
+ if isinstance(config[key], str):
91
+ args, kwargs = [config[key]], {}
92
+
93
+ if cls is None:
94
+ if not is_function(key, "filters"):
95
+ raise ValueError(f"Unknown step {key}")
96
+ cls = FunctionStepAction
97
+ args = [key] + args
98
+
99
+ return cls(context, action_path, previous_step, *args, **kwargs)
@@ -9,14 +9,8 @@
9
9
 
10
10
  import logging
11
11
  import re
12
- import textwrap
13
12
  from functools import wraps
14
13
 
15
- from anemoi.utils.humanize import plural
16
-
17
- from .trace import step
18
- from .trace import trace
19
-
20
14
  LOG = logging.getLogger(__name__)
21
15
 
22
16
 
@@ -30,42 +24,6 @@ def notify_result(method):
30
24
  return wrapper
31
25
 
32
26
 
33
- class Context:
34
- def __init__(self):
35
- # used_references is a set of reference paths that will be needed
36
- self.used_references = set()
37
- # results is a dictionary of reference path -> obj
38
- self.results = {}
39
-
40
- def will_need_reference(self, key):
41
- assert isinstance(key, (list, tuple)), key
42
- key = tuple(key)
43
- self.used_references.add(key)
44
-
45
- def notify_result(self, key, result):
46
- trace(
47
- "🎯",
48
- step(key),
49
- "notify result",
50
- textwrap.shorten(repr(result).replace(",", ", "), width=40),
51
- plural(len(result), "field"),
52
- )
53
- assert isinstance(key, (list, tuple)), key
54
- key = tuple(key)
55
- if key in self.used_references:
56
- if key in self.results:
57
- raise ValueError(f"Duplicate result {key}")
58
- self.results[key] = result
59
-
60
- def get_result(self, key):
61
- assert isinstance(key, (list, tuple)), key
62
- key = tuple(key)
63
- if key in self.results:
64
- return self.results[key]
65
- all_keys = sorted(list(self.results.keys()))
66
- raise ValueError(f"Cannot find result {key} in {all_keys}")
67
-
68
-
69
27
  class Substitution:
70
28
  pass
71
29
 
@@ -68,7 +68,7 @@ class PersistentDict:
68
68
  path = os.path.join(self.dirname, f"{h}.pickle")
69
69
 
70
70
  if os.path.exists(path):
71
- LOG.warn(f"{path} already exists")
71
+ LOG.warning(f"{path} already exists")
72
72
 
73
73
  tmp_path = path + f".tmp-{os.getpid()}-on-{socket.gethostname()}"
74
74
  with open(tmp_path, "wb") as f:
@@ -155,7 +155,7 @@ def compute_statistics(array, check_variables_names=None, allow_nans=False):
155
155
  check_data_values(values[j, :], name=name, allow_nans=allow_nans)
156
156
  if np.isnan(values[j, :]).all():
157
157
  # LOG.warning(f"All NaN values for {name} ({j}) for date {i}")
158
- raise ValueError(f"All NaN values for {name} ({j}) for date {i}")
158
+ LOG.warning(f"All NaN values for {name} ({j}) for date {i}")
159
159
 
160
160
  # Ignore NaN values
161
161
  minimum[i] = np.nanmin(values, axis=1)
@@ -62,6 +62,9 @@ def make_list_int(value):
62
62
 
63
63
 
64
64
  def normalize_and_check_dates(dates, start, end, frequency, dtype="datetime64[s]"):
65
+
66
+ dates = [d.hdate if hasattr(d, "hdate") else d for d in dates]
67
+
65
68
  assert isinstance(frequency, datetime.timedelta), frequency
66
69
  start = np.datetime64(start)
67
70
  end = np.datetime64(end)
@@ -128,7 +128,7 @@ class ZarrBuiltRegistry:
128
128
  def add_to_history(self, action, **kwargs):
129
129
  new = dict(
130
130
  action=action,
131
- timestamp=datetime.datetime.utcnow().isoformat(),
131
+ timestamp=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat(),
132
132
  )
133
133
  new.update(kwargs)
134
134
 
@@ -151,7 +151,9 @@ class ZarrBuiltRegistry:
151
151
 
152
152
  def set_flag(self, i, value=True):
153
153
  z = self._open_write()
154
- z.attrs["latest_write_timestamp"] = datetime.datetime.utcnow().isoformat()
154
+ z.attrs["latest_write_timestamp"] = (
155
+ datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()
156
+ )
155
157
  z["_build"][self.name_flags][i] = value
156
158
 
157
159
  def ready(self):
@@ -23,7 +23,11 @@ LOG = logging.getLogger(__name__)
23
23
  class Dataset:
24
24
  arguments = {}
25
25
 
26
- def mutate(self):
26
+ def mutate(self) -> "Dataset":
27
+ """
28
+ Give an opportunity to a subclass to return a new Dataset
29
+ object of a different class, if needed.
30
+ """
27
31
  return self
28
32
 
29
33
  def swap_with_parent(self, parent):
@@ -90,6 +94,12 @@ class Dataset:
90
94
  rename = kwargs.pop("rename")
91
95
  return Rename(self, rename)._subset(**kwargs).mutate()
92
96
 
97
+ if "rescale" in kwargs:
98
+ from .rescale import Rescale
99
+
100
+ rescale = kwargs.pop("rescale")
101
+ return Rescale(self, rescale)._subset(**kwargs).mutate()
102
+
93
103
  if "statistics" in kwargs:
94
104
  from ..data import open_dataset
95
105
  from .statistics import Statistics
@@ -209,10 +209,14 @@ def _debug_indexing(method):
209
209
  return wrapper
210
210
 
211
211
 
212
+ def _identity(x):
213
+ return x
214
+
215
+
212
216
  if DEBUG_ZARR_INDEXING:
213
217
  debug_indexing = _debug_indexing
214
218
  else:
215
- debug_indexing = lambda x: x # noqa
219
+ debug_indexing = _identity
216
220
 
217
221
 
218
222
  def debug_zarr_loading(on_off):
@@ -112,5 +112,5 @@ class Cropping(Masked):
112
112
  def tree(self):
113
113
  return Node(self, [self.forward.tree()], area=self.area)
114
114
 
115
- def metadata_specific(self, **kwargs):
116
- return super().metadata_specific(area=self.area, **kwargs)
115
+ def subclass_metadata_specific(self):
116
+ return dict(area=self.area)
@@ -0,0 +1,147 @@
1
+ # (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
2
+ # This software is licensed under the terms of the Apache Licence Version 2.0
3
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
4
+ # In applying this licence, ECMWF does not waive the privileges and immunities
5
+ # granted to it by virtue of its status as an intergovernmental organisation
6
+ # nor does it submit to any jurisdiction.
7
+
8
+ import logging
9
+ from functools import cached_property
10
+
11
+ import numpy as np
12
+
13
+ from .debug import Node
14
+ from .debug import debug_indexing
15
+ from .forwards import Forwards
16
+ from .indexing import apply_index_to_slices_changes
17
+ from .indexing import expand_list_indexing
18
+ from .indexing import index_to_slices
19
+ from .indexing import update_tuple
20
+
21
+ LOG = logging.getLogger(__name__)
22
+
23
+
24
+ def make_rescale(variable, rescale):
25
+
26
+ if isinstance(rescale, (tuple, list)):
27
+
28
+ assert len(rescale) == 2, rescale
29
+
30
+ if isinstance(rescale[0], (int, float)):
31
+ return rescale
32
+
33
+ from cfunits import Units
34
+
35
+ u0 = Units(rescale[0])
36
+ u1 = Units(rescale[1])
37
+
38
+ x1, x2 = 0.0, 1.0
39
+ y1, y2 = Units.conform([x1, x2], u0, u1)
40
+
41
+ a = (y2 - y1) / (x2 - x1)
42
+ b = y1 - a * x1
43
+
44
+ return a, b
45
+
46
+ return rescale
47
+
48
+ if isinstance(rescale, dict):
49
+ assert "scale" in rescale, rescale
50
+ assert "offset" in rescale, rescale
51
+ return rescale["scale"], rescale["offset"]
52
+
53
+ assert False
54
+
55
+
56
+ class Rescale(Forwards):
57
+ def __init__(self, dataset, rescale):
58
+ super().__init__(dataset)
59
+ for n in rescale:
60
+ assert n in dataset.variables, n
61
+
62
+ variables = dataset.variables
63
+
64
+ self._a = np.ones(len(variables))
65
+ self._b = np.zeros(len(variables))
66
+
67
+ self.rescale = {}
68
+ for i, v in enumerate(variables):
69
+ if v in rescale:
70
+ a, b = make_rescale(v, rescale[v])
71
+ self.rescale[v] = a, b
72
+ self._a[i], self._b[i] = a, b
73
+
74
+ self._a = self._a[np.newaxis, :, np.newaxis, np.newaxis]
75
+ self._b = self._b[np.newaxis, :, np.newaxis, np.newaxis]
76
+
77
+ self._a = self._a.astype(self.forward.dtype)
78
+ self._b = self._b.astype(self.forward.dtype)
79
+
80
+ def tree(self):
81
+ return Node(self, [self.forward.tree()], rescale=self.rescale)
82
+
83
+ def subclass_metadata_specific(self):
84
+ return dict(rescale=self.rescale)
85
+
86
+ @debug_indexing
87
+ @expand_list_indexing
88
+ def _get_tuple(self, index):
89
+ index, changes = index_to_slices(index, self.shape)
90
+ index, previous = update_tuple(index, 1, slice(None))
91
+ result = self.forward[index]
92
+ result = result * self._a + self._b
93
+ result = result[:, previous]
94
+ result = apply_index_to_slices_changes(result, changes)
95
+ return result
96
+
97
+ @debug_indexing
98
+ def __get_slice_(self, n):
99
+ data = self.forward[n]
100
+ return data * self._a + self._b
101
+
102
+ @debug_indexing
103
+ def __getitem__(self, n):
104
+
105
+ if isinstance(n, tuple):
106
+ return self._get_tuple(n)
107
+
108
+ if isinstance(n, slice):
109
+ return self.__get_slice_(n)
110
+
111
+ data = self.forward[n]
112
+
113
+ return data * self._a[0] + self._b[0]
114
+
115
+ @cached_property
116
+ def statistics(self):
117
+ result = {}
118
+ a = self._a.squeeze()
119
+ assert np.all(a >= 0)
120
+
121
+ b = self._b.squeeze()
122
+ for k, v in self.forward.statistics.items():
123
+ if k in ("maximum", "minimum", "mean"):
124
+ result[k] = v * a + b
125
+ continue
126
+
127
+ if k in ("stdev",):
128
+ result[k] = v * a
129
+ continue
130
+
131
+ raise NotImplementedError("rescale statistics", k)
132
+
133
+ return result
134
+
135
+ def statistics_tendencies(self, delta=None):
136
+ result = {}
137
+ a = self._a.squeeze()
138
+ assert np.all(a >= 0)
139
+
140
+ for k, v in self.forward.statistics_tendencies(delta).items():
141
+ if k in ("maximum", "minimum", "mean", "stdev"):
142
+ result[k] = v * a
143
+ continue
144
+
145
+ raise NotImplementedError("rescale tendencies statistics", k)
146
+
147
+ return result
@@ -5,6 +5,7 @@
5
5
  # granted to it by virtue of its status as an intergovernmental organisation
6
6
  # nor does it submit to any jurisdiction.
7
7
 
8
+
8
9
  import logging
9
10
  import os
10
11
  import warnings
@@ -83,6 +84,8 @@ class S3Store(ReadOnlyStore):
83
84
 
84
85
 
85
86
  class DebugStore(ReadOnlyStore):
87
+ """A store to debug the zarr loading."""
88
+
86
89
  def __init__(self, store):
87
90
  assert not isinstance(store, DebugStore)
88
91
  self.store = store
@@ -148,6 +151,8 @@ def open_zarr(path, dont_fail=False, cache=None):
148
151
 
149
152
 
150
153
  class Zarr(Dataset):
154
+ """A zarr dataset."""
155
+
151
156
  def __init__(self, path):
152
157
  if isinstance(path, zarr.hierarchy.Group):
153
158
  self.was_zarr = True
@@ -244,14 +249,20 @@ class Zarr(Dataset):
244
249
  delta = self.frequency
245
250
  if isinstance(delta, int):
246
251
  delta = f"{delta}h"
247
- from anemoi.datasets.create.loaders import TendenciesStatisticsAddition
252
+ from anemoi.utils.dates import frequency_to_string
253
+ from anemoi.utils.dates import frequency_to_timedelta
254
+
255
+ delta = frequency_to_timedelta(delta)
256
+ delta = frequency_to_string(delta)
257
+
258
+ def func(k):
259
+ return f"statistics_tendencies_{delta}_{k}"
248
260
 
249
- func = TendenciesStatisticsAddition.final_storage_name_from_delta
250
261
  return dict(
251
- mean=self.z[func("mean", delta)][:],
252
- stdev=self.z[func("stdev", delta)][:],
253
- maximum=self.z[func("maximum", delta)][:],
254
- minimum=self.z[func("minimum", delta)][:],
262
+ mean=self.z[func("mean")][:],
263
+ stdev=self.z[func("stdev")][:],
264
+ maximum=self.z[func("maximum")][:],
265
+ minimum=self.z[func("minimum")][:],
255
266
  )
256
267
 
257
268
  @property
@@ -322,11 +333,13 @@ class Zarr(Dataset):
322
333
 
323
334
 
324
335
  class ZarrWithMissingDates(Zarr):
336
+ """A zarr dataset with missing dates."""
337
+
325
338
  def __init__(self, path):
326
339
  super().__init__(path)
327
340
 
328
341
  missing_dates = self.z.attrs.get("missing_dates", [])
329
- missing_dates = [np.datetime64(x) for x in missing_dates]
342
+ missing_dates = set([np.datetime64(x) for x in missing_dates])
330
343
  self.missing_to_dates = {i: d for i, d in enumerate(self.dates) if d in missing_dates}
331
344
  self.missing = set(self.missing_to_dates)
332
345