anemoi-datasets 0.5.24__py3-none-any.whl → 0.5.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/finalise-additions.py +2 -1
  3. anemoi/datasets/commands/finalise.py +2 -1
  4. anemoi/datasets/commands/grib-index.py +1 -1
  5. anemoi/datasets/commands/init-additions.py +2 -1
  6. anemoi/datasets/commands/load-additions.py +2 -1
  7. anemoi/datasets/commands/load.py +2 -1
  8. anemoi/datasets/create/__init__.py +24 -33
  9. anemoi/datasets/create/filter.py +22 -24
  10. anemoi/datasets/create/input/__init__.py +0 -20
  11. anemoi/datasets/create/input/step.py +2 -16
  12. anemoi/datasets/create/sources/accumulations.py +7 -6
  13. anemoi/datasets/create/sources/planetary_computer.py +44 -0
  14. anemoi/datasets/create/sources/xarray_support/__init__.py +6 -22
  15. anemoi/datasets/create/sources/xarray_support/coordinates.py +8 -0
  16. anemoi/datasets/create/sources/xarray_support/field.py +1 -4
  17. anemoi/datasets/create/sources/xarray_support/flavour.py +44 -6
  18. anemoi/datasets/create/sources/xarray_support/patch.py +44 -1
  19. anemoi/datasets/create/sources/xarray_support/variable.py +6 -2
  20. anemoi/datasets/data/complement.py +44 -10
  21. anemoi/datasets/data/dataset.py +29 -0
  22. anemoi/datasets/data/forwards.py +8 -2
  23. anemoi/datasets/data/misc.py +74 -16
  24. anemoi/datasets/data/observations/__init__.py +316 -0
  25. anemoi/datasets/data/observations/legacy_obs_dataset.py +200 -0
  26. anemoi/datasets/data/observations/multi.py +64 -0
  27. anemoi/datasets/data/padded.py +227 -0
  28. anemoi/datasets/data/records/__init__.py +442 -0
  29. anemoi/datasets/data/records/backends/__init__.py +157 -0
  30. anemoi/datasets/data/stores.py +7 -56
  31. anemoi/datasets/data/subset.py +5 -0
  32. anemoi/datasets/grids.py +6 -3
  33. {anemoi_datasets-0.5.24.dist-info → anemoi_datasets-0.5.26.dist-info}/METADATA +3 -2
  34. {anemoi_datasets-0.5.24.dist-info → anemoi_datasets-0.5.26.dist-info}/RECORD +38 -51
  35. {anemoi_datasets-0.5.24.dist-info → anemoi_datasets-0.5.26.dist-info}/WHEEL +1 -1
  36. anemoi/datasets/create/filters/__init__.py +0 -33
  37. anemoi/datasets/create/filters/empty.py +0 -37
  38. anemoi/datasets/create/filters/legacy.py +0 -93
  39. anemoi/datasets/create/filters/noop.py +0 -37
  40. anemoi/datasets/create/filters/orog_to_z.py +0 -58
  41. anemoi/datasets/create/filters/pressure_level_relative_humidity_to_specific_humidity.py +0 -83
  42. anemoi/datasets/create/filters/pressure_level_specific_humidity_to_relative_humidity.py +0 -84
  43. anemoi/datasets/create/filters/rename.py +0 -205
  44. anemoi/datasets/create/filters/rotate_winds.py +0 -105
  45. anemoi/datasets/create/filters/single_level_dewpoint_to_relative_humidity.py +0 -78
  46. anemoi/datasets/create/filters/single_level_relative_humidity_to_dewpoint.py +0 -84
  47. anemoi/datasets/create/filters/single_level_relative_humidity_to_specific_humidity.py +0 -163
  48. anemoi/datasets/create/filters/single_level_specific_humidity_to_relative_humidity.py +0 -451
  49. anemoi/datasets/create/filters/speeddir_to_uv.py +0 -95
  50. anemoi/datasets/create/filters/sum.py +0 -68
  51. anemoi/datasets/create/filters/transform.py +0 -51
  52. anemoi/datasets/create/filters/unrotate_winds.py +0 -105
  53. anemoi/datasets/create/filters/uv_to_speeddir.py +0 -94
  54. anemoi/datasets/create/filters/wz_to_w.py +0 -98
  55. anemoi/datasets/create/testing.py +0 -76
  56. {anemoi_datasets-0.5.24.dist-info → anemoi_datasets-0.5.26.dist-info}/entry_points.txt +0 -0
  57. {anemoi_datasets-0.5.24.dist-info → anemoi_datasets-0.5.26.dist-info}/licenses/LICENSE +0 -0
  58. {anemoi_datasets-0.5.24.dist-info → anemoi_datasets-0.5.26.dist-info}/top_level.txt +0 -0
anemoi/datasets/data/complement.py
@@ -7,7 +7,6 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
-
 import datetime
 import logging
 from abc import abstractmethod
@@ -19,6 +18,7 @@ from typing import Optional
 from typing import Set
 from typing import Tuple
 
+import numpy as np
 from numpy.typing import NDArray
 
 from ..grids import nearest_grid_points
@@ -85,6 +85,7 @@ class Complement(Combined):
         for v in self._source.variables:
             if v not in self._target.variables:
                 self._variables.append(v)
+        LOG.info(f"The following variables will be complemented: {self._variables}")
 
         if not self._variables:
             raise ValueError("Augment: no missing variables")
@@ -96,9 +97,11 @@ class Complement(Combined):
 
     @property
     def statistics(self) -> Dict[str, NDArray[Any]]:
-        """Returns the statistics of the complemented dataset."""
-        index = [self._source.name_to_index[v] for v in self._variables]
-        return {k: v[index] for k, v in self._source.statistics.items()}
+        datasets = [self._source, self._target]
+        return {
+            k: [d.statistics[k][d.name_to_index[i]] for d in datasets for i in d.variables if i in self.variables]
+            for k in datasets[0].statistics
+        }
 
     def statistics_tendencies(self, delta: Optional[datetime.timedelta] = None) -> Dict[str, NDArray[Any]]:
         index = [self._source.name_to_index[v] for v in self._variables]
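The reworked `statistics` property gathers per-variable statistics from both the source and the target dataset instead of slicing the source only. A minimal sketch of the same dict-comprehension pattern with toy stand-in objects (variable names and values are made up for illustration):

```python
import numpy as np

# Toy stand-ins for the source and target datasets (names and values are illustrative).
class ToyDataset:
    def __init__(self, variables, statistics):
        self.variables = variables
        self.name_to_index = {v: i for i, v in enumerate(variables)}
        self.statistics = statistics

source = ToyDataset(["2t", "msl"], {"mean": np.array([280.0, 101325.0])})
target = ToyDataset(["10u"], {"mean": np.array([1.5])})

wanted = ["2t", "msl", "10u"]  # variables of the complemented dataset
datasets = [source, target]

# Same comprehension pattern as the new statistics property: pick each variable's
# statistic from whichever dataset provides it, in dataset order.
merged = {
    k: [d.statistics[k][d.name_to_index[v]] for d in datasets for v in d.variables if v in wanted]
    for k in datasets[0].statistics
}
print(merged["mean"])  # values 280.0, 101325.0, 1.5
```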
@@ -120,7 +123,11 @@ class Complement(Combined):
     @property
     def variables_metadata(self) -> Dict[str, Any]:
         """Returns the metadata of the variables to be added to the target dataset."""
-        return {k: v for k, v in self._source.variables_metadata.items() if k in self._variables}
+        # Merge the two dicts first
+        all_meta = {**self._source.variables_metadata, **self._target.variables_metadata}
+
+        # Filter to keep only desired variables
+        return {k: v for k, v in all_meta.items() if k in self._variables}
 
     def check_same_variables(self, d1: Dataset, d2: Dataset) -> None:
         """Checks if the variables in two datasets are the same.
@@ -231,7 +238,7 @@ class ComplementNone(Complement):
 class ComplementNearest(Complement):
     """A class to complement a target dataset with variables from a source dataset using nearest neighbor interpolation."""
 
-    def __init__(self, target: Any, source: Any, max_distance: float = None) -> None:
+    def __init__(self, target: Any, source: Any, max_distance: float = None, k: int = 1) -> None:
         """Initializes the ComplementNearest class.
 
         Parameters
@@ -242,17 +249,25 @@
             The source dataset.
         max_distance : float, optional
            The maximum distance for nearest neighbor interpolation, default is None.
+        k : int, optional
+            The number of k closest neighbors to consider for interpolation
         """
         super().__init__(target, source)
 
-        self._nearest_grid_points = nearest_grid_points(
+        self.k = k
+        self._distances, self._nearest_grid_points = nearest_grid_points(
             self._source.latitudes,
             self._source.longitudes,
             self._target.latitudes,
             self._target.longitudes,
             max_distance=max_distance,
+            k=k,
         )
 
+        if k == 1:
+            self._distances = np.expand_dims(self._distances, axis=1)
+            self._nearest_grid_points = np.expand_dims(self._nearest_grid_points, axis=1)
+
     def check_compatibility(self, d1: Dataset, d2: Dataset) -> None:
         """Checks the compatibility of two datasets for nearest neighbor interpolation.
 
@@ -285,7 +300,19 @@
         source_data = self._source[index[0], source_index, index[2], ...]
         target_data = source_data[..., self._nearest_grid_points]
 
-        result = target_data[..., index[3]]
+        epsilon = 1e-8  # prevent division by zero
+        weights = 1.0 / (self._distances + epsilon)
+        weights = weights.astype(target_data.dtype)
+        weights /= weights.sum(axis=1, keepdims=True)  # normalize
+
+        # Reshape weights to broadcast correctly
+        # Add leading singleton dimensions so it matches target_data shape
+        while weights.ndim < target_data.ndim:
+            weights = np.expand_dims(weights, axis=0)
+
+        # Compute weighted average along the last dimension
+        final_point = np.sum(target_data * weights, axis=-1)
+        result = final_point[..., index[3]]
 
         return apply_index_to_slices_changes(result, changes)
 
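With `k > 1`, the nearest-neighbour complement now blends the `k` closest source points using inverse-distance weights rather than copying a single point. A self-contained sketch of the same weighting arithmetic on toy arrays (shapes and values are illustrative, not taken from the package):

```python
import numpy as np

# One field sampled at the k nearest source points for each of 4 target points.
target_data = np.array(
    [
        [1.0, 2.0, 3.0],
        [4.0, 4.0, 4.0],
        [0.0, 10.0, 10.0],
        [5.0, 6.0, 7.0],
    ]
)  # shape (npoints, k)

# Distances from each target point to its k nearest source points.
distances = np.array(
    [
        [0.1, 0.2, 0.4],
        [0.0, 1.0, 2.0],
        [0.3, 0.3, 0.3],
        [0.5, 1.0, 1.5],
    ]
)  # shape (npoints, k)

epsilon = 1e-8  # prevent division by zero when a target point coincides with a source point
weights = 1.0 / (distances + epsilon)
weights /= weights.sum(axis=1, keepdims=True)  # normalise each row to sum to 1

# Weighted average over the k neighbours (last axis), one value per target point.
interpolated = np.sum(target_data * weights, axis=-1)
print(interpolated.shape)  # (4,)
```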
@@ -330,6 +357,13 @@ def complement_factory(args: Tuple, kwargs: dict) -> Dataset:
         "nearest": ComplementNearest,
     }[interpolation]
 
-    complement = Class(target=target, source=source)._subset(**kwargs)
+    if interpolation == "nearest":
+        k = kwargs.pop("k", "1")
+        complement = Class(target=target, source=source, k=k)._subset(**kwargs)
+
+    else:
+        complement = Class(target=target, source=source)._subset(**kwargs)
+
+    joined = _open_dataset([target, complement])
 
-    return _open_dataset([target, complement], reorder=source.variables)
+    return _open_dataset(joined, reorder=sorted(joined.variables))
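For callers, the factory change means the new `k` option is only honoured with nearest-neighbour interpolation, and the joined result is now reordered by sorted variable names. A hedged usage sketch (the dataset paths are hypothetical; `complement`, `source` and `interpolation` follow the pre-existing complement recipe, while `k` is the option added in this release):

```python
from anemoi.datasets import open_dataset

# Hypothetical zarr paths; 'complement', 'source' and 'interpolation' follow the
# existing complement recipe, 'k' is the option added in this release.
ds = open_dataset(
    complement="target-dataset.zarr",  # dataset missing some variables
    source="source-dataset.zarr",      # dataset providing the missing variables
    interpolation="nearest",
    k=4,                               # blend the 4 nearest source grid points
)
```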
anemoi/datasets/data/dataset.py
@@ -179,6 +179,19 @@ class Dataset(ABC, Sized):
         if "start" in kwargs or "end" in kwargs:
             start = kwargs.pop("start", None)
             end = kwargs.pop("end", None)
+            padding = kwargs.pop("padding", None)
+
+            if padding:
+                if padding != "empty":
+                    raise ValueError(f"Only 'empty' padding is supported, got {padding=}")
+                from .padded import Padded
+
+                frequency = kwargs.pop("frequency", self.frequency)
+                return (
+                    Padded(self, start, end, frequency, dict(start=start, end=end, frequency=frequency))
+                    ._subset(**kwargs)
+                    .mutate()
+                )
 
             from .subset import Subset
 
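The new `padding` option routes `start`/`end` requests through the new `Padded` wrapper; any value other than `"empty"` raises a `ValueError`. A hedged usage sketch (the dataset path and date range are hypothetical, and the padding behaviour is inferred from the new `padded.py` module):

```python
from anemoi.datasets import open_dataset

# Hypothetical dataset covering 2015-2020; start/end reach outside that range and,
# presumably, the missing dates are padded with empty items instead of raising.
ds = open_dataset(
    "dataset-2015-2020.zarr",
    start=2014,
    end=2021,
    frequency="6h",
    padding="empty",
)
```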
@@ -724,6 +737,9 @@
         """Return the grid shape of the dataset."""
         return (self.shape[-1],)
 
+    def empty_item(self) -> NDArray[Any]:
+        return np.zeros((*self.shape[1:-1], 0), dtype=self.dtype)
+
     def _check(self) -> None:
         """Check for overridden private methods in the dataset."""
         common = Dataset.__dict__.keys() & self.__class__.__dict__.keys()
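The new `empty_item` helper returns an array shaped like a normal item but with zero grid points, which a padded dataset can presumably hand back for dates outside the stored range. A toy illustration of the shape arithmetic (the shape values are made up):

```python
import numpy as np

# Suppose the full dataset shape is (dates, variables, ensemble, grid)
# and a single item therefore has shape (variables, ensemble, grid).
shape = (1000, 8, 1, 40320)  # made-up values
dtype = np.float32

# Same expression as empty_item: keep the item's leading dimensions, drop the grid.
empty = np.zeros((*shape[1:-1], 0), dtype=dtype)
print(empty.shape)  # (8, 1, 0)
```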
@@ -1075,3 +1091,16 @@
             The dataset names.
         """
         pass
+
+    def get_latitudes(self, i):
+        return self.get_aux(i)[0]
+
+    def get_longitudes(self, i):
+        return self.get_aux(i)[1]
+
+    def get_timedeltas(self, i):
+        return self.get_aux(i)[2]
+
+    def get_aux(self, i):
+        # need to decide if Fields datasets need to implement this
+        raise NotImplementedError(f"get_aux is not implemented for this dataset, {type(self)}")
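The new `get_latitudes`, `get_longitudes` and `get_timedeltas` accessors are thin wrappers around a single `get_aux(i)` hook that the base class leaves unimplemented. A hypothetical subclass sketch showing the contract implied by the wrappers (only the tuple ordering is pinned down by the diff; the class and values below are invented):

```python
import numpy as np

class ToyObservations:
    """Invented dataset-like class illustrating the get_aux contract."""

    def get_aux(self, i):
        # Per-item auxiliary arrays, in the order the new wrappers expect:
        # (latitudes, longitudes, timedeltas).
        latitudes = np.array([50.0, 51.0, 52.0])
        longitudes = np.array([4.0, 5.0, 6.0])
        timedeltas = np.array([0, 3600, 7200], dtype="timedelta64[s]")
        return latitudes, longitudes, timedeltas

    # Same thin wrappers as added to the base Dataset class.
    def get_latitudes(self, i):
        return self.get_aux(i)[0]

    def get_longitudes(self, i):
        return self.get_aux(i)[1]

    def get_timedeltas(self, i):
        return self.get_aux(i)[2]

obs = ToyObservations()
print(obs.get_latitudes(0))  # [50. 51. 52.]
```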
anemoi/datasets/data/forwards.py
@@ -330,8 +330,14 @@ class Combined(Forwards):
         ValueError
             If the grids are not the same.
         """
-        if (d1.latitudes != d2.latitudes).any() or (d1.longitudes != d2.longitudes).any():
-            raise ValueError(f"Incompatible grid ({d1} {d2})")
+
+        # note: not a proper implementation, should be handled
+        # in a more consolidated way ...
+        rtol = 1.0e-7
+        if not np.allclose(d1.latitudes, d2.latitudes, rtol=rtol) or not np.allclose(
+            d1.longitudes, d2.longitudes, rtol=rtol
+        ):
+            raise ValueError(f"Incompatible grid ({d1.longitudes} {d2.longitudes})")
 
     def check_same_shape(self, d1: Dataset, d2: Dataset) -> None:
         """Checks if the shapes of two datasets are the same.
anemoi/datasets/data/misc.py
@@ -11,6 +11,7 @@
 import calendar
 import datetime
 import logging
+import os
 from pathlib import PurePath
 from typing import TYPE_CHECKING
 from typing import Any
@@ -22,6 +23,7 @@ from typing import Union
 
 import numpy as np
 import zarr
+from anemoi.utils.config import load_any_dict_format
 from anemoi.utils.config import load_config as load_settings
 from numpy.typing import NDArray
 
@@ -108,7 +110,10 @@ def round_datetime(d: np.datetime64, dates: NDArray[np.datetime64], up: bool) ->
 
 
 def _as_date(
-    d: Union[int, str, np.datetime64, datetime.date], dates: NDArray[np.datetime64], last: bool
+    d: Union[int, str, np.datetime64, datetime.date],
+    dates: NDArray[np.datetime64],
+    last: bool,
+    frequency: Optional[datetime.timedelta] = None,
 ) -> np.datetime64:
     """Convert a date to a numpy datetime64 object, rounding to the nearest date in a list of dates.
 
@@ -120,6 +125,8 @@
         The list of dates.
     last : bool
         Whether to round to the last date.
+    frequency : Optional[datetime.timedelta]
+        The frequency of the dataset.
 
     Returns
     -------
@@ -142,30 +149,49 @@
         pass
 
     if isinstance(d, int):
+        delta = frequency
+        if delta is None:
+            delta = np.timedelta64(1, "s")
+        delta = np.timedelta64(delta, "s")
+
         if len(str(d)) == 4:
             year = d
             if last:
-                return _as_date(np.datetime64(f"{year:04}-12-31T23:59:59"), dates, last)
+                year = year + 1
+                npdate = np.datetime64(f"{year:04}-01-01T00:00:00")
+                return _as_date(npdate - delta, dates, last, frequency)
             else:
-                return _as_date(np.datetime64(f"{year:04}-01-01T00:00:00"), dates, last)
+                return _as_date(np.datetime64(f"{year:04}-01-01T00:00:00"), dates, last, frequency)
 
         if len(str(d)) == 6:
             year = d // 100
             month = d % 100
             if last:
-                _, last_day = calendar.monthrange(year, month)
-                return _as_date(np.datetime64(f"{year:04}-{month:02}-{last_day:02}T23:59:59"), dates, last)
+                month = month + 1
+                if month > 12:
+                    month = 1
+                    year += 1
+                npdate = np.datetime64(f"{year:04}-{month:02}-01T00:00:00")
+                return _as_date(npdate - delta, dates, last, frequency)
             else:
-                return _as_date(np.datetime64(f"{year:04}-{month:02}-01T00:00:00"), dates, last)
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-01T00:00:00"), dates, last, frequency)
 
         if len(str(d)) == 8:
             year = d // 10000
             month = (d % 10000) // 100
             day = d % 100
             if last:
-                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T23:59:59"), dates, last)
+                day = day + 1
+                if day > calendar.monthrange(year, month)[1]:
+                    day = 1
+                    month += 1
+                    if month > 12:
+                        month = 1
+                        year += 1
+                npdate = np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00")
+                return _as_date(npdate - delta, dates, last, frequency)
             else:
-                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00"), dates, last)
+                return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T00:00:00"), dates, last, frequency)
 
     if isinstance(d, str):
 
@@ -201,19 +227,20 @@
                 np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}"),
                 dates,
                 last,
+                frequency,
             )
 
         if "-" in d:
             assert ":" not in d
             bits = d.split("-")
             if len(bits) == 1:
-                return _as_date(int(bits[0]), dates, last)
+                return _as_date(int(bits[0]), dates, last, frequency)
 
             if len(bits) == 2:
-                return _as_date(int(bits[0]) * 100 + int(bits[1]), dates, last)
+                return _as_date(int(bits[0]) * 100 + int(bits[1]), dates, last, frequency)
 
             if len(bits) == 3:
-                return _as_date(int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]), dates, last)
+                return _as_date(int(bits[0]) * 10000 + int(bits[1]) * 100 + int(bits[2]), dates, last, frequency)
 
         if ":" in d:
             assert len(d) == 5
@@ -225,12 +252,16 @@
             month = first.month
             day = first.day
 
-            return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00"), dates, last)
+            return _as_date(np.datetime64(f"{year:04}-{month:02}-{day:02}T{hour}:00:00"), dates, last, frequency)
 
     raise NotImplementedError(f"Unsupported date: {d} ({type(d)})")
 
 
-def as_first_date(d: Union[int, str, np.datetime64, datetime.date], dates: NDArray[np.datetime64]) -> np.datetime64:
+def as_first_date(
+    d: Union[int, str, np.datetime64, datetime.date],
+    dates: NDArray[np.datetime64],
+    frequency: Optional[datetime.timedelta] = None,
+) -> np.datetime64:
     """Convert a date to the first date in a list of dates.
 
     Parameters
@@ -239,16 +270,22 @@ def as_first_date(d: Union[int, str, np.datetime64, datetime.date], dates: NDArr
         The date to convert.
     dates : NDArray[np.datetime64]
         The list of dates.
+    frequency : Optional[datetime.timedelta]
+        The frequency of the dataset.
 
     Returns
     -------
     np.datetime64
         The first date.
     """
-    return _as_date(d, dates, last=False)
+    return _as_date(d, dates, last=False, frequency=frequency)
 
 
-def as_last_date(d: Union[int, str, np.datetime64, datetime.date], dates: NDArray[np.datetime64]) -> np.datetime64:
+def as_last_date(
+    d: Union[int, str, np.datetime64, datetime.date],
+    dates: NDArray[np.datetime64],
+    frequency: Optional[datetime.timedelta] = None,
+) -> np.datetime64:
     """Convert a date to the last date in a list of dates.
 
     Parameters
@@ -257,13 +294,15 @@ def as_last_date(d: Union[int, str, np.datetime64, datetime.date], dates: NDArra
         The date to convert.
     dates : NDArray[np.datetime64]
         The list of dates.
+    frequency : Optional[datetime.timedelta]
+        The frequency of the dataset.
 
     Returns
     -------
     np.datetime64
         The last date.
     """
-    return _as_date(d, dates, last=True)
+    return _as_date(d, dates, last=True, frequency=frequency)
 
 
 def _concat_or_join(datasets: List["Dataset"], kwargs: Dict[str, Any]) -> Tuple["Dataset", Dict[str, Any]]:
@@ -317,6 +356,18 @@ def _open(a: Union[str, PurePath, Dict[str, Any], List[Any], Tuple[Any, ...]]) -
     from .stores import Zarr
     from .stores import zarr_lookup
 
+    if isinstance(a, str) and len(a.split(".")) in [2, 3]:
+
+        metadata_path = os.path.join(a, "metadata.json")
+        if os.path.exists(metadata_path):
+            metadata = load_any_dict_format(metadata_path)
+            if "backend" not in metadata:
+                raise ValueError(f"Metadata for {a} does not contain 'backend' key")
+
+            from anemoi.datasets.data.records import open_records_dataset
+
+            return open_records_dataset(a, backend=metadata["backend"])
+
     if isinstance(a, Dataset):
         return a.mutate()
 
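`_open` now treats a string with two or three dot-separated parts as a potential records dataset: if it is a directory containing a `metadata.json` with a `backend` entry, it is routed to the new records loader. A stand-alone sketch of that detection logic (the directory name and metadata content are invented; `open_records_dataset` is the package's own entry point and is not reimplemented here):

```python
import json
import os

def detect_records_backend(a):
    """Simplified mirror of the new detection logic in _open."""
    if not (isinstance(a, str) and len(a.split(".")) in [2, 3]):
        return None
    metadata_path = os.path.join(a, "metadata.json")
    if not os.path.exists(metadata_path):
        return None
    with open(metadata_path) as f:
        metadata = json.load(f)  # the package itself uses load_any_dict_format here
    if "backend" not in metadata:
        raise ValueError(f"Metadata for {a} does not contain 'backend' key")
    return metadata["backend"]

# Hypothetical on-disk layout:
#   my-observations.v1/
#       metadata.json   -> {"backend": "..."}
backend = detect_records_backend("my-observations.v1")
print(backend)  # None unless the directory actually exists
```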
@@ -454,6 +505,13 @@ def _open_dataset(*args: Any, **kwargs: Any) -> "Dataset":
     for a in args:
         sets.append(_open(a))
 
+    if "observations" in kwargs:
+        from .observations import observations_factory
+
+        assert not sets, sets
+
+        return observations_factory(args, kwargs).mutate()
+
     if "xy" in kwargs:
         # Experimental feature, may be removed
         from .xy import xy_factory