anemoi-datasets 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/compare.py +59 -0
  3. anemoi/datasets/commands/create.py +84 -3
  4. anemoi/datasets/commands/inspect.py +3 -3
  5. anemoi/datasets/create/__init__.py +44 -17
  6. anemoi/datasets/create/check.py +6 -5
  7. anemoi/datasets/create/chunks.py +1 -1
  8. anemoi/datasets/create/config.py +5 -26
  9. anemoi/datasets/create/functions/filters/rename.py +9 -1
  10. anemoi/datasets/create/functions/filters/rotate_winds.py +10 -1
  11. anemoi/datasets/create/functions/sources/__init__.py +39 -0
  12. anemoi/datasets/create/functions/sources/accumulations.py +11 -41
  13. anemoi/datasets/create/functions/sources/constants.py +3 -0
  14. anemoi/datasets/create/functions/sources/grib.py +4 -0
  15. anemoi/datasets/create/functions/sources/hindcasts.py +32 -377
  16. anemoi/datasets/create/functions/sources/mars.py +53 -22
  17. anemoi/datasets/create/functions/sources/netcdf.py +2 -60
  18. anemoi/datasets/create/functions/sources/opendap.py +3 -2
  19. anemoi/datasets/create/functions/sources/xarray/__init__.py +73 -0
  20. anemoi/datasets/create/functions/sources/xarray/coordinates.py +234 -0
  21. anemoi/datasets/create/functions/sources/xarray/field.py +109 -0
  22. anemoi/datasets/create/functions/sources/xarray/fieldlist.py +171 -0
  23. anemoi/datasets/create/functions/sources/xarray/flavour.py +330 -0
  24. anemoi/datasets/create/functions/sources/xarray/grid.py +46 -0
  25. anemoi/datasets/create/functions/sources/xarray/metadata.py +161 -0
  26. anemoi/datasets/create/functions/sources/xarray/time.py +98 -0
  27. anemoi/datasets/create/functions/sources/xarray/variable.py +198 -0
  28. anemoi/datasets/create/functions/sources/xarray_kerchunk.py +42 -0
  29. anemoi/datasets/create/functions/sources/xarray_zarr.py +15 -0
  30. anemoi/datasets/create/functions/sources/zenodo.py +40 -0
  31. anemoi/datasets/create/input.py +290 -172
  32. anemoi/datasets/create/loaders.py +120 -71
  33. anemoi/datasets/create/patch.py +17 -14
  34. anemoi/datasets/create/persistent.py +1 -1
  35. anemoi/datasets/create/size.py +4 -5
  36. anemoi/datasets/create/statistics/__init__.py +49 -16
  37. anemoi/datasets/create/template.py +11 -61
  38. anemoi/datasets/create/trace.py +91 -0
  39. anemoi/datasets/create/utils.py +0 -48
  40. anemoi/datasets/create/zarr.py +24 -10
  41. anemoi/datasets/data/misc.py +9 -37
  42. anemoi/datasets/data/stores.py +29 -14
  43. anemoi/datasets/dates/__init__.py +7 -1
  44. anemoi/datasets/dates/groups.py +3 -0
  45. {anemoi_datasets-0.4.0.dist-info → anemoi_datasets-0.4.2.dist-info}/METADATA +18 -3
  46. anemoi_datasets-0.4.2.dist-info/RECORD +86 -0
  47. {anemoi_datasets-0.4.0.dist-info → anemoi_datasets-0.4.2.dist-info}/WHEEL +1 -1
  48. anemoi_datasets-0.4.0.dist-info/RECORD +0 -73
  49. {anemoi_datasets-0.4.0.dist-info → anemoi_datasets-0.4.2.dist-info}/LICENSE +0 -0
  50. {anemoi_datasets-0.4.0.dist-info → anemoi_datasets-0.4.2.dist-info}/entry_points.txt +0 -0
  51. {anemoi_datasets-0.4.0.dist-info → anemoi_datasets-0.4.2.dist-info}/top_level.txt +0 -0
@@ -24,7 +24,7 @@ from .mars import use_grib_paramid
24
24
  LOG = logging.getLogger(__name__)
25
25
 
26
26
 
27
- def member(field):
27
+ def _member(field):
28
28
  # Bug in eccodes has number=0 randomly
29
29
  number = field.metadata("number", default=0)
30
30
  if number is None:
@@ -68,7 +68,7 @@ class Accumulation:
68
68
  self.time,
69
69
  field.metadata("time"),
70
70
  )
71
- assert self.number == member(field), (self.number, member(field))
71
+ assert self.number == _member(field), (self.number, _member(field))
72
72
 
73
73
  return
74
74
 
@@ -241,17 +241,17 @@ class AccumulationFromLastStep(Accumulation):
241
241
  )
242
242
 
243
243
 
244
- def identity(x):
244
+ def _identity(x):
245
245
  return x
246
246
 
247
247
 
248
- def compute_accumulations(
248
+ def _compute_accumulations(
249
249
  context,
250
250
  dates,
251
251
  request,
252
252
  user_accumulation_period=6,
253
253
  data_accumulation_period=None,
254
- patch=identity,
254
+ patch=_identity,
255
255
  base_times=None,
256
256
  ):
257
257
  adjust_step = isinstance(user_accumulation_period, int)
@@ -340,7 +340,7 @@ def compute_accumulations(
340
340
  field.metadata("date"),
341
341
  field.metadata("time"),
342
342
  field.metadata("step"),
343
- member(field),
343
+ _member(field),
344
344
  )
345
345
  values = field.values # optimisation
346
346
  assert accumulations[key], key
@@ -365,43 +365,13 @@ def compute_accumulations(
365
365
  return ds
366
366
 
367
367
 
368
- def to_list(x):
368
+ def _to_list(x):
369
369
  if isinstance(x, (list, tuple)):
370
370
  return x
371
371
  return [x]
372
372
 
373
373
 
374
- def normalise_time_to_hours(r):
375
- r = deepcopy(r)
376
- if "time" not in r:
377
- return r
378
-
379
- times = []
380
- for t in to_list(r["time"]):
381
- assert len(t) == 4, r
382
- assert t.endswith("00"), r
383
- times.append(int(t) // 100)
384
- r["time"] = tuple(times)
385
- return r
386
-
387
-
388
- def normalise_number(r):
389
- if "number" not in r:
390
- return r
391
- number = r["number"]
392
- number = to_list(number)
393
-
394
- if len(number) > 4 and (number[1] == "to" and number[3] == "by"):
395
- return list(range(int(number[0]), int(number[2]) + 1, int(number[4])))
396
-
397
- if len(number) > 2 and number[1] == "to":
398
- return list(range(int(number[0]), int(number[2]) + 1))
399
-
400
- r["number"] = number
401
- return r
402
-
403
-
404
- def scda(request):
374
+ def _scda(request):
405
375
  if request["time"] in (6, 18, 600, 1800):
406
376
  request["stream"] = "scda"
407
377
  else:
@@ -410,14 +380,14 @@ def scda(request):
410
380
 
411
381
 
412
382
  def accumulations(context, dates, **request):
413
- to_list(request["param"])
383
+ _to_list(request["param"])
414
384
  class_ = request.get("class", "od")
415
385
  stream = request.get("stream", "oper")
416
386
 
417
387
  user_accumulation_period = request.pop("accumulation_period", 6)
418
388
 
419
389
  KWARGS = {
420
- ("od", "oper"): dict(patch=scda),
390
+ ("od", "oper"): dict(patch=_scda),
421
391
  ("od", "elda"): dict(base_times=(6, 18)),
422
392
  ("ea", "oper"): dict(data_accumulation_period=1, base_times=(6, 18)),
423
393
  ("ea", "enda"): dict(data_accumulation_period=3, base_times=(6, 18)),
@@ -427,7 +397,7 @@ def accumulations(context, dates, **request):
427
397
 
428
398
  context.trace("🌧️", f"accumulations {request} {user_accumulation_period} {kwargs}")
429
399
 
430
- return compute_accumulations(
400
+ return _compute_accumulations(
431
401
  context,
432
402
  dates,
433
403
  request,
@@ -18,6 +18,9 @@ def constants(context, dates, template, param):
18
18
  stacklevel=2,
19
19
  )
20
20
  context.trace("✅", f"from_source(constants, {template}, {param}")
21
+ if len(template) == 0:
22
+ raise ValueError("Forcings template is empty.")
23
+
21
24
  return from_source("forcings", source_or_dataset=template, date=dates, param=param)
22
25
 
23
26
 
@@ -26,8 +26,12 @@ def check(ds, paths, **kwargs):
26
26
 
27
27
  def _expand(paths):
28
28
  for path in paths:
29
+ cnt = 0
29
30
  for p in glob.glob(path):
30
31
  yield p
32
+ cnt += 1
33
+ if cnt == 0:
34
+ yield path
31
35
 
32
36
 
33
37
  def execute(context, dates, path, *args, **kwargs):
@@ -7,21 +7,13 @@
7
7
  # nor does it submit to any jurisdiction.
8
8
  #
9
9
  import datetime
10
- import warnings
11
- from copy import deepcopy
12
-
13
- import earthkit.data as ekd
14
- import numpy as np
15
- from earthkit.data.core.temporary import temp_file
16
- from earthkit.data.readers.grib.output import new_grib_output
17
- from earthkit.utils.availability import Availability
18
10
 
19
11
  from anemoi.datasets.create.functions.sources.mars import mars
20
12
 
21
13
  DEBUG = True
22
14
 
23
15
 
24
- def member(field):
16
+ def _member(field):
25
17
  # Bug in eccodes has number=0 randomly
26
18
  number = field.metadata("number")
27
19
  if number is None:
@@ -29,368 +21,12 @@ def member(field):
29
21
  return number
30
22
 
31
23
 
32
- class Accumulation:
33
- def __init__(self, out, /, param, date, time, number, step, frequency, **kwargs):
34
- self.out = out
35
- self.param = param
36
- self.date = date
37
- self.time = time
38
- self.steps = step
39
- self.number = number
40
- self.values = None
41
- self.seen = set()
42
- self.startStep = None
43
- self.endStep = None
44
- self.done = False
45
- self.frequency = frequency
46
- self._check = None
47
-
48
- @property
49
- def key(self):
50
- return (self.param, self.date, self.time, self.steps, self.number)
51
-
52
- def check(self, field):
53
- if self._check is None:
54
- self._check = field.as_mars()
55
-
56
- assert self.param == field.metadata("param"), (
57
- self.param,
58
- field.metadata("param"),
59
- )
60
- assert self.date == field.metadata("date"), (
61
- self.date,
62
- field.metadata("date"),
63
- )
64
- assert self.time == field.metadata("time"), (
65
- self.time,
66
- field.metadata("time"),
67
- )
68
- assert self.number == member(field), (self.number, member(field))
69
-
70
- return
71
-
72
- mars = field.as_mars()
73
- keys1 = sorted(self._check.keys())
74
- keys2 = sorted(mars.keys())
75
-
76
- assert keys1 == keys2, (keys1, keys2)
77
-
78
- for k in keys1:
79
- if k not in ("step",):
80
- assert self._check[k] == mars[k], (k, self._check[k], mars[k])
81
-
82
- def write(self, template):
83
-
84
- assert self.startStep != self.endStep, (self.startStep, self.endStep)
85
- assert np.all(self.values >= 0), (np.amin(self.values), np.amax(self.values))
86
-
87
- self.out.write(
88
- self.values,
89
- template=template,
90
- stepType="accum",
91
- startStep=self.startStep,
92
- endStep=self.endStep,
93
- )
94
- self.values = None
95
- self.done = True
96
-
97
- def add(self, field, values):
98
-
99
- self.check(field)
100
-
101
- step = field.metadata("step")
102
- if step not in self.steps:
103
- return
104
-
105
- if not np.all(values >= 0):
106
- warnings.warn(f"Negative values for {field}: {np.amin(values)} {np.amax(values)}")
107
-
108
- assert not self.done, (self.key, step)
109
- assert step not in self.seen, (self.key, step)
110
-
111
- startStep = field.metadata("startStep")
112
- endStep = field.metadata("endStep")
113
-
114
- if self.buggy_steps and startStep == endStep:
115
- startStep = 0
116
-
117
- assert step == endStep, (startStep, endStep, step)
118
-
119
- self.compute(values, startStep, endStep)
120
-
121
- self.seen.add(step)
122
-
123
- if len(self.seen) == len(self.steps):
124
- self.write(template=field)
125
-
126
- @classmethod
127
- def mars_date_time_steps(cls, dates, step1, step2, frequency, base_times, adjust_step):
128
-
129
- # assert step1 > 0, (step1, step2, frequency)
130
-
131
- for valid_date in dates:
132
- base_date = valid_date - datetime.timedelta(hours=step2)
133
- add_step = 0
134
- if base_date.hour not in base_times:
135
- if not adjust_step:
136
- raise ValueError(
137
- f"Cannot find a base time in {base_times} that validates on {valid_date.isoformat()} for step={step2}"
138
- )
139
-
140
- while base_date.hour not in base_times:
141
- # print(f'{base_date=}, {base_times=}, {add_step=} {frequency=}')
142
- base_date -= datetime.timedelta(hours=1)
143
- add_step += 1
144
-
145
- yield cls._mars_date_time_step(base_date, step1, step2, add_step, frequency)
146
-
147
- def __repr__(self) -> str:
148
- return f"{self.__class__.__name__}({self.key})"
149
-
150
-
151
- class AccumulationFromStart(Accumulation):
152
- buggy_steps = True
153
-
154
- def compute(self, values, startStep, endStep):
155
-
156
- assert startStep == 0, startStep
157
-
158
- if self.values is None:
159
-
160
- self.values = np.copy(values)
161
- self.startStep = 0
162
- self.endStep = endStep
163
-
164
- else:
165
- assert endStep != self.endStep, (self.endStep, endStep)
166
-
167
- if endStep > self.endStep:
168
- # assert endStep - self.endStep == self.stepping, (self.endStep, endStep, self.stepping)
169
- self.values = values - self.values
170
- self.startStep = self.endStep
171
- self.endStep = endStep
172
- else:
173
- # assert self.endStep - endStep == self.stepping, (self.endStep, endStep, self.stepping)
174
- self.values = self.values - values
175
- self.startStep = endStep
176
-
177
- if not np.all(self.values >= 0):
178
- warnings.warn(f"Negative values for {self.param}: {np.amin(self.values)} {np.amax(self.values)}")
179
- self.values = np.maximum(self.values, 0)
180
-
181
- @classmethod
182
- def _mars_date_time_step(cls, base_date, step1, step2, add_step, frequency):
183
- assert not frequency, frequency
184
-
185
- steps = (step1 + add_step, step2 + add_step)
186
- if steps[0] == 0:
187
- steps = (steps[1],)
188
-
189
- return (
190
- base_date.year * 10000 + base_date.month * 100 + base_date.day,
191
- base_date.hour * 100 + base_date.minute,
192
- steps,
193
- )
194
-
195
-
196
- class AccumulationFromLastStep(Accumulation):
197
- buggy_steps = False
198
-
199
- def compute(self, values, startStep, endStep):
200
-
201
- assert endStep - startStep == self.frequency, (
202
- startStep,
203
- endStep,
204
- self.frequency,
205
- )
206
-
207
- if self.startStep is None:
208
- self.startStep = startStep
209
- else:
210
- self.startStep = min(self.startStep, startStep)
211
-
212
- if self.endStep is None:
213
- self.endStep = endStep
214
- else:
215
- self.endStep = max(self.endStep, endStep)
216
-
217
- if self.values is None:
218
- self.values = np.zeros_like(values)
219
-
220
- self.values += values
221
-
222
- @classmethod
223
- def _mars_date_time_step(cls, base_date, step1, step2, add_step, frequency):
224
- assert frequency > 0, frequency
225
- # assert step1 > 0, (step1, step2, frequency, add_step, base_date)
226
-
227
- steps = []
228
- for step in range(step1 + frequency, step2 + frequency, frequency):
229
- steps.append(step + add_step)
230
- return (
231
- base_date.year * 10000 + base_date.month * 100 + base_date.day,
232
- base_date.hour * 100 + base_date.minute,
233
- tuple(steps),
234
- )
235
-
236
-
237
- def identity(x):
238
- return x
239
-
240
-
241
- def compute_accumulations(
242
- dates,
243
- request,
244
- user_accumulation_period=6,
245
- data_accumulation_period=None,
246
- patch=identity,
247
- base_times=None,
248
- ):
249
- adjust_step = isinstance(user_accumulation_period, int)
250
-
251
- if not isinstance(user_accumulation_period, (list, tuple)):
252
- user_accumulation_period = (0, user_accumulation_period)
253
-
254
- assert len(user_accumulation_period) == 2, user_accumulation_period
255
- step1, step2 = user_accumulation_period
256
- assert step1 < step2, user_accumulation_period
257
-
258
- if base_times is None:
259
- base_times = [0, 6, 12, 18]
260
-
261
- base_times = [t // 100 if t > 100 else t for t in base_times]
262
-
263
- AccumulationClass = AccumulationFromStart if data_accumulation_period in (0, None) else AccumulationFromLastStep
264
-
265
- mars_date_time_steps = AccumulationClass.mars_date_time_steps(
266
- dates,
267
- step1,
268
- step2,
269
- data_accumulation_period,
270
- base_times,
271
- adjust_step,
272
- )
273
-
274
- request = deepcopy(request)
275
-
276
- param = request["param"]
277
- if not isinstance(param, (list, tuple)):
278
- param = [param]
279
-
280
- number = request.get("number", [0])
281
- assert isinstance(number, (list, tuple))
282
-
283
- frequency = data_accumulation_period
284
-
285
- type_ = request.get("type", "an")
286
- if type_ == "an":
287
- type_ = "fc"
288
-
289
- request.update({"type": type_, "levtype": "sfc"})
290
-
291
- tmp = temp_file()
292
- path = tmp.path
293
- out = new_grib_output(path)
294
-
295
- requests = []
296
-
297
- accumulations = {}
298
-
299
- for date, time, steps in mars_date_time_steps:
300
- for p in param:
301
- for n in number:
302
- requests.append(
303
- patch(
304
- {
305
- "param": p,
306
- "date": date,
307
- "time": time,
308
- "step": sorted(steps),
309
- "number": n,
310
- }
311
- )
312
- )
313
-
314
- compressed = Availability(requests)
315
- ds = ekd.from_source("empty")
316
- for r in compressed.iterate():
317
- request.update(r)
318
- print("🌧️", request)
319
- ds = ds + ekd.from_source("mars", **request)
320
-
321
- accumulations = {}
322
- for a in [AccumulationClass(out, frequency=frequency, **r) for r in requests]:
323
- for s in a.steps:
324
- key = (a.param, a.date, a.time, s, a.number)
325
- accumulations.setdefault(key, []).append(a)
326
-
327
- for field in ds:
328
- key = (
329
- field.metadata("param"),
330
- field.metadata("date"),
331
- field.metadata("time"),
332
- field.metadata("step"),
333
- member(field),
334
- )
335
- values = field.values # optimisation
336
- assert accumulations[key], key
337
- for a in accumulations[key]:
338
- a.add(field, values)
339
-
340
- for acc in accumulations.values():
341
- for a in acc:
342
- assert a.done, (a.key, a.seen, a.steps)
343
-
344
- out.close()
345
-
346
- ds = ekd.from_source("file", path)
347
-
348
- assert len(ds) / len(param) / len(number) == len(dates), (
349
- len(ds),
350
- len(param),
351
- len(dates),
352
- )
353
- ds._tmp = tmp
354
-
355
- return ds
356
-
357
-
358
- def to_list(x):
24
+ def _to_list(x):
359
25
  if isinstance(x, (list, tuple)):
360
26
  return x
361
27
  return [x]
362
28
 
363
29
 
364
- def normalise_time_to_hours(r):
365
- r = deepcopy(r)
366
- if "time" not in r:
367
- return r
368
-
369
- times = []
370
- for t in to_list(r["time"]):
371
- assert len(t) == 4, r
372
- assert t.endswith("00"), r
373
- times.append(int(t) // 100)
374
- r["time"] = tuple(times)
375
- return r
376
-
377
-
378
- def normalise_number(r):
379
- if "number" not in r:
380
- return r
381
- number = r["number"]
382
- number = to_list(number)
383
-
384
- if len(number) > 4 and (number[1] == "to" and number[3] == "by"):
385
- return list(range(int(number[0]), int(number[2]) + 1, int(number[4])))
386
-
387
- if len(number) > 2 and number[1] == "to":
388
- return list(range(int(number[0]), int(number[2]) + 1))
389
-
390
- r["number"] = number
391
- return r
392
-
393
-
394
30
  class HindcastCompute:
395
31
  def __init__(self, base_times, available_steps, request):
396
32
  self.base_times = base_times
@@ -398,22 +34,34 @@ class HindcastCompute:
398
34
  self.request = request
399
35
 
400
36
  def compute_hindcast(self, date):
401
- for step in self.available_steps:
37
+ result = []
38
+ for step in sorted(self.available_steps): # Use the shortest step
402
39
  start_date = date - datetime.timedelta(hours=step)
403
40
  hours = start_date.hour
404
41
  if hours in self.base_times:
405
- r = deepcopy(self.request)
42
+ r = self.request.copy()
406
43
  r["date"] = start_date
407
44
  r["time"] = f"{start_date.hour:02d}00"
408
45
  r["step"] = step
409
- return r
410
- raise ValueError(
411
- f"Cannot find data for {self.request} for {date} (base_times={self.base_times}, available_steps={self.available_steps})"
412
- )
46
+ result.append(r)
47
+
48
+ if not result:
49
+ raise ValueError(
50
+ f"Cannot find data for {self.request} for {date} (base_times={self.base_times}, "
51
+ f"available_steps={self.available_steps})"
52
+ )
53
+
54
+ if len(result) > 1:
55
+ raise ValueError(
56
+ f"Multiple requests for {self.request} for {date} (base_times={self.base_times}, "
57
+ f"available_steps={self.available_steps})"
58
+ )
59
+
60
+ return result[0]
413
61
 
414
62
 
415
63
  def use_reference_year(reference_year, request):
416
- request = deepcopy(request)
64
+ request = request.copy()
417
65
  hdate = request.pop("date")
418
66
  date = datetime.datetime(reference_year, hdate.month, hdate.day)
419
67
  request.update(date=date.strftime("%Y-%m-%d"), hdate=hdate.strftime("%Y-%m-%d"))
@@ -421,15 +69,15 @@ def use_reference_year(reference_year, request):
421
69
 
422
70
 
423
71
  def hindcasts(context, dates, **request):
424
- request["param"] = to_list(request["param"])
425
- request["step"] = to_list(request["step"])
72
+ request["param"] = _to_list(request["param"])
73
+ request["step"] = _to_list(request["step"])
426
74
  request["step"] = [int(_) for _ in request["step"]]
427
75
 
428
76
  if request.get("stream") == "enfh" and "base_times" not in request:
429
77
  request["base_times"] = [0]
430
78
 
431
79
  available_steps = request.pop("step")
432
- available_steps = to_list(available_steps)
80
+ available_steps = _to_list(available_steps)
433
81
 
434
82
  base_times = request.pop("base_times")
435
83
 
@@ -444,7 +92,14 @@ def hindcasts(context, dates, **request):
444
92
  req = use_reference_year(reference_year, req)
445
93
 
446
94
  requests.append(req)
447
- return mars(context, dates, *requests, date_key="hdate")
95
+
96
+ return mars(
97
+ context,
98
+ dates,
99
+ *requests,
100
+ date_key="hdate",
101
+ request_already_using_valid_datetime=True,
102
+ )
448
103
 
449
104
 
450
105
  execute = hindcasts