anemoi-datasets 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (52)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/cleanup.py +44 -0
  3. anemoi/datasets/commands/create.py +50 -20
  4. anemoi/datasets/commands/finalise-additions.py +45 -0
  5. anemoi/datasets/commands/finalise.py +39 -0
  6. anemoi/datasets/commands/init-additions.py +45 -0
  7. anemoi/datasets/commands/init.py +67 -0
  8. anemoi/datasets/commands/inspect.py +1 -1
  9. anemoi/datasets/commands/load-additions.py +47 -0
  10. anemoi/datasets/commands/load.py +47 -0
  11. anemoi/datasets/commands/patch.py +39 -0
  12. anemoi/datasets/compute/recentre.py +1 -1
  13. anemoi/datasets/create/__init__.py +961 -146
  14. anemoi/datasets/create/check.py +5 -3
  15. anemoi/datasets/create/config.py +53 -2
  16. anemoi/datasets/create/functions/sources/accumulations.py +6 -22
  17. anemoi/datasets/create/functions/sources/hindcasts.py +27 -12
  18. anemoi/datasets/create/functions/sources/tendencies.py +1 -1
  19. anemoi/datasets/create/functions/sources/xarray/__init__.py +12 -2
  20. anemoi/datasets/create/functions/sources/xarray/coordinates.py +7 -0
  21. anemoi/datasets/create/functions/sources/xarray/field.py +1 -1
  22. anemoi/datasets/create/functions/sources/xarray/fieldlist.py +0 -2
  23. anemoi/datasets/create/functions/sources/xarray/flavour.py +21 -1
  24. anemoi/datasets/create/functions/sources/xarray/metadata.py +27 -29
  25. anemoi/datasets/create/functions/sources/xarray/time.py +63 -30
  26. anemoi/datasets/create/functions/sources/xarray/variable.py +15 -38
  27. anemoi/datasets/create/input.py +62 -25
  28. anemoi/datasets/create/statistics/__init__.py +39 -23
  29. anemoi/datasets/create/utils.py +3 -2
  30. anemoi/datasets/data/__init__.py +1 -0
  31. anemoi/datasets/data/concat.py +46 -2
  32. anemoi/datasets/data/dataset.py +109 -34
  33. anemoi/datasets/data/forwards.py +17 -8
  34. anemoi/datasets/data/grids.py +17 -3
  35. anemoi/datasets/data/interpolate.py +133 -0
  36. anemoi/datasets/data/misc.py +56 -66
  37. anemoi/datasets/data/missing.py +240 -0
  38. anemoi/datasets/data/select.py +7 -1
  39. anemoi/datasets/data/stores.py +3 -3
  40. anemoi/datasets/data/subset.py +47 -5
  41. anemoi/datasets/data/unchecked.py +20 -22
  42. anemoi/datasets/data/xy.py +125 -0
  43. anemoi/datasets/dates/__init__.py +33 -20
  44. anemoi/datasets/dates/groups.py +2 -2
  45. anemoi/datasets/grids.py +66 -48
  46. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/METADATA +5 -5
  47. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/RECORD +51 -41
  48. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/WHEEL +1 -1
  49. anemoi/datasets/create/loaders.py +0 -924
  50. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/LICENSE +0 -0
  51. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/entry_points.txt +0 -0
  52. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/top_level.txt +0 -0
anemoi/datasets/create/__init__.py
@@ -7,196 +7,1011 @@
  # nor does it submit to any jurisdiction.
  #

+ import datetime
+ import json
  import logging
  import os
+ import time
+ import uuid
+ import warnings
+ from functools import cached_property
+
+ import numpy as np
+ import tqdm
+ from anemoi.utils.config import DotDict as DotDict
+ from anemoi.utils.dates import as_datetime
+ from anemoi.utils.dates import frequency_to_string
+ from anemoi.utils.dates import frequency_to_timedelta
+ from anemoi.utils.humanize import compress_dates
+ from anemoi.utils.humanize import seconds_to_human
+
+ from anemoi.datasets import MissingDateError
+ from anemoi.datasets import open_dataset
+ from anemoi.datasets.create.persistent import build_storage
+ from anemoi.datasets.data.misc import as_first_date
+ from anemoi.datasets.data.misc import as_last_date
+ from anemoi.datasets.dates.groups import Groups
+
+ from .check import DatasetName
+ from .check import check_data_values
+ from .chunks import ChunkFilter
+ from .config import build_output
+ from .config import loader_config
+ from .input import build_input
+ from .statistics import Summary
+ from .statistics import TmpStatistics
+ from .statistics import check_variance
+ from .statistics import compute_statistics
+ from .statistics import default_statistics_dates
+ from .statistics import fix_variance
+ from .utils import normalize_and_check_dates
+ from .writer import ViewCacheArray

  LOG = logging.getLogger(__name__)

+ VERSION = "0.20"
+
+
+ def json_tidy(o):
+
+     if isinstance(o, datetime.datetime):
+         return o.isoformat()
+
+     if isinstance(o, datetime.datetime):
+         return o.isoformat()
+
+     if isinstance(o, datetime.timedelta):
+         return frequency_to_string(o)
+
+     raise TypeError(repr(o) + " is not JSON serializable")
+
+
+ def build_statistics_dates(dates, start, end):
+     """Compute the start and end dates for the statistics, based on :
+     - The start and end dates in the config
+     - The default statistics dates convention
+
+     Then adapt according to the actual dates in the dataset.
+     """
+     # if not specified, use the default statistics dates
+     default_start, default_end = default_statistics_dates(dates)
+     if start is None:
+         start = default_start
+     if end is None:
+         end = default_end
+
+     # in any case, adapt to the actual dates in the dataset
+     start = as_first_date(start, dates)
+     end = as_last_date(end, dates)
+
+     # and convert to datetime to isoformat
+     start = start.astype(datetime.datetime)
+     end = end.astype(datetime.datetime)
+     return (start.isoformat(), end.isoformat())
+

  def _ignore(*args, **kwargs):
      pass


- class Creator:
-     def __init__(
-         self,
-         path,
-         config=None,
-         cache=None,
-         use_threads=False,
-         statistics_tmp=None,
-         overwrite=False,
-         test=None,
-         progress=None,
-         **kwargs,
-     ):
-         self.path = path  # Output path
-         self.config = config
+ def _path_readable(path):
+     import zarr
+
+     try:
+         zarr.open(path, "r")
+         return True
+     except zarr.errors.PathNotFoundError:
+         return False
+
+
+ class Dataset:
+     def __init__(self, path):
+         self.path = path
+
+         _, ext = os.path.splitext(self.path)
+         if ext != ".zarr":
+             raise ValueError(f"Unsupported extension={ext} for path={self.path}")
+
+     def add_dataset(self, mode="r+", **kwargs):
+         import zarr
+
+         z = zarr.open(self.path, mode=mode)
+         from .zarr import add_zarr_dataset
+
+         return add_zarr_dataset(zarr_root=z, **kwargs)
+
+     def update_metadata(self, **kwargs):
+         import zarr
+
+         LOG.debug(f"Updating metadata {kwargs}")
+         z = zarr.open(self.path, mode="w+")
+         for k, v in kwargs.items():
+             if isinstance(v, np.datetime64):
+                 v = v.astype(datetime.datetime)
+             if isinstance(v, datetime.date):
+                 v = v.isoformat()
+             z.attrs[k] = json.loads(json.dumps(v, default=json_tidy))
+
+     @property
+     def anemoi_dataset(self):
+         return open_dataset(self.path)
+
+     @cached_property
+     def zarr_metadata(self):
+         import zarr
+
+         return dict(zarr.open(self.path, mode="r").attrs)
+
+     def print_info(self):
+         import zarr
+
+         z = zarr.open(self.path, mode="r")
+         try:
+             LOG.info(z["data"].info)
+         except Exception as e:
+             LOG.info(e)
+
+     def get_zarr_chunks(self):
+         import zarr
+
+         z = zarr.open(self.path, mode="r")
+         return z["data"].chunks
+
+     def check_name(self, resolution, dates, frequency, raise_exception=True, is_test=False):
+         basename, _ = os.path.splitext(os.path.basename(self.path))
+         try:
+             DatasetName(basename, resolution, dates[0], dates[-1], frequency).raise_if_not_valid()
+         except Exception as e:
+             if raise_exception and not is_test:
+                 raise e
+             else:
+                 LOG.warning(f"Dataset name error: {e}")
+
+     def get_main_config(self):
+         """Returns None if the config is not found."""
+         import zarr
+
+         z = zarr.open(self.path, mode="r")
+         return loader_config(z.attrs.get("_create_yaml_config"))
+
+
+ class WritableDataset(Dataset):
+     def __init__(self, path):
+         super().__init__(path)
+         self.path = path
+
+         import zarr
+
+         self.z = zarr.open(self.path, mode="r+")
+
+     @cached_property
+     def data_array(self):
+         import zarr
+
+         return zarr.open(self.path, mode="r+")["data"]
+
+
+ class NewDataset(Dataset):
+     def __init__(self, path, overwrite=False):
+         super().__init__(path)
+         self.path = path
+
+         import zarr
+
+         self.z = zarr.open(self.path, mode="w")
+         self.z.create_group("_build")
+
+
+ class Actor:  # TODO: rename to Creator
+     dataset_class = WritableDataset
+
+     def __init__(self, path, cache=None):
+         # Catch all floating point errors, including overflow, sqrt(<0), etc
+         np.seterr(all="raise", under="warn")
+
+         self.path = path
          self.cache = cache
+         self.dataset = self.dataset_class(self.path)
+
+     def run(self):
+         # to be implemented in the sub-classes
+         raise NotImplementedError()
+
+     def update_metadata(self, **kwargs):
+         self.dataset.update_metadata(**kwargs)
+
+     def _cache_context(self):
+         from .utils import cache_context
+
+         return cache_context(self.cache)
+
+     def check_unkown_kwargs(self, kwargs):
+         # remove this latter
+         LOG.warning(f"💬 Unknown kwargs for {self.__class__.__name__}: {kwargs}")
+
+     def read_dataset_metadata(self, path):
+         ds = open_dataset(path)
+         self.dataset_shape = ds.shape
+         self.variables_names = ds.variables
+         assert len(self.variables_names) == ds.shape[1], self.dataset_shape
+         self.dates = ds.dates
+
+         self.missing_dates = sorted(list([self.dates[i] for i in ds.missing]))
+
+         def check_missing_dates(expected):
+             import zarr
+
+             z = zarr.open(path, "r")
+             missing_dates = z.attrs.get("missing_dates", [])
+             missing_dates = sorted([np.datetime64(d) for d in missing_dates])
+             if missing_dates != expected:
+                 LOG.warn("Missing dates given in recipe do not match the actual missing dates in the dataset.")
+                 LOG.warn(f"Missing dates in recipe: {sorted(str(x) for x in missing_dates)}")
+                 LOG.warn(f"Missing dates in dataset: {sorted(str(x) for x in expected)}")
+                 raise ValueError("Missing dates given in recipe do not match the actual missing dates in the dataset.")
+
+         check_missing_dates(self.missing_dates)
+
+
+ class Patch(Actor):
+     def __init__(self, path, options=None, **kwargs):
+         self.path = path
+         self.options = options or {}
+
+     def run(self):
+         from .patch import apply_patch
+
+         apply_patch(self.path, **self.options)
+
+
+ class Size(Actor):
+     def __init__(self, path, **kwargs):
+         super().__init__(path)
+
+     def run(self):
+         from .size import compute_directory_sizes
+
+         metadata = compute_directory_sizes(self.path)
+         self.update_metadata(**metadata)
+
+
+ class HasRegistryMixin:
+     @cached_property
+     def registry(self):
+         from .zarr import ZarrBuiltRegistry
+
+         return ZarrBuiltRegistry(self.path, use_threads=self.use_threads)
+
+
+ class HasStatisticTempMixin:
+     @cached_property
+     def tmp_statistics(self):
+         directory = self.statistics_temp_dir or os.path.join(self.path + ".storage_for_statistics.tmp")
+         return TmpStatistics(directory)
+
+
+ class HasElementForDataMixin:
+     def create_elements(self, config):
+
+         assert self.registry
+         assert self.tmp_statistics
+
+         LOG.info(dict(config.dates))
+
+         self.groups = Groups(**config.dates)
+         LOG.info(self.groups)
+
+         self.output = build_output(config.output, parent=self)
+
+         self.input = build_input_(main_config=config, output_config=self.output)
+         LOG.info(self.input)
+
+
+ def build_input_(main_config, output_config):
+     from earthkit.data.core.order import build_remapping
+
+     builder = build_input(
+         main_config.input,
+         data_sources=main_config.get("data_sources", {}),
+         order_by=output_config.order_by,
+         flatten_grid=output_config.flatten_grid,
+         remapping=build_remapping(output_config.remapping),
+         use_grib_paramid=main_config.build.use_grib_paramid,
+     )
+     LOG.debug("✅ INPUT_BUILDER")
+     LOG.debug(builder)
+     return builder
+
+
+ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixin):
+     dataset_class = NewDataset
+     def __init__(self, path, config, check_name=False, overwrite=False, use_threads=False, statistics_temp_dir=None, progress=None, test=False, cache=None, **kwargs):  # fmt: skip
+         if _path_readable(path) and not overwrite:
+             raise Exception(f"{self.path} already exists. Use overwrite=True to overwrite.")
+
+         super().__init__(path, cache=cache)
+         self.config = config
+         self.check_name = check_name
          self.use_threads = use_threads
-         self.statistics_tmp = statistics_tmp
-         self.overwrite = overwrite
+         self.statistics_temp_dir = statistics_temp_dir
+         self.progress = progress
          self.test = test
-         self.progress = progress if progress is not None else _ignore

-     def init(self, check_name=False):
-         # check path
-         _, ext = os.path.splitext(self.path)
-         assert ext != "zarr", f"Unsupported extension={ext}"
-         from .loaders import InitialiserLoader
+         self.main_config = loader_config(config, is_test=test)

-         if self._path_readable() and not self.overwrite:
-             raise Exception(f"{self.path} already exists. Use overwrite=True to overwrite.")
+         # self.registry.delete() ??
+         self.tmp_statistics.delete()

+         assert isinstance(self.main_config.output.order_by, dict), self.main_config.output.order_by
+         self.create_elements(self.main_config)
+
+         first_date = self.groups.dates[0]
+         self.minimal_input = self.input.select([first_date])
+         LOG.info("Minimal input for 'init' step (using only the first date) :")
+         LOG.info(self.minimal_input)
+
+     def run(self):
          with self._cache_context():
-             obj = InitialiserLoader.from_config(
-                 path=self.path,
-                 config=self.config,
-                 statistics_tmp=self.statistics_tmp,
-                 use_threads=self.use_threads,
-                 progress=self.progress,
-                 test=self.test,
+             return self._run()
+
+     def _run(self):
+         """Create an empty dataset of the right final shape
+
+         Read a small part of the data to get the shape of the data and the resolution and more metadata.
+         """
+
+         LOG.info("Config loaded ok:")
+         # LOG.info(self.main_config)
+
+         dates = self.groups.dates
+         frequency = dates.frequency
+         assert isinstance(frequency, datetime.timedelta), frequency
+
+         LOG.info(f"Found {len(dates)} datetimes.")
+         LOG.info(f"Dates: Found {len(dates)} datetimes, in {len(self.groups)} groups: ")
+         LOG.info(f"Missing dates: {len(dates.missing)}")
+         lengths = tuple(len(g) for g in self.groups)
+
+         variables = self.minimal_input.variables
+         LOG.info(f"Found {len(variables)} variables : {','.join(variables)}.")
+
+         variables_with_nans = self.main_config.statistics.get("allow_nans", [])
+
+         ensembles = self.minimal_input.ensembles
+         LOG.info(f"Found {len(ensembles)} ensembles : {','.join([str(_) for _ in ensembles])}.")
+
+         grid_points = self.minimal_input.grid_points
+         LOG.info(f"gridpoints size: {[len(i) for i in grid_points]}")
+
+         resolution = self.minimal_input.resolution
+         LOG.info(f"{resolution=}")
+
+         coords = self.minimal_input.coords
+         coords["dates"] = dates
+         total_shape = self.minimal_input.shape
+         total_shape[0] = len(dates)
+         LOG.info(f"total_shape = {total_shape}")
+
+         chunks = self.output.get_chunking(coords)
+         LOG.info(f"{chunks=}")
+         dtype = self.output.dtype
+
+         LOG.info(f"Creating Dataset '{self.path}', with {total_shape=}, {chunks=} and {dtype=}")
+
+         metadata = {}
+         metadata["uuid"] = str(uuid.uuid4())
+
+         metadata.update(self.main_config.get("add_metadata", {}))
+
+         metadata["_create_yaml_config"] = self.main_config.get_serialisable_dict()
+
+         metadata["description"] = self.main_config.description
+         metadata["licence"] = self.main_config["licence"]
+         metadata["attribution"] = self.main_config["attribution"]
+
+         metadata["remapping"] = self.output.remapping
+         metadata["order_by"] = self.output.order_by_as_list
+         metadata["flatten_grid"] = self.output.flatten_grid
+
+         metadata["ensemble_dimension"] = len(ensembles)
+         metadata["variables"] = variables
+         metadata["variables_with_nans"] = variables_with_nans
+         metadata["allow_nans"] = self.main_config.build.get("allow_nans", False)
+         metadata["resolution"] = resolution
+
+         metadata["data_request"] = self.minimal_input.data_request
+         metadata["field_shape"] = self.minimal_input.field_shape
+         metadata["proj_string"] = self.minimal_input.proj_string
+
+         metadata["start_date"] = dates[0].isoformat()
+         metadata["end_date"] = dates[-1].isoformat()
+         metadata["frequency"] = frequency
+         metadata["missing_dates"] = [_.isoformat() for _ in dates.missing]
+
+         metadata["version"] = VERSION
+
+         self.dataset.check_name(
+             raise_exception=self.check_name,
+             is_test=self.test,
+             resolution=resolution,
+             dates=dates,
+             frequency=frequency,
+         )
+
+         if len(dates) != total_shape[0]:
+             raise ValueError(
+                 f"Final date size {len(dates)} (from {dates[0]} to {dates[-1]}, {frequency=}) "
+                 f"does not match data shape {total_shape[0]}. {total_shape=}"
              )
-             return obj.initialise(check_name=check_name)

-     def load(self, parts=None):
-         from .loaders import ContentLoader
+         dates = normalize_and_check_dates(dates, metadata["start_date"], metadata["end_date"], metadata["frequency"])
+
+         metadata.update(self.main_config.get("force_metadata", {}))
+
+         ###############################################################
+         # write metadata
+         ###############################################################
+
+         self.update_metadata(**metadata)
+
+         self.dataset.add_dataset(
+             name="data",
+             chunks=chunks,
+             dtype=dtype,
+             shape=total_shape,
+             dimensions=("time", "variable", "ensemble", "cell"),
+         )
+         self.dataset.add_dataset(name="dates", array=dates, dimensions=("time",))
+         self.dataset.add_dataset(name="latitudes", array=grid_points[0], dimensions=("cell",))
+         self.dataset.add_dataset(name="longitudes", array=grid_points[1], dimensions=("cell",))
+
+         self.registry.create(lengths=lengths)
+         self.tmp_statistics.create(exist_ok=False)
+         self.registry.add_to_history("tmp_statistics_initialised", version=self.tmp_statistics.version)
+
+         statistics_start, statistics_end = build_statistics_dates(
+             dates,
+             self.main_config.statistics.get("start"),
+             self.main_config.statistics.get("end"),
+         )
+         self.update_metadata(statistics_start_date=statistics_start, statistics_end_date=statistics_end)
+         LOG.info(f"Will compute statistics from {statistics_start} to {statistics_end}")
+
+         self.registry.add_to_history("init finished")
+
+         assert chunks == self.dataset.get_zarr_chunks(), (chunks, self.dataset.get_zarr_chunks())
+
+         def sanity_check_config(a, b):
+             a = json.dumps(a, sort_keys=True, default=str)
+             b = json.dumps(b, sort_keys=True, default=str)
+             b = b.replace("T", " ")  # dates are expected to be different because
+             if a != b:
+                 print("❌❌❌ FIXME: Config serialisation to be checked")
+                 print(a)
+                 print(b)
+
+         sanity_check_config(self.main_config, self.dataset.get_main_config())

+         # Return the number of groups to process, so we can show a nice progress bar
+         return len(lengths)
+
+
+ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixin):
+     def __init__(self, path, parts=None, use_threads=False, statistics_temp_dir=None, progress=None, cache=None, **kwargs):  # fmt: skip
+         super().__init__(path, cache=cache)
+         self.use_threads = use_threads
+         self.statistics_temp_dir = statistics_temp_dir
+         self.progress = progress
+         self.parts = parts
+         self.dataset = WritableDataset(self.path)
+
+         self.main_config = self.dataset.get_main_config()
+         self.create_elements(self.main_config)
+         self.read_dataset_metadata(self.dataset.path)
+
+         total = len(self.registry.get_flags())
+         self.chunk_filter = ChunkFilter(parts=self.parts, total=total)
+
+         self.data_array = self.dataset.data_array
+         self.n_groups = len(self.groups)
+
+     def run(self):
          with self._cache_context():
-             loader = ContentLoader.from_dataset_config(
-                 path=self.path,
-                 statistics_tmp=self.statistics_tmp,
-                 use_threads=self.use_threads,
-                 progress=self.progress,
-                 parts=parts,
+             self._run()
+
+     def _run(self):
+         for igroup, group in enumerate(self.groups):
+             if not self.chunk_filter(igroup):
+                 continue
+             if self.registry.get_flag(igroup):
+                 LOG.info(f" -> Skipping {igroup} total={len(self.groups)} (already done)")
+                 continue
+
+             assert isinstance(group[0], datetime.datetime), group
+             LOG.debug(f"Building data for group {igroup}/{self.n_groups}")
+
+             result = self.input.select(dates=group)
+             assert result.dates == group, (len(result.dates), len(group))
+
+             # There are several groups.
+             # There is one result to load for each group.
+             self.load_result(result)
+             self.registry.set_flag(igroup)
+
+         self.registry.add_provenance(name="provenance_load")
+         self.tmp_statistics.add_provenance(name="provenance_load", config=self.main_config)
+
+         self.dataset.print_info()
+
+     def load_result(self, result):
+         # There is one cube to load for each result.
+         dates = result.dates
+
+         cube = result.get_cube()
+         shape = cube.extended_user_shape
+         dates_in_data = cube.user_coords["valid_datetime"]
+
+         LOG.debug(f"Loading {shape=} in {self.data_array.shape=}")
+
+         def check_shape(cube, dates, dates_in_data):
+             if cube.extended_user_shape[0] != len(dates):
+                 print(f"Cube shape does not match the number of dates {cube.extended_user_shape[0]}, {len(dates)}")
+                 print("Requested dates", compress_dates(dates))
+                 print("Cube dates", compress_dates(dates_in_data))
+
+                 a = set(as_datetime(_) for _ in dates)
+                 b = set(as_datetime(_) for _ in dates_in_data)
+
+                 print("Missing dates", compress_dates(a - b))
+                 print("Extra dates", compress_dates(b - a))
+
+                 raise ValueError(
+                     f"Cube shape does not match the number of dates {cube.extended_user_shape[0]}, {len(dates)}"
+                 )
+
+         check_shape(cube, dates, dates_in_data)
+
+         def check_dates_in_data(lst, lst2):
+             lst2 = [np.datetime64(_) for _ in lst2]
+             lst = [np.datetime64(_) for _ in lst]
+             assert lst == lst2, ("Dates in data are not the requested ones:", lst, lst2)
+
+         check_dates_in_data(dates_in_data, dates)
+
+         def dates_to_indexes(dates, all_dates):
+             x = np.array(dates, dtype=np.datetime64)
+             y = np.array(all_dates, dtype=np.datetime64)
+             bitmap = np.isin(x, y)
+             return np.where(bitmap)[0]
+
+         indexes = dates_to_indexes(self.dates, dates_in_data)
+
+         array = ViewCacheArray(self.data_array, shape=shape, indexes=indexes)
+         self.load_cube(cube, array)
+
+         stats = compute_statistics(array.cache, self.variables_names, allow_nans=self._get_allow_nans())
+         self.tmp_statistics.write(indexes, stats, dates=dates_in_data)
+
+         array.flush()
+
+     def _get_allow_nans(self):
+         config = self.main_config
+         if "allow_nans" in config.build:
+             return config.build.allow_nans
+
+         return config.statistics.get("allow_nans", [])
+
+     def load_cube(self, cube, array):
+         # There are several cubelets for each cube
+         start = time.time()
+         load = 0
+         save = 0
+
+         reading_chunks = None
+         total = cube.count(reading_chunks)
+         LOG.debug(f"Loading datacube: {cube}")
+
+         def position(x):
+             if isinstance(x, str) and "/" in x:
+                 x = x.split("/")
+                 return int(x[0])
+             return None
+
+         bar = tqdm.tqdm(
+             iterable=cube.iterate_cubelets(reading_chunks),
+             total=total,
+             desc=f"Loading datacube {cube}",
+             position=position(self.parts),
+         )
+         for i, cubelet in enumerate(bar):
+             bar.set_description(f"Loading {i}/{total}")
+
+             now = time.time()
+             data = cubelet.to_numpy()
+             local_indexes = cubelet.coords
+             load += time.time() - now
+
+             name = self.variables_names[local_indexes[1]]
+             check_data_values(
+                 data[:],
+                 name=name,
+                 log=[i, data.shape, local_indexes],
+                 allow_nans=self._get_allow_nans(),
              )
-         loader.load()
-
-     def statistics(self, force=False, output=None, start=None, end=None):
-         from .loaders import StatisticsAdder
-
-         loader = StatisticsAdder.from_dataset(
-             path=self.path,
-             use_threads=self.use_threads,
-             progress=self.progress,
-             statistics_tmp=self.statistics_tmp,
-             statistics_output=output,
-             recompute=False,
-             statistics_start=start,
-             statistics_end=end,
+
+             now = time.time()
+             array[local_indexes] = data
+             save += time.time() - now
+
+         now = time.time()
+         save += time.time() - now
+         LOG.debug(
+             f"Elapsed: {seconds_to_human(time.time() - start)}, "
+             f"load time: {seconds_to_human(load)}, "
+             f"write time: {seconds_to_human(save)}."
          )
-         loader.run()
-         assert loader.ready()

-     def size(self):
-         from .loaders import DatasetHandler
-         from .size import compute_directory_sizes

-         metadata = compute_directory_sizes(self.path)
-         handle = DatasetHandler.from_dataset(path=self.path, use_threads=self.use_threads)
-         handle.update_metadata(**metadata)
+ class Cleanup(Actor, HasRegistryMixin, HasStatisticTempMixin):
+     def __init__(self, path, statistics_temp_dir=None, delta=[], use_threads=False, **kwargs):
+         super().__init__(path)
+         self.use_threads = use_threads
+         self.statistics_temp_dir = statistics_temp_dir
+         self.additinon_temp_dir = statistics_temp_dir
+         self.actors = [
+             _InitAdditions(path, delta=d, use_threads=use_threads, statistics_temp_dir=statistics_temp_dir)
+             for d in delta
+         ]
+
+     def run(self):
+         self.tmp_statistics.delete()
+         self.registry.clean()
+         for actor in self.actors:
+             actor.cleanup()
+
+
+ class Verify(Actor):
+     def __init__(self, path, **kwargs):
+         super().__init__(path)
+
+     def run(self):
+         LOG.info(f"Verifying dataset at {self.path}")
+         LOG.info(str(self.dataset.anemoi_dataset))
+
+
+ class AdditionsMixin:
+     def skip(self):
+         frequency = frequency_to_timedelta(self.dataset.anemoi_dataset.frequency)
+         if not self.delta.total_seconds() % frequency.total_seconds() == 0:
+             LOG.debug(f"Delta {self.delta} is not a multiple of frequency {frequency}. Skipping.")
+             return True
+         return False
+
+     @cached_property
+     def tmp_storage_path(self):
+         name = "storage_for_additions"
+         if self.delta:
+             name += frequency_to_string(self.delta)
+         return os.path.join(f"{self.path}.{name}.tmp")
+
+     def read_from_dataset(self):
+         self.variables = self.dataset.anemoi_dataset.variables
+         self.frequency = frequency_to_timedelta(self.dataset.anemoi_dataset.frequency)
+         start = self.dataset.zarr_metadata["statistics_start_date"]
+         end = self.dataset.zarr_metadata["statistics_end_date"]
+         self.start = datetime.datetime.fromisoformat(start)
+         self.end = datetime.datetime.fromisoformat(end)
+
+         ds = open_dataset(self.path, start=self.start, end=self.end)
+         self.dates = ds.dates
+         self.total = len(self.dates)
+
+         idelta = self.delta.total_seconds() // self.frequency.total_seconds()
+         assert int(idelta) == idelta, idelta
+         idelta = int(idelta)
+         self.ds = DeltaDataset(ds, idelta)
+
+
+ class DeltaDataset:
+     def __init__(self, ds, idelta):
+         self.ds = ds
+         self.idelta = idelta
+
+     def __getitem__(self, i):
+         j = i - self.idelta
+         if j < 0:
+             raise MissingDateError(f"Missing date {j}")
+         return self.ds[i : i + 1, ...] - self.ds[j : j + 1, ...]
+
+
+ class _InitAdditions(Actor, HasRegistryMixin, AdditionsMixin):
+     def __init__(self, path, delta, use_threads=False, progress=None, **kwargs):
+         super().__init__(path)
+         self.delta = frequency_to_timedelta(delta)
+         self.use_threads = use_threads
+         self.progress = progress
+
+     def run(self):
+         if self.skip():
+             LOG.info(f"Skipping delta={self.delta}")
+             return
+
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=True)
+         self.tmp_storage.delete()
+         self.tmp_storage.create()
+         LOG.info(f"Dataset {self.tmp_storage_path} additions initialized.")

      def cleanup(self):
-         from .loaders import DatasetHandlerWithStatistics
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=False)
+         self.tmp_storage.delete()
+         LOG.info(f"Cleaned temporary storage {self.tmp_storage_path}")

-         cleaner = DatasetHandlerWithStatistics.from_dataset(
-             path=self.path, use_threads=self.use_threads, progress=self.progress, statistics_tmp=self.statistics_tmp
-         )
-         cleaner.tmp_statistics.delete()
-         cleaner.registry.clean()

-     def patch(self, **kwargs):
-         from .patch import apply_patch
+ class _RunAdditions(Actor, HasRegistryMixin, AdditionsMixin):
+     def __init__(self, path, delta, parts=None, use_threads=False, progress=None, **kwargs):
+         super().__init__(path)
+         self.delta = frequency_to_timedelta(delta)
+         self.use_threads = use_threads
+         self.progress = progress
+         self.parts = parts

-         apply_patch(self.path, **kwargs)
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=False)
+         LOG.info(f"Writing in {self.tmp_storage_path}")

-     def init_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
-         from .loaders import StatisticsAddition
-         from .loaders import TendenciesStatisticsAddition
-         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
+     def run(self):
+         if self.skip():
+             LOG.info(f"Skipping delta={self.delta}")
+             return

-         if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
-             a.initialise()
+         self.read_from_dataset()

-         for d in delta:
+         chunk_filter = ChunkFilter(parts=self.parts, total=self.total)
+         for i in range(0, self.total):
+             if not chunk_filter(i):
+                 continue
+             date = self.dates[i]
              try:
-                 a = TendenciesStatisticsAddition.from_dataset(
-                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
-                 )
-                 a.initialise()
-             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 LOG.info(f"Skipping delta={d} as it is not a multiple of the frequency.")
+                 arr = self.ds[i]
+                 stats = compute_statistics(arr, self.variables, allow_nans=self.allow_nans)
+                 self.tmp_storage.add([date, i, stats], key=date)
+             except MissingDateError:
+                 self.tmp_storage.add([date, i, "missing"], key=date)
+         self.tmp_storage.flush()
+         LOG.debug(f"Dataset {self.path} additions run.")
+
+     def allow_nans(self):
+         if self.dataset.anemoi_dataset.metadata.get("allow_nans", False):
+             return True

-     def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24], statistics=True):
-         from .loaders import StatisticsAddition
-         from .loaders import TendenciesStatisticsAddition
-         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
+         variables_with_nans = self.dataset.anemoi_dataset.metadata.get("variables_with_nans", None)
+         if variables_with_nans is not None:
+             return variables_with_nans
+         warnings.warn(f"❗Cannot find 'variables_with_nans' in {self.path}, assuming nans allowed.")
+         return True

-         if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
-             a.run(parts)

-         for d in delta:
-             try:
-                 a = TendenciesStatisticsAddition.from_dataset(
-                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
-                 )
-                 a.run(parts)
-             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
+ class _FinaliseAdditions(Actor, HasRegistryMixin, AdditionsMixin):
+     def __init__(self, path, delta, use_threads=False, progress=None, **kwargs):
+         super().__init__(path)
+         self.delta = frequency_to_timedelta(delta)
+         self.use_threads = use_threads
+         self.progress = progress
+
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=False)
+         LOG.info(f"Reading from {self.tmp_storage_path}.")
+
+     def run(self):
+         if self.skip():
+             LOG.info(f"Skipping delta={self.delta}.")
+             return
+
+         self.read_from_dataset()
+
+         shape = (len(self.dates), len(self.variables))
+         agg = dict(
+             minimum=np.full(shape, np.nan, dtype=np.float64),
+             maximum=np.full(shape, np.nan, dtype=np.float64),
+             sums=np.full(shape, np.nan, dtype=np.float64),
+             squares=np.full(shape, np.nan, dtype=np.float64),
+             count=np.full(shape, -1, dtype=np.int64),
+             has_nans=np.full(shape, False, dtype=np.bool_),
+         )
+         LOG.debug(f"Aggregating {self.__class__.__name__} statistics on shape={shape}. Variables : {self.variables}")
+
+         found = set()
+         ifound = set()
+         missing = set()
+         for _date, (date, i, stats) in self.tmp_storage.items():
+             assert _date == date
+             if stats == "missing":
+                 missing.add(date)
+                 continue
+
+             assert date not in found, f"Duplicates found {date}"
+             found.add(date)
+             ifound.add(i)
+
+             for k in ["minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+                 agg[k][i, ...] = stats[k]
+
+         assert len(found) + len(missing) == len(self.dates), (
+             len(found),
+             len(missing),
+             len(self.dates),
+         )
+         assert found.union(missing) == set(self.dates), (
+             found,
+             missing,
+             set(self.dates),
+         )

-     def finalise_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
-         from .loaders import StatisticsAddition
-         from .loaders import TendenciesStatisticsAddition
-         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
+         if len(ifound) < 2:
+             LOG.warn(f"Not enough data found in {self.path} to compute {self.__class__.__name__}. Skipped.")
+             self.tmp_storage.delete()
+             return

-         if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
-             a.finalise()
+         mask = sorted(list(ifound))
+         for k in ["minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             agg[k] = agg[k][mask, ...]

-         for d in delta:
-             try:
-                 a = TendenciesStatisticsAddition.from_dataset(
-                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
-                 )
-                 a.finalise()
-             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
-
-     def finalise(self, **kwargs):
-         self.statistics(**kwargs)
-         self.size()
-
-     def create(self):
-         self.init()
-         self.load()
-         self.finalise()
-         self.additions()
-         self.cleanup()
-
-     def additions(self, delta=[1, 3, 6, 12, 24]):
-         self.init_additions(delta=delta)
-         self.run_additions(delta=delta)
-         self.finalise_additions(delta=delta)
+         for k in ["minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             assert agg[k].shape == agg["count"].shape, (
+                 agg[k].shape,
+                 agg["count"].shape,
+             )

-     def _cache_context(self):
-         from .utils import cache_context
+         minimum = np.nanmin(agg["minimum"], axis=0)
+         maximum = np.nanmax(agg["maximum"], axis=0)
+         sums = np.nansum(agg["sums"], axis=0)
+         squares = np.nansum(agg["squares"], axis=0)
+         count = np.nansum(agg["count"], axis=0)
+         has_nans = np.any(agg["has_nans"], axis=0)
+
+         assert sums.shape == count.shape
+         assert sums.shape == squares.shape
+         assert sums.shape == minimum.shape
+         assert sums.shape == maximum.shape
+         assert sums.shape == has_nans.shape
+
+         mean = sums / count
+         assert sums.shape == mean.shape
+
+         x = squares / count - mean * mean
+         # x[- 1e-15 < (x / (np.sqrt(squares / count) + np.abs(mean))) < 0] = 0
+         # remove negative variance due to numerical errors
+         for i, name in enumerate(self.variables):
+             x[i] = fix_variance(x[i], name, agg["count"][i : i + 1], agg["sums"][i : i + 1], agg["squares"][i : i + 1])
+         check_variance(x, self.variables, minimum, maximum, mean, count, sums, squares)
+
+         stdev = np.sqrt(x)
+         assert sums.shape == stdev.shape
+
+         self.summary = Summary(
+             minimum=minimum,
+             maximum=maximum,
+             mean=mean,
+             count=count,
+             sums=sums,
+             squares=squares,
+             stdev=stdev,
+             variables_names=self.variables,
+             has_nans=has_nans,
+         )
+         LOG.info(f"Dataset {self.path} additions finalised.")
+         # self.check_statistics()
+         self._write(self.summary)
+         self.tmp_storage.delete()

-         return cache_context(self.cache)
+     def _write(self, summary):
+         for k in ["mean", "stdev", "minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             name = f"statistics_tendencies_{frequency_to_string(self.delta)}_{k}"
+             self.dataset.add_dataset(name=name, array=summary[k], dimensions=("variable",))
+         self.registry.add_to_history(f"compute_statistics_{self.__class__.__name__.lower()}_end")
+         LOG.debug(f"Wrote additions in {self.path}")

-     def _path_readable(self):
-         import zarr

-         try:
-             zarr.open(self.path, "r")
-             return True
-         except zarr.errors.PathNotFoundError:
-             return False
+ def multi_addition(cls):
+     class MultiAdditions:
+         def __init__(self, *args, **kwargs):
+             self.actors = []
+
+             for k in kwargs.pop("delta", []):
+                 self.actors.append(cls(*args, delta=k, **kwargs))
+
+             if not self.actors:
+                 LOG.warning("No delta found in kwargs, no addtions will be computed.")

-     def verify(self):
-         from .loaders import DatasetVerifier
+         def run(self):
+             for actor in self.actors:
+                 actor.run()

-         handle = DatasetVerifier.from_dataset(path=self.path, use_threads=self.use_threads)
+     return MultiAdditions
+
+
+ InitAdditions = multi_addition(_InitAdditions)
+ RunAdditions = multi_addition(_RunAdditions)
+ FinaliseAdditions = multi_addition(_FinaliseAdditions)
+
+
+ class Statistics(Actor, HasStatisticTempMixin, HasRegistryMixin):
+     def __init__(self, path, use_threads=False, statistics_temp_dir=None, progress=None, **kwargs):
+         super().__init__(path)
+         self.use_threads = use_threads
+         self.progress = progress
+         self.statistics_temp_dir = statistics_temp_dir
+
+     def run(self):
+         start, end = (
+             self.dataset.zarr_metadata["statistics_start_date"],
+             self.dataset.zarr_metadata["statistics_end_date"],
+         )
+         start, end = np.datetime64(start), np.datetime64(end)
+         dates = self.dataset.anemoi_dataset.dates
+         assert type(dates[0]) == type(start), (type(dates[0]), type(start))  # noqa
+         dates = [d for d in dates if d >= start and d <= end]
+         dates = [d for i, d in enumerate(dates) if i not in self.dataset.anemoi_dataset.missing]
+         variables = self.dataset.anemoi_dataset.variables
+         stats = self.tmp_statistics.get_aggregated(dates, variables, self.allow_nans)
+
+         LOG.info(stats)
+
+         if not all(self.registry.get_flags(sync=False)):
+             raise Exception(f"❗Zarr {self.path} is not fully built, not writting statistics into dataset.")
+
+         for k in ["mean", "stdev", "minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             self.dataset.add_dataset(name=k, array=stats[k], dimensions=("variable",))
+
+         self.registry.add_to_history("compute_statistics_end")
+         LOG.info(f"Wrote statistics in {self.path}")
+
+     @cached_property
+     def allow_nans(self):
+         import zarr

-         handle.verify()
+         z = zarr.open(self.path, mode="r")
+         if "allow_nans" in z.attrs:
+             return z.attrs["allow_nans"]
+
+         if "variables_with_nans" in z.attrs:
+             return z.attrs["variables_with_nans"]
+
+         warnings.warn(f"Cannot find 'variables_with_nans' of 'allow_nans' in {self.path}.")
+         return True
+
+
+ def chain(tasks):
+     class Chain(Actor):
+         def __init__(self, **kwargs):
+             self.kwargs = kwargs
+
+         def run(self):
+             for cls in tasks:
+                 t = cls(**self.kwargs)
+                 t.run()
+
+     return Chain
+
+
+ def creator_factory(name, trace=None, **kwargs):
+     if trace:
+         from anemoi.datasets.create.trace import enable_trace
+
+         enable_trace(trace)
+
+     cls = dict(
+         init=Init,
+         load=Load,
+         size=Size,
+         patch=Patch,
+         statistics=Statistics,
+         finalise=chain([Statistics, Size, Cleanup]),
+         cleanup=Cleanup,
+         verify=Verify,
+         init_additions=InitAdditions,
+         load_additions=RunAdditions,
+         run_additions=RunAdditions,
+         finalise_additions=chain([FinaliseAdditions, Size]),
+         additions=chain([InitAdditions, RunAdditions, FinaliseAdditions, Size, Cleanup]),
+     )[name]
+     LOG.debug(f"Creating {cls.__name__} with {kwargs}")
+     return cls(**kwargs)
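
Taken together, this hunk replaces the monolithic Creator class with one Actor subclass per task, dispatched through creator_factory; the new anemoi/datasets/commands/*.py modules in the file list above appear to be thin CLI wrappers around these tasks. The sketch below is a hypothetical driver, not taken from the package documentation: it only illustrates how the tasks might be chained directly from Python using the names and keyword arguments visible in this diff. The output path, recipe file and delta values are made up.

```python
# Hypothetical driver script (illustrative only, assuming anemoi-datasets 0.4.5).
from anemoi.datasets.create import creator_factory

path = "example-dataset.zarr"   # made-up output path; Dataset requires a ".zarr" extension
config = "recipe.yaml"          # made-up creation recipe, parsed by loader_config()

# "init" creates the empty zarr array and metadata; its run() returns the number of groups.
creator_factory("init", path=path, config=config, overwrite=True).run()

# "load" fills the data array; a parts string such as "1/2" would restrict it to a subset of groups.
creator_factory("load", path=path).run()

# "finalise" chains Statistics, Size and Cleanup (see the dict inside creator_factory).
creator_factory("finalise", path=path).run()

# "additions" chains InitAdditions, RunAdditions, FinaliseAdditions, Size and Cleanup
# to compute tendency statistics for each requested delta.
creator_factory("additions", path=path, delta=["6h", "12h"]).run()
```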