anemoi-datasets 0.3.10__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (61)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/compare.py +59 -0
  3. anemoi/datasets/commands/create.py +84 -3
  4. anemoi/datasets/commands/inspect.py +9 -9
  5. anemoi/datasets/commands/scan.py +4 -4
  6. anemoi/datasets/compute/recentre.py +14 -9
  7. anemoi/datasets/create/__init__.py +44 -17
  8. anemoi/datasets/create/check.py +6 -5
  9. anemoi/datasets/create/chunks.py +1 -1
  10. anemoi/datasets/create/config.py +6 -27
  11. anemoi/datasets/create/functions/__init__.py +3 -3
  12. anemoi/datasets/create/functions/filters/empty.py +4 -4
  13. anemoi/datasets/create/functions/filters/rename.py +14 -6
  14. anemoi/datasets/create/functions/filters/rotate_winds.py +16 -60
  15. anemoi/datasets/create/functions/filters/unrotate_winds.py +14 -64
  16. anemoi/datasets/create/functions/sources/__init__.py +39 -0
  17. anemoi/datasets/create/functions/sources/accumulations.py +38 -56
  18. anemoi/datasets/create/functions/sources/constants.py +11 -4
  19. anemoi/datasets/create/functions/sources/empty.py +2 -2
  20. anemoi/datasets/create/functions/sources/forcings.py +3 -3
  21. anemoi/datasets/create/functions/sources/grib.py +8 -4
  22. anemoi/datasets/create/functions/sources/hindcasts.py +32 -364
  23. anemoi/datasets/create/functions/sources/mars.py +57 -26
  24. anemoi/datasets/create/functions/sources/netcdf.py +2 -60
  25. anemoi/datasets/create/functions/sources/opendap.py +3 -2
  26. anemoi/datasets/create/functions/sources/source.py +3 -3
  27. anemoi/datasets/create/functions/sources/tendencies.py +7 -7
  28. anemoi/datasets/create/functions/sources/xarray/__init__.py +73 -0
  29. anemoi/datasets/create/functions/sources/xarray/coordinates.py +234 -0
  30. anemoi/datasets/create/functions/sources/xarray/field.py +109 -0
  31. anemoi/datasets/create/functions/sources/xarray/fieldlist.py +171 -0
  32. anemoi/datasets/create/functions/sources/xarray/flavour.py +330 -0
  33. anemoi/datasets/create/functions/sources/xarray/grid.py +46 -0
  34. anemoi/datasets/create/functions/sources/xarray/metadata.py +161 -0
  35. anemoi/datasets/create/functions/sources/xarray/time.py +98 -0
  36. anemoi/datasets/create/functions/sources/xarray/variable.py +198 -0
  37. anemoi/datasets/create/functions/sources/xarray_kerchunk.py +42 -0
  38. anemoi/datasets/create/functions/sources/xarray_zarr.py +15 -0
  39. anemoi/datasets/create/functions/sources/zenodo.py +40 -0
  40. anemoi/datasets/create/input.py +309 -191
  41. anemoi/datasets/create/loaders.py +155 -77
  42. anemoi/datasets/create/patch.py +17 -14
  43. anemoi/datasets/create/persistent.py +1 -1
  44. anemoi/datasets/create/size.py +4 -5
  45. anemoi/datasets/create/statistics/__init__.py +51 -17
  46. anemoi/datasets/create/template.py +11 -61
  47. anemoi/datasets/create/trace.py +91 -0
  48. anemoi/datasets/create/utils.py +5 -52
  49. anemoi/datasets/create/zarr.py +24 -10
  50. anemoi/datasets/data/dataset.py +4 -4
  51. anemoi/datasets/data/misc.py +9 -37
  52. anemoi/datasets/data/stores.py +37 -14
  53. anemoi/datasets/dates/__init__.py +7 -1
  54. anemoi/datasets/dates/groups.py +3 -0
  55. {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/METADATA +24 -8
  56. anemoi_datasets-0.4.2.dist-info/RECORD +86 -0
  57. {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/WHEEL +1 -1
  58. anemoi_datasets-0.3.10.dist-info/RECORD +0 -73
  59. {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/LICENSE +0 -0
  60. {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/entry_points.txt +0 -0
  61. {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/top_level.txt +0 -0
anemoi/datasets/_version.py
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.3.10'
- __version_tuple__ = version_tuple = (0, 3, 10)
+ __version__ = version = '0.4.2'
+ __version_tuple__ = version_tuple = (0, 4, 2)
anemoi/datasets/commands/compare.py
@@ -8,6 +8,10 @@
  # nor does it submit to any jurisdiction.
  #
 
+ import numpy as np
+ import tqdm
+ import zarr
+
  from anemoi.datasets import open_dataset
 
  from . import Command
@@ -19,6 +23,8 @@ class Compare(Command):
      def add_arguments(self, command_parser):
          command_parser.add_argument("dataset1")
          command_parser.add_argument("dataset2")
+         command_parser.add_argument("--data", action="store_true", help="Compare the data.")
+         command_parser.add_argument("--statistics", action="store_true", help="Compare the statistics.")
 
      def run(self, args):
          ds1 = open_dataset(args.dataset1)
@@ -42,5 +48,58 @@ class Compare(Command):
                  f"{ds2.statistics['mean'][ds2.name_to_index[v]]:14g}",
              )
 
+         if args.data:
+             print()
+             print("Data:")
+             print("-----")
+             print()
+
+             diff = 0
+             for a, b in tqdm.tqdm(zip(ds1, ds2)):
+                 if not np.array_equal(a, b, equal_nan=True):
+                     diff += 1
+
+             print(f"Number of different rows: {diff}/{len(ds1)}")
+
+         if args.statistics:
+             print()
+             print("Statistics:")
+             print("-----------")
+             print()
+
+             ds1 = zarr.open(args.dataset1, mode="r")
+             ds2 = zarr.open(args.dataset2, mode="r")
+
+             for name in (
+                 "data",
+                 "count",
+                 "sums",
+                 "squares",
+                 "mean",
+                 "stdev",
+                 "minimum",
+                 "maximum",
+                 "latitudes",
+                 "longitudes",
+             ):
+                 a1 = ds1[name]
+                 a2 = ds2[name]
+
+                 if len(a1) != len(a2):
+                     print(f"{name}: lengths mismatch {len(a1)} != {len(a2)}")
+                     continue
+
+                 diff = 0
+                 for a, b in tqdm.tqdm(zip(a1, a2), leave=False):
+                     if not np.array_equal(a, b, equal_nan=True):
+                         if diff == 0:
+                             print(f"\n{name}: first different row:")
+                             print(a[a != b])
+                             print(b[a != b])
+
+                         diff += 1
+
+                 print(f"{name}: {diff} different rows out of {len(a1)}")
+
 
  command = Compare
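Note: the new --data check above walks both datasets row by row and counts mismatching rows, treating NaNs in matching positions as equal. A minimal standalone sketch of the same check (the two dataset paths are hypothetical):

import numpy as np

from anemoi.datasets import open_dataset

ds1 = open_dataset("dataset-a.zarr")
ds2 = open_dataset("dataset-b.zarr")

# equal_nan=True makes NaN compare equal to NaN, as in the command above.
diff = sum(1 for a, b in zip(ds1, ds2) if not np.array_equal(a, b, equal_nan=True))
print(f"Number of different rows: {diff}/{len(ds1)}")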
anemoi/datasets/commands/create.py
@@ -1,7 +1,39 @@
- from anemoi.datasets.create import Creator
+ import datetime
+ import logging
+ import time
+ from concurrent.futures import ProcessPoolExecutor
+ from concurrent.futures import ThreadPoolExecutor
+ from concurrent.futures import as_completed
+
+ import tqdm
+ from anemoi.utils.humanize import seconds_to_human
+
+ from anemoi.datasets.create.trace import enable_trace
 
  from . import Command
 
+ LOG = logging.getLogger(__name__)
+
+
+ def task(what, options, *args, **kwargs):
+     """
+     Make sure `import Creator` is done in the sub-processes, and not in the main one.
+     """
+
+     now = datetime.datetime.now()
+     LOG.debug(f"Task {what}({args},{kwargs}) starting")
+
+     from anemoi.datasets.create import Creator
+
+     if "trace" in options:
+         enable_trace(options["trace"])
+
+     c = Creator(**options)
+     result = getattr(c, what)(*args, **kwargs)
+
+     LOG.debug(f"Task {what}({args},{kwargs}) completed ({datetime.datetime.now()-now})")
+     return result
+
 
  class Create(Command):
      """Create a dataset."""
@@ -22,12 +54,61 @@ class Create(Command):
          )
          command_parser.add_argument("config", help="Configuration yaml file defining the recipe to create the dataset.")
          command_parser.add_argument("path", help="Path to store the created data.")
+         group = command_parser.add_mutually_exclusive_group()
+         group.add_argument("--threads", help="Use `n` parallel thread workers.", type=int, default=0)
+         group.add_argument("--processes", help="Use `n` parallel process workers.", type=int, default=0)
+         command_parser.add_argument("--trace", action="store_true")
 
      def run(self, args):
-         kwargs = vars(args)
+         now = time.time()
+         if args.threads + args.processes:
+             self.parallel_create(args)
+         else:
+             self.serial_create(args)
+         LOG.info(f"Create completed in {seconds_to_human(time.time()-now)}")
 
-         c = Creator(**kwargs)
+     def serial_create(self, args):
+         from anemoi.datasets.create import Creator
+
+         options = vars(args)
+         c = Creator(**options)
          c.create()
 
+     def parallel_create(self, args):
+         """Some modules, like fsspec, do not work well with fork().
+         Other modules may not be thread safe. So we implement
+         parallel loading using multiprocessing before any
+         of the modules are imported.
+         """
+
+         options = vars(args)
+         parallel = args.threads + args.processes
+         args.use_threads = args.threads > 0
+
+         if args.use_threads:
+             ExecutorClass = ThreadPoolExecutor
+         else:
+             ExecutorClass = ProcessPoolExecutor
+
+         with ExecutorClass(max_workers=1) as executor:
+             total = executor.submit(task, "init", options).result()
+
+         futures = []
+
+         with ExecutorClass(max_workers=parallel) as executor:
+             for n in range(total):
+                 futures.append(executor.submit(task, "load", options, parts=f"{n+1}/{total}"))
+
+             for future in tqdm.tqdm(
+                 as_completed(futures), desc="Loading", total=len(futures), colour="green", position=parallel + 1
+             ):
+                 future.result()
+
+         with ExecutorClass(max_workers=1) as executor:
+             executor.submit(task, "statistics", options).result()
+             executor.submit(task, "additions", options).result()
+             executor.submit(task, "cleanup", options).result()
+             executor.submit(task, "verify", options).result()
+
 
  command = Create
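Note: the parallel path above runs one "init" task to get the number of parts, then one "load" task per part, then the finalisation tasks serially. A sketch of the same lifecycle driven directly, without executors (the recipe and output paths are hypothetical, and the step-method names simply mirror the task names submitted above; their exact signatures are assumptions):

from anemoi.datasets.create import Creator

options = dict(config="recipe.yaml", path="dataset.zarr", use_threads=True)

total = Creator(**options).init()  # returns the number of parts to load
for n in range(total):
    Creator(**options).load(parts=f"{n + 1}/{total}")

c = Creator(**options)
c.statistics()
c.additions()
c.cleanup()
c.verify()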
anemoi/datasets/commands/inspect.py
@@ -16,7 +16,7 @@ import numpy as np
  import semantic_version
  import tqdm
  from anemoi.utils.humanize import bytes
- from anemoi.utils.humanize import number
+ from anemoi.utils.humanize import bytes_to_human
  from anemoi.utils.humanize import when
  from anemoi.utils.text import dotted_line
  from anemoi.utils.text import progress
@@ -215,9 +215,9 @@ class Version:
          total_size, n = compute_directory_size(self.path)
 
          if total_size is not None:
-             print(f"💽 Size : {bytes(total_size)} ({number(total_size)})")
+             print(f"💽 Size : {bytes(total_size)} ({bytes_to_human(total_size)})")
          if n is not None:
-             print(f"📁 Files : {number(n)}")
+             print(f"📁 Files : {n:,}")
 
      @property
      def statistics(self):
@@ -382,7 +382,7 @@ class NoVersion(Version):
      @property
      def last_date(self):
          monthly = find(self.metadata, "monthly")
-         time = max([int(t) for t in find(self.metadata["climetlab"], "time")])
+         time = max([int(t) for t in find(self.metadata["earthkit-data"], "time")])
          assert isinstance(time, int), (time, type(time))
          if time > 100:
              time = time // 100
@@ -390,7 +390,7 @@ class NoVersion(Version):
 
      @property
      def frequency(self):
-         time = find(self.metadata["climetlab"], "time")
+         time = find(self.metadata["earthkit-data"], "time")
          return 24 // len(time)
 
      @property
@@ -444,9 +444,9 @@ class Version0_4(Version):
          z = self.zarr
 
          # for backward compatibility
-         if "climetlab" in z.attrs:
-             climetlab_version = z.attrs["climetlab"].get("versions", {}).get("climetlab", "unknown")
-             print(f"climetlab version used to create this zarr: {climetlab_version}. Not supported.")
+         if "earthkit-data" in z.attrs:
+             ekd_version = z.attrs["earthkit-data"].get("versions", {}).get("earthkit-data", "unknown")
+             print(f"earthkit-data version used to create this zarr: {ekd_version}. Not supported.")
              return
 
          version = z.attrs.get("version")
@@ -455,7 +455,7 @@ class Version0_4(Version):
              print(" Cannot find metadata information about versions.")
          else:
              print(f"Zarr format (version {version})", end="")
-             print(f" created by climetlab={versions.pop('climetlab')}", end="")
+             print(f" created by earthkit-data={versions.pop('earthkit-data')}", end="")
          timestamp = z.attrs.get("creation_timestamp")
          timestamp = datetime.datetime.fromisoformat(timestamp)
          print(f" on {timestamp}", end="")
anemoi/datasets/commands/scan.py
@@ -3,7 +3,7 @@ import os
  import sys
  from collections import defaultdict
 
- import climetlab as cml
+ import earthkit.data as ekd
  import tqdm
  import yaml
 
@@ -50,9 +50,9 @@ class Scan(Command):
          for path in tqdm.tqdm(paths, leave=False):
              if not match(path):
                  continue
-             for field in tqdm.tqdm(cml.load_source("file", path), leave=False):
-                 dates.add(field.valid_datetime())
-                 mars = field.as_mars()
+             for field in tqdm.tqdm(ekd.from_source("file", path), leave=False):
+                 dates.add(field.datetime()["valid_time"])
+                 mars = field.metadata(namespace="mars")
                  keys = tuple(mars.get(k) for k in KEYS)
                  gribs[keys].add(path)
                  for k, v in mars.items():
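Note: the climetlab → earthkit.data migration above is one-to-one. A minimal sketch of the new calls (the GRIB path is hypothetical):

import earthkit.data as ekd

for field in ekd.from_source("file", "data.grib"):
    valid = field.datetime()["valid_time"]   # replaces field.valid_datetime()
    mars = field.metadata(namespace="mars")  # replaces field.as_mars()
    print(valid, mars.get("param"))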
anemoi/datasets/compute/recentre.py
@@ -10,10 +10,10 @@
  import logging
 
  import numpy as np
- from climetlab.core.temporary import temp_file
- from climetlab.readers.grib.output import new_grib_output
+ from earthkit.data.core.temporary import temp_file
+ from earthkit.data.readers.grib.output import new_grib_output
 
- from anemoi.datasets.create.functions import assert_is_fieldset
+ from anemoi.datasets.create.functions import assert_is_fieldlist
 
  LOG = logging.getLogger(__name__)
 
@@ -96,7 +96,7 @@ def recentre(
 
      for i, centre_field in enumerate(centre):
          param = centre_field.metadata("param")
-         centre_field_as_mars = centre_field.as_mars()
+         centre_field_as_mars = centre_field.metadata(namespace="mars")
 
          # load the centre field
          centre_np = centre_field.to_numpy()
@@ -106,8 +106,13 @@ def recentre(
 
          for j in range(n_numbers):
              ensemble_field = members[i * n_numbers + j]
-             ensemble_field_as_mars = ensemble_field.as_mars()
-             check_compatible(centre_field, ensemble_field, centre_field_as_mars, ensemble_field_as_mars)
+             ensemble_field_as_mars = ensemble_field.metadata(namespace="mars")
+             check_compatible(
+                 centre_field,
+                 ensemble_field,
+                 centre_field_as_mars,
+                 ensemble_field_as_mars,
+             )
              members_np[j] = ensemble_field.to_numpy()
 
          ensemble_field_as_mars = tuple(sorted(ensemble_field_as_mars.items()))
@@ -149,10 +154,10 @@ def recentre(
      if output is not None:
          return path
 
-     from climetlab import load_source
+     from earthkit.data import from_source
 
-     ds = load_source("file", path)
-     assert_is_fieldset(ds)
+     ds = from_source("file", path)
+     assert_is_fieldlist(ds)
      # save a reference to the tmp file so it is deleted
      # only when the dataset is not used anymore
      ds._tmp = tmp
anemoi/datasets/create/__init__.py
@@ -7,8 +7,15 @@
  # nor does it submit to any jurisdiction.
  #
 
+ import logging
  import os
 
+ LOG = logging.getLogger(__name__)
+
+
+ def _ignore(*args, **kwargs):
+     pass
+
 
  class Creator:
      def __init__(
@@ -16,19 +23,21 @@ class Creator:
          path,
          config=None,
          cache=None,
-         print=print,
+         use_threads=False,
          statistics_tmp=None,
          overwrite=False,
         test=None,
+         progress=None,
          **kwargs,
      ):
          self.path = path  # Output path
          self.config = config
          self.cache = cache
-         self.print = print
+         self.use_threads = use_threads
          self.statistics_tmp = statistics_tmp
          self.overwrite = overwrite
          self.test = test
+         self.progress = progress if progress is not None else _ignore
 
      def init(self, check_name=False):
          # check path
@@ -44,10 +53,11 @@ class Creator:
              path=self.path,
              config=self.config,
              statistics_tmp=self.statistics_tmp,
-             print=self.print,
+             use_threads=self.use_threads,
+             progress=self.progress,
              test=self.test,
          )
-         obj.initialise(check_name=check_name)
+         return obj.initialise(check_name=check_name)
 
      def load(self, parts=None):
          from .loaders import ContentLoader
@@ -56,7 +66,8 @@ class Creator:
          loader = ContentLoader.from_dataset_config(
              path=self.path,
              statistics_tmp=self.statistics_tmp,
-             print=self.print,
+             use_threads=self.use_threads,
+             progress=self.progress,
              parts=parts,
          )
          loader.load()
@@ -66,7 +77,8 @@ class Creator:
 
          loader = StatisticsAdder.from_dataset(
              path=self.path,
-             print=self.print,
+             use_threads=self.use_threads,
+             progress=self.progress,
              statistics_tmp=self.statistics_tmp,
              statistics_output=output,
              recompute=False,
@@ -74,20 +86,22 @@ class Creator:
              statistics_end=end,
          )
          loader.run()
+         assert loader.ready()
 
      def size(self):
          from .loaders import DatasetHandler
          from .size import compute_directory_sizes
 
          metadata = compute_directory_sizes(self.path)
-         handle = DatasetHandler.from_dataset(path=self.path, print=self.print)
+         handle = DatasetHandler.from_dataset(path=self.path, use_threads=self.use_threads)
          handle.update_metadata(**metadata)
+         assert handle.ready()
 
      def cleanup(self):
          from .loaders import DatasetHandlerWithStatistics
 
          cleaner = DatasetHandlerWithStatistics.from_dataset(
-             path=self.path, print=self.print, statistics_tmp=self.statistics_tmp
+             path=self.path, use_threads=self.use_threads, progress=self.progress, statistics_tmp=self.statistics_tmp
          )
          cleaner.tmp_statistics.delete()
          cleaner.registry.clean()
@@ -103,15 +117,17 @@ class Creator:
          from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
          if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
              a.initialise()
 
          for d in delta:
              try:
-                 a = TendenciesStatisticsAddition.from_dataset(path=self.path, print=self.print, delta=d)
+                 a = TendenciesStatisticsAddition.from_dataset(
+                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
+                 )
                  a.initialise()
              except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 self.print(f"Skipping delta={d} as it is not a multiple of the frequency.")
+                 LOG.info(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
      def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24], statistics=True):
          from .loaders import StatisticsAddition
@@ -119,15 +135,17 @@ class Creator:
          from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
          if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
              a.run(parts)
 
          for d in delta:
             try:
-                 a = TendenciesStatisticsAddition.from_dataset(path=self.path, print=self.print, delta=d)
+                 a = TendenciesStatisticsAddition.from_dataset(
+                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
+                 )
                  a.run(parts)
              except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 self.print(f"Skipping delta={d} as it is not a multiple of the frequency.")
+                 LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
      def finalise_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
          from .loaders import StatisticsAddition
@@ -135,15 +153,17 @@ class Creator:
          from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
          if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
              a.finalise()
 
          for d in delta:
              try:
-                 a = TendenciesStatisticsAddition.from_dataset(path=self.path, print=self.print, delta=d)
+                 a = TendenciesStatisticsAddition.from_dataset(
+                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
+                 )
                  a.finalise()
              except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 self.print(f"Skipping delta={d} as it is not a multiple of the frequency.")
+                 LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
      def finalise(self, **kwargs):
          self.statistics(**kwargs)
@@ -174,3 +194,10 @@ class Creator:
              return True
          except zarr.errors.PathNotFoundError:
              return False
+
+     def verify(self):
+         from .loaders import DatasetVerifier
+
+         handle = DatasetVerifier.from_dataset(path=self.path, use_threads=self.use_threads)
+
+         handle.verify()
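Note: Creator no longer takes a print function; a progress callable can be passed instead (it defaults to the no-op _ignore above). A sketch with a hypothetical callback, since the callback signature is not pinned down by this diff:

from anemoi.datasets.create import Creator

def report(*args, **kwargs):
    # Hypothetical progress hook; Creator substitutes _ignore when None.
    print("progress:", args, kwargs)

c = Creator(path="dataset.zarr", config="recipe.yaml", progress=report)
c.create()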
anemoi/datasets/create/check.py
@@ -56,7 +56,7 @@ class DatasetName:
          raise ValueError(self.error_message)
 
      def _parse(self, name):
-         pattern = r"^(\w+)-([\w-]+)-(\w+)-(\w+)-(\d\d\d\d)-(\d\d\d\d)-(\d+h)-v(\d+)-?(.*)$"
+         pattern = r"^(\w+)-([\w-]+)-(\w+)-(\w+)-(\d\d\d\d)-(\d\d\d\d)-(\d+h)-v(\d+)-?([a-zA-Z0-9-]+)$"
          match = re.match(pattern, name)
 
          assert match, (name, pattern)
@@ -136,18 +136,19 @@ class StatisticsValueError(ValueError):
      pass
 
 
- def check_data_values(arr, *, name: str, log=[], allow_nan=False):
-     if allow_nan is False:
-         allow_nan = lambda x: False  # noqa: E731
+ def check_data_values(arr, *, name: str, log=[], allow_nans=False):
 
-     if allow_nan(name):
+     if (isinstance(allow_nans, (set, list, tuple, dict)) and name in allow_nans) or allow_nans:
          arr = arr[~np.isnan(arr)]
 
+     assert arr.size > 0, (name, *log)
+
      min, max = arr.min(), arr.max()
      assert not (np.isnan(arr).any()), (name, min, max, *log)
 
      if min == 9999.0:
          warnings.warn(f"Min value 9999 for {name}")
+
      if max == 9999.0:
          warnings.warn(f"Max value 9999 for {name}")
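Note: the allow_nan callable is gone; allow_nans now accepts a truthy flag or a collection of variable names whose NaNs are stripped before the checks. A short sketch of the new contract:

import numpy as np

from anemoi.datasets.create.check import check_data_values

arr = np.array([1.0, np.nan, 3.0])

# NaNs are dropped first because "2t" is listed in allow_nans.
check_data_values(arr, name="2t", allow_nans={"2t"})

# With allow_nans=False the NaN survives and the NaN assertion fails:
# check_data_values(arr, name="2t", allow_nans=False)  # AssertionError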
anemoi/datasets/create/chunks.py
@@ -57,7 +57,7 @@ class ChunkFilter:
          if not parts:
              warnings.warn(f"Nothing to do for chunk {i}/{n}.")
 
-         LOG.info(f"Running parts: {parts}")
+         LOG.debug(f"Running parts: {parts}")
 
          self.allowed = parts
anemoi/datasets/create/config.py
@@ -12,9 +12,9 @@ import os
  from copy import deepcopy
 
  import yaml
- from climetlab.core.order import normalize_order_by
-
- from .utils import load_json_or_yaml
+ from anemoi.utils.config import DotDict
+ from anemoi.utils.config import load_any_dict_format
+ from earthkit.data.core.order import normalize_order_by
 
  LOG = logging.getLogger(__name__)
 
@@ -43,31 +43,10 @@ def check_dict_value_and_set(dic, key, value):
          if dic[key] == value:
              return
          raise ValueError(f"Cannot use {key}={dic[key]}. Must use {value}.")
-     print(f"Setting {key}={value} in config")
+     LOG.info(f"Setting {key}={value} in config")
      dic[key] = value
 
 
- class DictObj(dict):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         for key, value in self.items():
-             if isinstance(value, dict):
-                 self[key] = DictObj(value)
-                 continue
-             if isinstance(value, list):
-                 self[key] = [DictObj(item) if isinstance(item, dict) else item for item in value]
-                 continue
-
-     def __getattr__(self, attr):
-         try:
-             return self[attr]
-         except KeyError:
-             raise AttributeError(attr)
-
-     def __setattr__(self, attr, value):
-         self[attr] = value
-
-
  def resolve_includes(config):
      if isinstance(config, list):
          return [resolve_includes(c) for c in config]
@@ -79,11 +58,11 @@ def resolve_includes(config):
      return config
 
 
- class Config(DictObj):
+ class Config(DotDict):
      def __init__(self, config=None, **kwargs):
          if isinstance(config, str):
              self.config_path = os.path.realpath(config)
-             config = load_json_or_yaml(config)
+             config = load_any_dict_format(config)
          else:
              config = deepcopy(config if config is not None else {})
          config = resolve_includes(config)
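Note: the hand-rolled DictObj is replaced by anemoi.utils.config.DotDict, which provides the same recursive attribute-style access. A minimal sketch, assuming DotDict mirrors the dropped DictObj behaviour:

from anemoi.utils.config import DotDict

cfg = DotDict({"dataset": {"frequency": "6h", "resolution": "o96"}})
assert cfg.dataset.frequency == "6h"  # nested dicts are wrapped too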
anemoi/datasets/create/functions/__init__.py
@@ -13,10 +13,10 @@ import importlib
 
  import entrypoints
 
 
- def assert_is_fieldset(obj):
-     from climetlab.readers.grib.index import FieldSet
+ def assert_is_fieldlist(obj):
+     from earthkit.data.indexing.fieldlist import FieldList
 
-     assert isinstance(obj, FieldSet), type(obj)
+     assert isinstance(obj, FieldList), type(obj)
 
 
  def import_function(name, kind):
anemoi/datasets/create/functions/filters/empty.py
@@ -7,10 +7,10 @@
  # nor does it submit to any jurisdiction.
  #
 
- import climetlab as cml
+ import earthkit.data as ekd
 
 
  def execute(context, input, **kwargs):
-     # Usefull to create a pipeline that returns an empty result
-     # So we can reference an earlier step in a function like 'contants'
-     return cml.load_source("empty")
+     # Useful to create a pipeline that returns an empty result
+     # So we can reference an earlier step in a function like 'constants'
+     return ekd.from_source("empty")
anemoi/datasets/create/functions/filters/rename.py
@@ -9,7 +9,7 @@
 
  import re
 
- from climetlab.indexing.fieldset import FieldArray
+ from earthkit.data.indexing.fieldlist import FieldArray
 
 
  class RenamedFieldMapping:
@@ -26,15 +26,23 @@ class RenamedFieldMapping:
          self.what = what
          self.renaming = renaming
 
-     def metadata(self, key):
-         value = self.field.metadata(key)
+     def metadata(self, key=None, **kwargs):
+         if key is None:
+             return self.field.metadata(**kwargs)
+
+         value = self.field.metadata(key, **kwargs)
          if key == self.what:
              return self.renaming.get(value, value)
+
          return value
 
      def __getattr__(self, name):
          return getattr(self.field, name)
 
+     def __repr__(self) -> str:
+         return repr(self.field)
+
 
  class RenamedFieldFormat:
      """Rename a field based on a format string.
@@ -48,10 +56,10 @@ class RenamedFieldFormat:
          self.format = format
          self.bits = re.findall(r"{(\w+)}", format)
 
-     def metadata(self, key):
-         value = self.field.metadata(key)
+     def metadata(self, key, **kwargs):
+         value = self.field.metadata(key, **kwargs)
          if "{" + key + "}" in self.format:
-             bits = {b: self.field.metadata(b) for b in self.bits}
+             bits = {b: self.field.metadata(b, **kwargs) for b in self.bits}
              return self.format.format(**bits)
          return value
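Note: both rename wrappers now forward keyword arguments to the wrapped field's metadata(). A sketch of the format-based rename, using a stand-in field and assuming the constructor is RenamedFieldFormat(field, format):

from anemoi.datasets.create.functions.filters.rename import RenamedFieldFormat

class FakeField:
    # Hypothetical stand-in for an earthkit.data field.
    def metadata(self, key, **kwargs):
        return {"param": "t", "levelist": 850}[key]

renamed = RenamedFieldFormat(FakeField(), "{param}_{levelist}")
print(renamed.metadata("param"))  # -> "t_850": keys used in the format are renamed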