anemoi-datasets 0.3.10__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- anemoi/datasets/_version.py +2 -2
- anemoi/datasets/commands/compare.py +59 -0
- anemoi/datasets/commands/create.py +84 -3
- anemoi/datasets/commands/inspect.py +9 -9
- anemoi/datasets/commands/scan.py +4 -4
- anemoi/datasets/compute/recentre.py +14 -9
- anemoi/datasets/create/__init__.py +44 -17
- anemoi/datasets/create/check.py +6 -5
- anemoi/datasets/create/chunks.py +1 -1
- anemoi/datasets/create/config.py +6 -27
- anemoi/datasets/create/functions/__init__.py +3 -3
- anemoi/datasets/create/functions/filters/empty.py +4 -4
- anemoi/datasets/create/functions/filters/rename.py +14 -6
- anemoi/datasets/create/functions/filters/rotate_winds.py +16 -60
- anemoi/datasets/create/functions/filters/unrotate_winds.py +14 -64
- anemoi/datasets/create/functions/sources/__init__.py +39 -0
- anemoi/datasets/create/functions/sources/accumulations.py +38 -56
- anemoi/datasets/create/functions/sources/constants.py +11 -4
- anemoi/datasets/create/functions/sources/empty.py +2 -2
- anemoi/datasets/create/functions/sources/forcings.py +3 -3
- anemoi/datasets/create/functions/sources/grib.py +8 -4
- anemoi/datasets/create/functions/sources/hindcasts.py +32 -364
- anemoi/datasets/create/functions/sources/mars.py +57 -26
- anemoi/datasets/create/functions/sources/netcdf.py +2 -60
- anemoi/datasets/create/functions/sources/opendap.py +3 -2
- anemoi/datasets/create/functions/sources/source.py +3 -3
- anemoi/datasets/create/functions/sources/tendencies.py +7 -7
- anemoi/datasets/create/functions/sources/xarray/__init__.py +73 -0
- anemoi/datasets/create/functions/sources/xarray/coordinates.py +234 -0
- anemoi/datasets/create/functions/sources/xarray/field.py +109 -0
- anemoi/datasets/create/functions/sources/xarray/fieldlist.py +171 -0
- anemoi/datasets/create/functions/sources/xarray/flavour.py +330 -0
- anemoi/datasets/create/functions/sources/xarray/grid.py +46 -0
- anemoi/datasets/create/functions/sources/xarray/metadata.py +161 -0
- anemoi/datasets/create/functions/sources/xarray/time.py +98 -0
- anemoi/datasets/create/functions/sources/xarray/variable.py +198 -0
- anemoi/datasets/create/functions/sources/xarray_kerchunk.py +42 -0
- anemoi/datasets/create/functions/sources/xarray_zarr.py +15 -0
- anemoi/datasets/create/functions/sources/zenodo.py +40 -0
- anemoi/datasets/create/input.py +309 -191
- anemoi/datasets/create/loaders.py +155 -77
- anemoi/datasets/create/patch.py +17 -14
- anemoi/datasets/create/persistent.py +1 -1
- anemoi/datasets/create/size.py +4 -5
- anemoi/datasets/create/statistics/__init__.py +51 -17
- anemoi/datasets/create/template.py +11 -61
- anemoi/datasets/create/trace.py +91 -0
- anemoi/datasets/create/utils.py +5 -52
- anemoi/datasets/create/zarr.py +24 -10
- anemoi/datasets/data/dataset.py +4 -4
- anemoi/datasets/data/misc.py +9 -37
- anemoi/datasets/data/stores.py +37 -14
- anemoi/datasets/dates/__init__.py +7 -1
- anemoi/datasets/dates/groups.py +3 -0
- {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/METADATA +24 -8
- anemoi_datasets-0.4.2.dist-info/RECORD +86 -0
- {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/WHEEL +1 -1
- anemoi_datasets-0.3.10.dist-info/RECORD +0 -73
- {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/LICENSE +0 -0
- {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/entry_points.txt +0 -0
- {anemoi_datasets-0.3.10.dist-info → anemoi_datasets-0.4.2.dist-info}/top_level.txt +0 -0
anemoi/datasets/_version.py
CHANGED

anemoi/datasets/commands/compare.py
CHANGED

@@ -8,6 +8,10 @@
 # nor does it submit to any jurisdiction.
 #
 
+import numpy as np
+import tqdm
+import zarr
+
 from anemoi.datasets import open_dataset
 
 from . import Command

@@ -19,6 +23,8 @@ class Compare(Command):
     def add_arguments(self, command_parser):
         command_parser.add_argument("dataset1")
         command_parser.add_argument("dataset2")
+        command_parser.add_argument("--data", action="store_true", help="Compare the data.")
+        command_parser.add_argument("--statistics", action="store_true", help="Compare the statistics.")
 
     def run(self, args):
         ds1 = open_dataset(args.dataset1)

@@ -42,5 +48,58 @@
                 f"{ds2.statistics['mean'][ds2.name_to_index[v]]:14g}",
             )
 
+        if args.data:
+            print()
+            print("Data:")
+            print("-----")
+            print()
+
+            diff = 0
+            for a, b in tqdm.tqdm(zip(ds1, ds2)):
+                if not np.array_equal(a, b, equal_nan=True):
+                    diff += 1
+
+            print(f"Number of different rows: {diff}/{len(ds1)}")
+
+        if args.data:
+            print()
+            print("Data 2:")
+            print("-----")
+            print()
+
+            ds1 = zarr.open(args.dataset1, mode="r")
+            ds2 = zarr.open(args.dataset2, mode="r")
+
+            for name in (
+                "data",
+                "count",
+                "sums",
+                "squares",
+                "mean",
+                "stdev",
+                "minimum",
+                "maximum",
+                "latitudes",
+                "longitudes",
+            ):
+                a1 = ds1[name]
+                a2 = ds2[name]
+
+                if len(a1) != len(a2):
+                    print(f"{name}: lengths mismatch {len(a1)} != {len(a2)}")
+                    continue
+
+                diff = 0
+                for a, b in tqdm.tqdm(zip(a1, a2), leave=False):
+                    if not np.array_equal(a, b, equal_nan=True):
+                        if diff == 0:
+                            print(f"\n{name}: first different row:")
+                            print(a[a != b])
+                            print(b[a != b])
+
+                        diff += 1
+
+                print(f"{name}: {diff} different rows out of {len(a1)}")
+
 
 command = Compare
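The row comparison above hinges on NumPy's `equal_nan` option; a minimal standalone sketch of why that matters (illustrative, not part of the package):

    import numpy as np

    a = np.array([1.0, np.nan, 3.0])
    b = np.array([1.0, np.nan, 3.0])

    # equal_nan=True treats NaNs at the same positions as equal, so rows
    # that differ only in missing values are not counted as different.
    assert np.array_equal(a, b, equal_nan=True)
    assert not np.array_equal(a, b)  # plain comparison: NaN != NaN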
anemoi/datasets/commands/create.py
CHANGED

@@ -1,7 +1,39 @@
-…
+import datetime
+import logging
+import time
+from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import as_completed
+
+import tqdm
+from anemoi.utils.humanize import seconds_to_human
+
+from anemoi.datasets.create.trace import enable_trace
 
 from . import Command
 
+LOG = logging.getLogger(__name__)
+
+
+def task(what, options, *args, **kwargs):
+    """
+    Make sure `import Creator` is done in the sub-processes, and not in the main one.
+    """
+
+    now = datetime.datetime.now()
+    LOG.debug(f"Task {what}({args},{kwargs}) starting")
+
+    from anemoi.datasets.create import Creator
+
+    if "trace" in options:
+        enable_trace(options["trace"])
+
+    c = Creator(**options)
+    result = getattr(c, what)(*args, **kwargs)
+
+    LOG.debug(f"Task {what}({args},{kwargs}) completed ({datetime.datetime.now()-now})")
+    return result
+
 
 class Create(Command):
     """Create a dataset."""

@@ -22,12 +54,61 @@ class Create(Command):
         )
         command_parser.add_argument("config", help="Configuration yaml file defining the recipe to create the dataset.")
         command_parser.add_argument("path", help="Path to store the created data.")
+        group = command_parser.add_mutually_exclusive_group()
+        group.add_argument("--threads", help="Use `n` parallel thread workers.", type=int, default=0)
+        group.add_argument("--processes", help="Use `n` parallel process workers.", type=int, default=0)
+        command_parser.add_argument("--trace", action="store_true")
 
     def run(self, args):
-…
+        now = time.time()
+        if args.threads + args.processes:
+            self.parallel_create(args)
+        else:
+            self.serial_create(args)
+        LOG.info(f"Create completed in {seconds_to_human(time.time()-now)}")
 
-…
+    def serial_create(self, args):
+        from anemoi.datasets.create import Creator
+
+        options = vars(args)
+        c = Creator(**options)
         c.create()
 
+    def parallel_create(self, args):
+        """Some modules, like fsspec, do not work well with fork().
+        Other modules may not be thread safe. So we implement
+        parallel loading using multiprocessing before any
+        of the modules are imported.
+        """
+
+        options = vars(args)
+        parallel = args.threads + args.processes
+        args.use_threads = args.threads > 0
+
+        if args.use_threads:
+            ExecutorClass = ThreadPoolExecutor
+        else:
+            ExecutorClass = ProcessPoolExecutor
+
+        with ExecutorClass(max_workers=1) as executor:
+            total = executor.submit(task, "init", options).result()
+
+        futures = []
+
+        with ExecutorClass(max_workers=parallel) as executor:
+            for n in range(total):
+                futures.append(executor.submit(task, "load", options, parts=f"{n+1}/{total}"))
+
+            for future in tqdm.tqdm(
+                as_completed(futures), desc="Loading", total=len(futures), colour="green", position=parallel + 1
+            ):
+                future.result()
+
+        with ExecutorClass(max_workers=1) as executor:
+            executor.submit(task, "statistics", options).result()
+            executor.submit(task, "additions", options).result()
+            executor.submit(task, "cleanup", options).result()
+            executor.submit(task, "verify", options).result()
+
 
 command = Create
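The `task` wrapper exists so that heavyweight, fork-unfriendly imports happen inside the worker processes rather than in the parent. A minimal standalone sketch of that pattern (names here are illustrative, not from the package):

    from concurrent.futures import ProcessPoolExecutor


    def worker(n):
        # The import runs in the child process only, so the parent never
        # loads modules that misbehave after fork().
        import math

        return math.sqrt(n)


    if __name__ == "__main__":
        with ProcessPoolExecutor(max_workers=2) as executor:
            futures = [executor.submit(worker, n) for n in range(4)]
            print([f.result() for f in futures])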
anemoi/datasets/commands/inspect.py
CHANGED

@@ -16,7 +16,7 @@ import numpy as np
 import semantic_version
 import tqdm
 from anemoi.utils.humanize import bytes
-from anemoi.utils.humanize import …
+from anemoi.utils.humanize import bytes_to_human
 from anemoi.utils.humanize import when
 from anemoi.utils.text import dotted_line
 from anemoi.utils.text import progress

@@ -215,9 +215,9 @@ class Version:
         total_size, n = compute_directory_size(self.path)
 
         if total_size is not None:
-            print(f"💽 Size : {bytes(total_size)} ({…
+            print(f"💽 Size : {bytes(total_size)} ({bytes_to_human(total_size)})")
         if n is not None:
-            print(f"📁 Files : {…
+            print(f"📁 Files : {n:,}")
 
     @property
     def statistics(self):

@@ -382,7 +382,7 @@ class NoVersion(Version):
     @property
     def last_date(self):
         monthly = find(self.metadata, "monthly")
-        time = max([int(t) for t in find(self.metadata["…
+        time = max([int(t) for t in find(self.metadata["earthkit-data"], "time")])
         assert isinstance(time, int), (time, type(time))
         if time > 100:
             time = time // 100

@@ -390,7 +390,7 @@
 
     @property
     def frequency(self):
-        time = find(self.metadata["…
+        time = find(self.metadata["earthkit-data"], "time")
         return 24 // len(time)
 
     @property

@@ -444,9 +444,9 @@ class Version0_4(Version):
         z = self.zarr
 
         # for backward compatibility
-        if "…
-…
-        print(f"…
+        if "earthkit-data" in z.attrs:
+            ekd_version = z.attrs["earthkit-data"].get("versions", {}).get("earthkit-data", "unknown")
+            print(f"earthkit-data version used to create this zarr: {ekd_version}. Not supported.")
             return
 
         version = z.attrs.get("version")

@@ -455,7 +455,7 @@
             print(" Cannot find metadata information about versions.")
         else:
             print(f"Zarr format (version {version})", end="")
-            print(f" created by …
+            print(f" created by earthkit-data={versions.pop('earthkit-data')}", end="")
             timestamp = z.attrs.get("creation_timestamp")
             timestamp = datetime.datetime.fromisoformat(timestamp)
             print(f" on {timestamp}", end="")
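The inspect command now renders sizes with `bytes_to_human` from anemoi-utils; a quick sketch of the kind of output it produces (the exact formatting depends on the anemoi-utils version installed):

    from anemoi.utils.humanize import bytes_to_human

    # Renders a raw byte count as a human-readable string, e.g. "1.5 GiB"
    print(bytes_to_human(1_600_000_000))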
anemoi/datasets/commands/scan.py
CHANGED

@@ -3,7 +3,7 @@ import os
 import sys
 from collections import defaultdict
 
-import …
+import earthkit.data as ekd
 import tqdm
 import yaml

@@ -50,9 +50,9 @@ class Scan(Command):
         for path in tqdm.tqdm(paths, leave=False):
             if not match(path):
                 continue
-            for field in tqdm.tqdm(…
-                dates.add(field.…
-                mars = field.…
+            for field in tqdm.tqdm(ekd.from_source("file", path), leave=False):
+                dates.add(field.datetime()["valid_time"])
+                mars = field.metadata(namespace="mars")
                 keys = tuple(mars.get(k) for k in KEYS)
                 gribs[keys].add(path)
                 for k, v in mars.items():
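The scan loop uses the earthkit-data field API that this release migrates to; a sketch of the access pattern, with a placeholder GRIB path:

    import earthkit.data as ekd

    fields = ekd.from_source("file", "/path/to/data.grib")  # placeholder path
    for field in fields:
        print(field.datetime()["valid_time"])       # validity datetime of the field
        print(field.metadata(namespace="mars"))     # dict of MARS-namespace keys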
anemoi/datasets/compute/recentre.py
CHANGED

@@ -10,10 +10,10 @@
 import logging
 
 import numpy as np
-from …
-from …
+from earthkit.data.core.temporary import temp_file
+from earthkit.data.readers.grib.output import new_grib_output
 
-from anemoi.datasets.create.functions import …
+from anemoi.datasets.create.functions import assert_is_fieldlist
 
 LOG = logging.getLogger(__name__)

@@ -96,7 +96,7 @@ def recentre(
 
     for i, centre_field in enumerate(centre):
         param = centre_field.metadata("param")
-        centre_field_as_mars = centre_field.…
+        centre_field_as_mars = centre_field.metadata(namespace="mars")
 
         # load the centre field
         centre_np = centre_field.to_numpy()

@@ -106,8 +106,13 @@ def recentre(
 
         for j in range(n_numbers):
             ensemble_field = members[i * n_numbers + j]
-            ensemble_field_as_mars = ensemble_field.…
-            check_compatible(…
+            ensemble_field_as_mars = ensemble_field.metadata(namespace="mars")
+            check_compatible(
+                centre_field,
+                ensemble_field,
+                centre_field_as_mars,
+                ensemble_field_as_mars,
+            )
             members_np[j] = ensemble_field.to_numpy()
 
         ensemble_field_as_mars = tuple(sorted(ensemble_field_as_mars.items()))

@@ -149,10 +154,10 @@ def recentre(
     if output is not None:
         return path
 
-    from …
+    from earthkit.data import from_source
 
-    ds = …
-…
+    ds = from_source("file", path)
+    assert_is_fieldlist(ds)
     # save a reference to the tmp file so it is deleted
     # only when the dataset is not used anymore
     ds._tmp = tmp
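Conceptually, recentring shifts each ensemble member so that the ensemble mean coincides with a new centre field; a minimal NumPy sketch of the idea (illustrative only, not the package's exact code):

    import numpy as np

    members = np.random.rand(4, 10)   # 4 ensemble members, 10 grid points
    centre = np.random.rand(10)       # the new centre field

    # Remove the current ensemble mean and add the centre back in.
    recentred = members - members.mean(axis=0) + centre
    assert np.allclose(recentred.mean(axis=0), centre)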
anemoi/datasets/create/__init__.py
CHANGED

@@ -7,8 +7,15 @@
 # nor does it submit to any jurisdiction.
 #
 
+import logging
 import os
 
+LOG = logging.getLogger(__name__)
+
+
+def _ignore(*args, **kwargs):
+    pass
+
 
 class Creator:
     def __init__(

@@ -16,19 +23,21 @@ class Creator:
         path,
         config=None,
         cache=None,
-…
+        use_threads=False,
         statistics_tmp=None,
         overwrite=False,
         test=None,
+        progress=None,
         **kwargs,
     ):
         self.path = path  # Output path
         self.config = config
         self.cache = cache
-        self.…
+        self.use_threads = use_threads
         self.statistics_tmp = statistics_tmp
         self.overwrite = overwrite
         self.test = test
+        self.progress = progress if progress is not None else _ignore
 
     def init(self, check_name=False):
         # check path

@@ -44,10 +53,11 @@ class Creator:
             path=self.path,
             config=self.config,
             statistics_tmp=self.statistics_tmp,
-…
+            use_threads=self.use_threads,
+            progress=self.progress,
             test=self.test,
         )
-        obj.initialise(check_name=check_name)
+        return obj.initialise(check_name=check_name)
 
     def load(self, parts=None):
         from .loaders import ContentLoader

@@ -56,7 +66,8 @@
         loader = ContentLoader.from_dataset_config(
             path=self.path,
             statistics_tmp=self.statistics_tmp,
-…
+            use_threads=self.use_threads,
+            progress=self.progress,
             parts=parts,
         )
         loader.load()

@@ -66,7 +77,8 @@
 
         loader = StatisticsAdder.from_dataset(
             path=self.path,
-…
+            use_threads=self.use_threads,
+            progress=self.progress,
             statistics_tmp=self.statistics_tmp,
             statistics_output=output,
             recompute=False,

@@ -74,20 +86,22 @@
             statistics_end=end,
         )
         loader.run()
+        assert loader.ready()
 
     def size(self):
        from .loaders import DatasetHandler
        from .size import compute_directory_sizes
 
        metadata = compute_directory_sizes(self.path)
-        handle = DatasetHandler.from_dataset(path=self.path, …
+        handle = DatasetHandler.from_dataset(path=self.path, use_threads=self.use_threads)
        handle.update_metadata(**metadata)
+        assert handle.ready()
 
     def cleanup(self):
         from .loaders import DatasetHandlerWithStatistics
 
         cleaner = DatasetHandlerWithStatistics.from_dataset(
-            path=self.path, …
+            path=self.path, use_threads=self.use_threads, progress=self.progress, statistics_tmp=self.statistics_tmp
         )
         cleaner.tmp_statistics.delete()
         cleaner.registry.clean()

@@ -103,15 +117,17 @@
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
         if statistics:
-            a = StatisticsAddition.from_dataset(path=self.path, …
+            a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
             a.initialise()
 
         for d in delta:
             try:
-                a = TendenciesStatisticsAddition.from_dataset(…
+                a = TendenciesStatisticsAddition.from_dataset(
+                    path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
+                )
                 a.initialise()
             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-…
+                LOG.info(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
     def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition

@@ -119,15 +135,17 @@
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
         if statistics:
-            a = StatisticsAddition.from_dataset(path=self.path, …
+            a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
             a.run(parts)
 
         for d in delta:
             try:
-                a = TendenciesStatisticsAddition.from_dataset(…
+                a = TendenciesStatisticsAddition.from_dataset(
+                    path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
+                )
                 a.run(parts)
             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-…
+                LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
     def finalise_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition

@@ -135,15 +153,17 @@
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
         if statistics:
-            a = StatisticsAddition.from_dataset(path=self.path, …
+            a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
             a.finalise()
 
         for d in delta:
             try:
-                a = TendenciesStatisticsAddition.from_dataset(…
+                a = TendenciesStatisticsAddition.from_dataset(
+                    path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
+                )
                 a.finalise()
             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-…
+                LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
     def finalise(self, **kwargs):
         self.statistics(**kwargs)

@@ -174,3 +194,10 @@
             return True
         except zarr.errors.PathNotFoundError:
             return False
+
+    def verify(self):
+        from .loaders import DatasetVerifier
+
+        handle = DatasetVerifier.from_dataset(path=self.path, use_threads=self.use_threads)
+
+        handle.verify()
anemoi/datasets/create/check.py
CHANGED

@@ -56,7 +56,7 @@ class DatasetName:
         raise ValueError(self.error_message)
 
     def _parse(self, name):
-        pattern = r"^(\w+)-([\w-]+)-(\w+)-(\w+)-(\d\d\d\d)-(\d\d\d\d)-(\d+h)-v(\d+)-?(…
+        pattern = r"^(\w+)-([\w-]+)-(\w+)-(\w+)-(\d\d\d\d)-(\d\d\d\d)-(\d+h)-v(\d+)-?([a-zA-Z0-9-]+)$"
         match = re.match(pattern, name)
 
         assert match, (name, pattern)

@@ -136,18 +136,19 @@ class StatisticsValueError(ValueError):
     pass
 
 
-def check_data_values(arr, *, name: str, log=[], …
-    if allow_nan is False:
-        allow_nan = lambda x: False  # noqa: E731
+def check_data_values(arr, *, name: str, log=[], allow_nans=False):
 
-    if …
+    if (isinstance(allow_nans, (set, list, tuple, dict)) and name in allow_nans) or allow_nans:
         arr = arr[~np.isnan(arr)]
 
+    assert arr.size > 0, (name, *log)
+
     min, max = arr.min(), arr.max()
     assert not (np.isnan(arr).any()), (name, min, max, *log)
 
     if min == 9999.0:
         warnings.warn(f"Min value 9999 for {name}")
+
     if max == 9999.0:
         warnings.warn(f"Max value 9999 for {name}")
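The reworked `allow_nans` argument accepts a boolean or a collection of variable names; a standalone sketch of the presumably intended dispatch (the helper name is ours, not the package's):

    def nans_allowed(name, allow_nans):
        # Collections whitelist specific variables; booleans apply globally.
        if isinstance(allow_nans, (set, list, tuple, dict)):
            return name in allow_nans
        return bool(allow_nans)

    assert nans_allowed("2t", allow_nans={"2t", "tp"})
    assert not nans_allowed("z", allow_nans={"2t", "tp"})
    assert nans_allowed("z", allow_nans=True)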
anemoi/datasets/create/chunks.py
CHANGED

anemoi/datasets/create/config.py
CHANGED

@@ -12,9 +12,9 @@ import os
 from copy import deepcopy
 
 import yaml
-from …
-…
-from .…
+from anemoi.utils.config import DotDict
+from anemoi.utils.config import load_any_dict_format
+from earthkit.data.core.order import normalize_order_by
 
 LOG = logging.getLogger(__name__)

@@ -43,31 +43,10 @@ def check_dict_value_and_set(dic, key, value):
         if dic[key] == value:
             return
         raise ValueError(f"Cannot use {key}={dic[key]}. Must use {value}.")
-…
+    LOG.info(f"Setting {key}={value} in config")
     dic[key] = value
 
 
-class DictObj(dict):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        for key, value in self.items():
-            if isinstance(value, dict):
-                self[key] = DictObj(value)
-                continue
-            if isinstance(value, list):
-                self[key] = [DictObj(item) if isinstance(item, dict) else item for item in value]
-                continue
-
-    def __getattr__(self, attr):
-        try:
-            return self[attr]
-        except KeyError:
-            raise AttributeError(attr)
-
-    def __setattr__(self, attr, value):
-        self[attr] = value
-
-
 def resolve_includes(config):
     if isinstance(config, list):
         return [resolve_includes(c) for c in config]

@@ -79,11 +58,11 @@ def resolve_includes(config):
     return config
 
 
-class Config(…
+class Config(DotDict):
     def __init__(self, config=None, **kwargs):
         if isinstance(config, str):
             self.config_path = os.path.realpath(config)
-            config = …
+            config = load_any_dict_format(config)
         else:
             config = deepcopy(config if config is not None else {})
         config = resolve_includes(config)
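Config now inherits from anemoi-utils' DotDict instead of the deleted local DictObj; a sketch of the attribute-style access it provides (assuming DotDict mirrors the removed class's behaviour):

    from anemoi.utils.config import DotDict

    cfg = DotDict({"dates": {"start": "2020-01-01"}})
    assert cfg.dates.start == "2020-01-01"   # nested dicts readable as attributes
    cfg.dates.end = "2020-12-31"             # attribute writes update the dict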
anemoi/datasets/create/functions/__init__.py
CHANGED

@@ -13,10 +13,10 @@ import importlib
 import entrypoints
 
 
-def …
-    from …
+def assert_is_fieldlist(obj):
+    from earthkit.data.indexing.fieldlist import FieldList
 
-    assert isinstance(obj, …
+    assert isinstance(obj, FieldList), type(obj)
 
 
 def import_function(name, kind):
anemoi/datasets/create/functions/filters/empty.py
CHANGED

@@ -7,10 +7,10 @@
 # nor does it submit to any jurisdiction.
 #
 
-import …
+import earthkit.data as ekd
 
 
 def execute(context, input, **kwargs):
-    # …
-    # So we can reference an earlier step in a function like '…
-    return …
+    # Useful to create a pipeline that returns an empty result
+    # So we can reference an earlier step in a function like 'constants'
+    return ekd.from_source("empty")
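The empty filter returns earthkit-data's built-in "empty" source; a quick sketch of what that yields:

    import earthkit.data as ekd

    fs = ekd.from_source("empty")
    # An empty field list: iterating yields nothing and len() is 0.
    assert len(fs) == 0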
anemoi/datasets/create/functions/filters/rename.py
CHANGED

@@ -9,7 +9,7 @@
 
 import re
 
-from …
+from earthkit.data.indexing.fieldlist import FieldArray
 
 
 class RenamedFieldMapping:

@@ -26,15 +26,23 @@ class RenamedFieldMapping:
         self.what = what
         self.renaming = renaming
 
-    def metadata(self, key):
-…
+    def metadata(self, key=None, **kwargs):
+        if key is None:
+            return self.field.metadata(**kwargs)
+
+        value = self.field.metadata(key, **kwargs)
         if key == self.what:
             return self.renaming.get(value, value)
+
         return value
 
     def __getattr__(self, name):
         return getattr(self.field, name)
 
+    def __repr__(self) -> str:
+        return repr(self.field)
+        return f"{self.field} -> {self.what} -> {self.renaming}"
+
 
 class RenamedFieldFormat:
     """Rename a field based on a format string.

@@ -48,10 +56,10 @@ class RenamedFieldFormat:
         self.format = format
         self.bits = re.findall(r"{(\w+)}", format)
 
-    def metadata(self, key):
-        value = self.field.metadata(key)
+    def metadata(self, key, **kwargs):
+        value = self.field.metadata(key, **kwargs)
         if "{" + key + "}" in self.format:
-            bits = {b: self.field.metadata(b) for b in self.bits}
+            bits = {b: self.field.metadata(b, **kwargs) for b in self.bits}
             return self.format.format(**bits)
         return value
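RenamedFieldMapping is a thin proxy that intercepts one metadata key and delegates everything else; a standalone sketch of the pattern with a stand-in field class (FakeField and Renamed are ours, not the package's):

    class FakeField:
        # Hypothetical stand-in for an earthkit-data field
        def metadata(self, key, **kwargs):
            return {"param": "2t"}[key]

    class Renamed:
        # Same proxy idea as RenamedFieldMapping above.
        def __init__(self, field, what, renaming):
            self.field, self.what, self.renaming = field, what, renaming

        def metadata(self, key=None, **kwargs):
            value = self.field.metadata(key, **kwargs)
            if key == self.what:
                return self.renaming.get(value, value)
            return value

        def __getattr__(self, name):
            return getattr(self.field, name)

    f = Renamed(FakeField(), "param", {"2t": "t2m"})
    assert f.metadata("param") == "t2m"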