anemoi-datasets 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff compares two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registry.
- anemoi/datasets/__init__.py +5 -2
- anemoi/datasets/_version.py +2 -2
- anemoi/datasets/commands/copy.py +87 -29
- anemoi/datasets/create/__init__.py +12 -9
- anemoi/datasets/create/functions/sources/accumulations.py +9 -1
- anemoi/datasets/create/functions/sources/mars.py +74 -0
- anemoi/datasets/create/loaders.py +96 -86
- anemoi/datasets/create/statistics/__init__.py +6 -139
- anemoi/datasets/data/__init__.py +7 -0
- anemoi/datasets/data/dataset.py +3 -0
- anemoi/datasets/data/forwards.py +7 -0
- anemoi/datasets/data/masked.py +6 -2
- anemoi/datasets/data/misc.py +3 -1
- anemoi/datasets/data/select.py +8 -4
- anemoi/datasets/data/statistics.py +4 -0
- anemoi/datasets/data/stores.py +10 -1
- {anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/METADATA +2 -2
- {anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/RECORD +22 -22
- {anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/WHEEL +1 -1
- {anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/LICENSE +0 -0
- {anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/entry_points.txt +0 -0
- {anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/top_level.txt +0 -0
anemoi/datasets/__init__.py
CHANGED
@@ -9,11 +9,14 @@ from ._version import __version__
 from .data import MissingDateError
 from .data import add_dataset_path
 from .data import add_named_dataset
+from .data import list_dataset_names
 from .data import open_dataset

 __all__ = [
-    "
-    "MissingDateError",
+    "__version__",
     "add_dataset_path",
     "add_named_dataset",
+    "list_dataset_names",
+    "MissingDateError",
+    "open_dataset",
 ]
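Note: this release adds list_dataset_names to the top-level API (implemented in anemoi/datasets/data/__init__.py below) and makes __all__ explicit about __version__ and open_dataset. A minimal usage sketch, with a hypothetical dataset path:

    from anemoi.datasets import list_dataset_names

    # Accepts the same arguments as open_dataset(); it walks the resulting
    # dataset tree and collects the basename of every underlying zarr store.
    print(list_dataset_names("/data/era5-o96-2020.zarr"))
    # expected: ['era5-o96-2020']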
anemoi/datasets/_version.py
CHANGED
anemoi/datasets/commands/copy.py
CHANGED
@@ -7,11 +7,14 @@

 import logging
 import os
+import shutil
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed

 import tqdm
+from anemoi.utils.s3 import download
+from anemoi.utils.s3 import upload

 from . import Command

@@ -22,34 +25,63 @@ try:
 except AttributeError:
     isatty = False

-"""

-
+class S3Downloader:
+    def __init__(self, source, target, transfers, overwrite, resume, verbosity, **kwargs):
+        self.source = source
+        self.target = target
+        self.transfers = transfers
+        self.overwrite = overwrite
+        self.resume = resume
+        self.verbosity = verbosity

-
-
-
-
+    def run(self):
+        if self.target == ".":
+            self.target = os.path.basename(self.source)
+
+        if self.overwrite and os.path.exists(self.target):
+            LOG.info(f"Deleting {self.target}")
+            shutil.rmtree(self.target)
+
+        download(
+            self.source + "/" if not self.source.endswith("/") else self.source,
+            self.target,
+            overwrite=self.overwrite,
+            resume=self.resume,
+            verbosity=self.verbosity,
+            threads=self.transfers,
+        )

-Then:

-
-
+class S3Uploader:
+    def __init__(self, source, target, transfers, overwrite, resume, verbosity, **kwargs):
+        self.source = source
+        self.target = target
+        self.transfers = transfers
+        self.overwrite = overwrite
+        self.resume = resume
+        self.verbosity = verbosity

-
-
-
+    def run(self):
+        upload(
+            self.source,
+            self.target,
+            overwrite=self.overwrite,
+            resume=self.resume,
+            verbosity=self.verbosity,
+            threads=self.transfers,
+        )


-class Copier:
-    def __init__(self, source, target, transfers, block_size, overwrite, resume,
+class DefaultCopier:
+    def __init__(self, source, target, transfers, block_size, overwrite, resume, verbosity, nested, rechunk, **kwargs):
         self.source = source
         self.target = target
         self.transfers = transfers
         self.block_size = block_size
         self.overwrite = overwrite
         self.resume = resume
-        self.
+        self.verbosity = verbosity
         self.nested = nested
         self.rechunk = rechunk

@@ -62,7 +94,7 @@ class Copier:
             return zarr.storage.NestedDirectoryStore(path)
         return path

-    def copy_chunk(self, n, m, source, target, _copy,
+    def copy_chunk(self, n, m, source, target, _copy, verbosity):
         if _copy[n:m].all():
             LOG.info(f"Skipping {n} to {m}")
             return None
@@ -82,7 +114,7 @@ class Copier:
             range(n, m),
             desc=f"Copying {n} to {m}",
             leave=False,
-            disable=not isatty and not
+            disable=not isatty and not verbosity,
         ):
             target[i] = source[i]

@@ -107,7 +139,7 @@ class Copier:
         # raise NotImplementedError("Rechunking with multiple transfers is not implemented")
         return chunks

-    def copy_data(self, source, target, _copy,
+    def copy_data(self, source, target, _copy, verbosity):
         LOG.info("Copying data")
         source_data = source["data"]

@@ -121,6 +153,7 @@ class Copier:
                 shape=source_data.shape,
                 chunks=self.data_chunks,
                 dtype=source_data.dtype,
+                fill_value=source_data.fill_value,
             )
         )

@@ -136,7 +169,7 @@ class Copier:
                         source_data,
                         target_data,
                         _copy,
-
+                        verbosity,
                     )
                 )
             n += self.block_size
@@ -151,7 +184,7 @@ class Copier:

         LOG.info("Copied data")

-    def copy_array(self, name, source, target, _copy,
+    def copy_array(self, name, source, target, _copy, verbosity):
         for k, v in source.attrs.items():
             target.attrs[k] = v

@@ -159,14 +192,14 @@ class Copier:
             return

         if name == "data":
-            self.copy_data(source, target, _copy,
+            self.copy_data(source, target, _copy, verbosity)
             return

         LOG.info(f"Copying {name}")
         target[name] = source[name]
         LOG.info(f"Copied {name}")

-    def copy_group(self, source, target, _copy,
+    def copy_group(self, source, target, _copy, verbosity):
         import zarr

         for k, v in source.attrs.items():
@@ -179,7 +212,7 @@ class Copier:
                     source[name],
                     group,
                     _copy,
-
+                    verbosity,
                 )
             else:
                 self.copy_array(
@@ -187,10 +220,10 @@ class Copier:
                     source,
                     target,
                     _copy,
-
+                    verbosity,
                 )

-    def copy(self, source, target,
+    def copy(self, source, target, verbosity):
         import zarr

         if "_copy" not in target:
@@ -201,7 +234,7 @@ class Copier:
         _copy = target["_copy"]
         _copy_np = _copy[:]

-        self.copy_group(source, target, _copy_np,
+        self.copy_group(source, target, _copy_np, verbosity)
         del target["_copy"]

     def run(self):
@@ -260,7 +293,7 @@ class Copier:
         assert target is not None, target

         source = zarr.open(self._store(self.source), mode="r")
-        self.copy(source, target, self.
+        self.copy(source, target, self.verbosity)


 class CopyMixin:
@@ -279,7 +312,10 @@ class CopyMixin:
         )
         command_parser.add_argument("--transfers", type=int, default=8, help="Number of parallel transfers.")
         command_parser.add_argument(
-            "--
+            "--verbosity",
+            type=int,
+            help="Verbosity level. 0 is silent, 1 is normal, 2 is verbose.",
+            default=1,
         )
         command_parser.add_argument("--nested", action="store_true", help="Use ZARR's nested directpry backend.")
         command_parser.add_argument(
@@ -295,7 +331,29 @@ class CopyMixin:
         command_parser.add_argument("target", help="Target location.")

     def run(self, args):
-
+        if args.source == args.target:
+            raise ValueError("Source and target are the same.")
+
+        kwargs = vars(args)
+
+        if args.overwrite and args.resume:
+            raise ValueError("Cannot use --overwrite and --resume together.")
+
+        source_in_s3 = args.source.startswith("s3://")
+        target_in_s3 = args.target.startswith("s3://")
+
+        copier = None
+
+        if args.rechunk or (source_in_s3 and target_in_s3):
+            copier = DefaultCopier(**kwargs)
+        else:
+            if source_in_s3:
+                copier = S3Downloader(**kwargs)
+
+            if target_in_s3:
+                copier = S3Uploader(**kwargs)
+
+        copier.run()


 class Copy(CopyMixin, Command):
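Note on the new dispatch in CopyMixin.run(): a plain S3 transfer (via anemoi.utils.s3) is now used when exactly one side is on S3 and no rechunking is requested; the chunk-by-chunk zarr copier (renamed DefaultCopier) is kept for rechunking and S3-to-S3 copies. A standalone sketch of the rule, not the released code (and note that in the released function a local-to-local copy without --rechunk leaves copier as None):

    def pick_copier(source: str, target: str, rechunk: bool) -> str:
        source_in_s3 = source.startswith("s3://")
        target_in_s3 = target.startswith("s3://")
        if rechunk or (source_in_s3 and target_in_s3):
            return "DefaultCopier"  # chunk-by-chunk zarr copy
        if source_in_s3:
            return "S3Downloader"   # bulk anemoi.utils.s3.download
        if target_in_s3:
            return "S3Uploader"     # bulk anemoi.utils.s3.upload
        return "DefaultCopier"      # sketch only; released code leaves copier unset

    assert pick_copier("s3://bucket/ds.zarr", "ds.zarr", rechunk=False) == "S3Downloader"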
anemoi/datasets/create/__init__.py
CHANGED

@@ -97,13 +97,14 @@ class Creator:

         apply_patch(self.path, **kwargs)

-    def init_additions(self, delta=[1, 3, 6, 12, 24]):
+    def init_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition
         from .loaders import TendenciesStatisticsAddition
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency

-
-
+        if statistics:
+            a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+            a.initialise()

         for d in delta:
             try:
@@ -112,13 +113,14 @@ class Creator:
         except TendenciesStatisticsDeltaNotMultipleOfFrequency:
             self.print(f"Skipping delta={d} as it is not a multiple of the frequency.")

-    def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24]):
+    def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition
         from .loaders import TendenciesStatisticsAddition
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency

-
-
+        if statistics:
+            a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+            a.run(parts)

         for d in delta:
             try:
@@ -127,13 +129,14 @@ class Creator:
         except TendenciesStatisticsDeltaNotMultipleOfFrequency:
             self.print(f"Skipping delta={d} as it is not a multiple of the frequency.")

-    def finalise_additions(self, delta=[1, 3, 6, 12, 24]):
+    def finalise_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition
         from .loaders import TendenciesStatisticsAddition
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency

-
-
+        if statistics:
+            a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+            a.finalise()

         for d in delta:
             try:
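Note: the new statistics=True keyword lets callers run the tendency (delta) additions without recomputing the base StatisticsAddition. A hedged sketch of the intended call pattern; the Creator constructor arguments are assumed, not shown in this diff:

    from anemoi.datasets.create import Creator

    c = Creator(path="dataset.zarr")  # assumed signature; path is hypothetical
    c.init_additions(delta=[1, 3], statistics=False)      # skip StatisticsAddition
    c.run_additions(delta=[1, 3], statistics=False)
    c.finalise_additions(delta=[1, 3], statistics=False)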
anemoi/datasets/create/functions/sources/accumulations.py
CHANGED

@@ -19,6 +19,8 @@ from climetlab.utils.availability import Availability

 from anemoi.datasets.create.utils import to_datetime_list

+from .mars import use_grib_paramid
+
 LOG = logging.getLogger(__name__)


@@ -85,6 +87,7 @@ class Accumulation:
             stepType="accum",
             startStep=self.startStep,
             endStep=self.endStep,
+            check_nans=True,
         )
         self.values = None
         self.done = True
@@ -230,6 +233,7 @@ def identity(x):


 def compute_accumulations(
+    context,
     dates,
     request,
     user_accumulation_period=6,
@@ -306,7 +310,10 @@ def compute_accumulations(
     ds = cml.load_source("empty")
     for r in compressed.iterate():
         request.update(r)
+        if context.use_grib_paramid and "param" in request:
+            request = use_grib_paramid(request)
         print("🌧️", request)
+
         ds = ds + cml.load_source("mars", **request)

     accumulations = {}
@@ -395,7 +402,7 @@ def accumulations(context, dates, **request):
     class_ = request.get("class", "od")
     stream = request.get("stream", "oper")

-    user_accumulation_period = request.
+    user_accumulation_period = request.pop("accumulation_period", 6)

     KWARGS = {
         ("od", "oper"): dict(patch=scda),
@@ -409,6 +416,7 @@ def accumulations(context, dates, **request):
     context.trace("🌧️", f"accumulations {request} {user_accumulation_period} {kwargs}")

     return compute_accumulations(
+        context,
         dates,
         request,
         user_accumulation_period=user_accumulation_period,
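Note on the switch to request.pop("accumulation_period", 6): "accumulation_period" is an anemoi-level option, not a MARS keyword, so it must be removed from the request before the request is forwarded to MARS, especially now that mars() rejects unknown keys (see MARS_KEYS below). A small illustration with a made-up request:

    request = {"param": "tp", "levtype": "sfc", "accumulation_period": 12}
    user_accumulation_period = request.pop("accumulation_period", 6)
    assert user_accumulation_period == 12
    assert "accumulation_period" not in request  # safe to send to MARS now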
anemoi/datasets/create/functions/sources/mars.py
CHANGED

@@ -9,6 +9,7 @@
 import datetime
 from copy import deepcopy

+from anemoi.utils.humanize import did_you_mean
 from climetlab import load_source
 from climetlab.utils.availability import Availability

@@ -102,6 +103,74 @@ def use_grib_paramid(r):
     return r


+MARS_KEYS = [
+    "accuracy",
+    "activity",
+    "anoffset",
+    "area",
+    "bitmap",
+    "channel",
+    "class",
+    "database",
+    "dataset",
+    "date",
+    "diagnostic",
+    "direction",
+    "domain",
+    "expect",
+    "experiment",
+    "expver",
+    "fcmonth",
+    "fcperiod",
+    "fieldset",
+    "filter",
+    "format",
+    "frame",
+    "frequency",
+    "gaussian",
+    "generation",
+    "grid",
+    "hdate",
+    "ident",
+    "instrument",
+    "interpolation",
+    "intgrid",
+    "iteration",
+    "level",
+    "levelist",
+    "levtype",
+    "method",
+    "model",
+    "month",
+    "number",
+    "obsgroup",
+    "obstype",
+    "offsetdate",
+    "offsettime",
+    "optimise",
+    "origin",
+    "packing",
+    "padding",
+    "param",
+    "quantile",
+    "realization",
+    "reference",
+    "reportype",
+    "repres",
+    "resol",
+    "resolution",
+    "rotation",
+    "step",
+    "stream",
+    "system",
+    "target",
+    "time",
+    "truncation",
+    "type",
+    "year",
+]
+
+
 def mars(context, dates, *requests, date_key="date", **kwargs):
     if not requests:
         requests = [kwargs]
@@ -117,6 +186,11 @@ def mars(context, dates, *requests, date_key="date", **kwargs):
         if DEBUG:
             context.trace("✅", f"load_source(mars, {r}")

+        for k, v in r.items():
+            if k not in MARS_KEYS:
+                raise ValueError(
+                    f"⚠️ Unknown key {k}={v} in MARS request. Did you mean '{did_you_mean(k, MARS_KEYS)}' ?"
+                )
         ds = ds + load_source("mars", **r)
     return ds
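Note: MARS requests are now validated against the MARS_KEYS whitelist, so a misspelled key fails fast with a suggestion instead of a cryptic MARS error. A runnable illustration of the check, using an excerpt of the key list (did_you_mean is the helper imported at the top of mars.py):

    from anemoi.utils.humanize import did_you_mean

    MARS_KEYS = ["class", "date", "expver", "levtype", "param", "step", "time"]  # excerpt

    request = {"param": "2t", "levtyp": "sfc"}  # note the typo in "levtype"
    try:
        for k, v in request.items():
            if k not in MARS_KEYS:
                raise ValueError(f"Unknown key {k}={v} in MARS request. Did you mean '{did_you_mean(k, MARS_KEYS)}' ?")
    except ValueError as e:
        print(e)  # expected to suggest 'levtype'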
anemoi/datasets/create/loaders.py
CHANGED

@@ -46,8 +46,44 @@ LOG = logging.getLogger(__name__)
 VERSION = "0.20"


+def set_to_test_mode(cfg):
+    NUMBER_OF_DATES = 4
+
+    dates = cfg.dates
+    LOG.warn(f"Running in test mode. Changing the list of dates to use only {NUMBER_OF_DATES}.")
+    groups = Groups(**cfg.dates)
+    dates = groups.dates
+    cfg.dates = dict(
+        start=dates[0],
+        end=dates[NUMBER_OF_DATES - 1],
+        frequency=dates.frequency,
+        group_by=NUMBER_OF_DATES,
+    )
+
+    def set_element_to_test(obj):
+        if isinstance(obj, (list, tuple)):
+            for v in obj:
+                set_element_to_test(v)
+            return
+        if isinstance(obj, (dict, DictObj)):
+            if "grid" in obj:
+                previous = obj["grid"]
+                obj["grid"] = "20./20."
+                LOG.warn(f"Running in test mode. Setting grid to {obj['grid']} instead of {previous}")
+            if "number" in obj:
+                if isinstance(obj["number"], (list, tuple)):
+                    previous = obj["number"]
+                    obj["number"] = previous[0:3]
+                    LOG.warn(f"Running in test mode. Setting number to {obj['number']} instead of {previous}")
+            for k, v in obj.items():
+                set_element_to_test(v)
+
+    set_element_to_test(cfg)
+
+
 class GenericDatasetHandler:
     def __init__(self, *, path, print=print, **kwargs):
+
         # Catch all floating point errors, including overflow, sqrt(<0), etc
         np.seterr(all="raise", under="warn")

@@ -61,12 +97,15 @@ class GenericDatasetHandler:

     @classmethod
     def from_config(cls, *, config, path, print=print, **kwargs):
-
+        """Config is the path to the config file or a dict with the config"""
+
         assert isinstance(config, dict) or isinstance(config, str), config
         return cls(config=config, path=path, print=print, **kwargs)

     @classmethod
     def from_dataset_config(cls, *, path, print=print, **kwargs):
+        """Read the config saved inside the zarr dataset and instantiate the class for this config."""
+
         assert os.path.exists(path), f"Path {path} does not exist."
         z = zarr.open(path, mode="r")
         config = z.attrs["_create_yaml_config"]
@@ -75,6 +114,8 @@ class GenericDatasetHandler:

     @classmethod
     def from_dataset(cls, *, path, **kwargs):
+        """Instanciate the class from the path to the zarr dataset, without config."""
+
         assert os.path.exists(path), f"Path {path} does not exist."
         return cls(path=path, **kwargs)

@@ -156,68 +197,50 @@ class Loader(DatasetHandlerWithStatistics):
 class InitialiserLoader(Loader):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
-        self.main_config = loader_config(config)
-
-        self.tmp_statistics.delete()

+        self.main_config = loader_config(config)
         if self.test:
-
-            def test_dates(cfg, n=4):
-                LOG.warn("Running in test mode. Changing the list of dates to use only 4.")
-                groups = Groups(**cfg)
-                dates = groups.dates
-                return dict(start=dates[0], end=dates[n - 1], frequency=dates.frequency, group_by=n)
-
-            self.main_config.dates = test_dates(self.main_config.dates)
-
-            def set_to_test_mode(obj):
-                if isinstance(obj, (list, tuple)):
-                    for v in obj:
-                        set_to_test_mode(v)
-                    return
-                if isinstance(obj, (dict, DictObj)):
-                    if "grid" in obj:
-                        previous = obj["grid"]
-                        obj["grid"] = "20./20."
-                        LOG.warn(f"Running in test mode. Setting grid to {obj['grid']} instead of {previous}")
-                    if "number" in obj:
-                        if isinstance(obj["number"], (list, tuple)):
-                            previous = obj["number"]
-                            obj["number"] = previous[0:3]
-                            LOG.warn(f"Running in test mode. Setting number to {obj['number']} instead of {previous}")
-                    for k, v in obj.items():
-                        set_to_test_mode(v)
-
             set_to_test_mode(self.main_config)

         LOG.info(self.main_config.dates)

+        self.tmp_statistics.delete()
+
         self.groups = Groups(**self.main_config.dates)
+        LOG.info(self.groups)

         self.output = build_output(self.main_config.output, parent=self)
         self.input = self.build_input()
-
         LOG.info(self.input)
-        all_dates = self.groups.dates
-        self.minimal_input = self.input.select([all_dates[0]])

-
-
+        first_date = self.groups.dates[0]
+        self.minimal_input = self.input.select([first_date])
+        LOG.info("Minimal input (using only the first date) :")
         LOG.info(self.minimal_input)

     def build_statistics_dates(self, start, end):
+        """Compute the start and end dates for the statistics, based on :
+        - The start and end dates in the config
+        - The default statistics dates convention
+
+        Then adapt according to the actual dates in the dataset.
+        """
+
         ds = open_dataset(self.path)
         dates = ds.dates

+        # if not specified, use the default statistics dates
         default_start, default_end = default_statistics_dates(dates)
         if start is None:
             start = default_start
         if end is None:
             end = default_end

+        # in any case, adapt to the actual dates in the dataset
         start = as_first_date(start, dates)
         end = as_last_date(end, dates)

+        # and convert to datetime to isoformat
         start = start.astype(datetime.datetime)
         end = end.astype(datetime.datetime)
         return (start.isoformat(), end.isoformat())
@@ -227,7 +250,10 @@ class InitialiserLoader(Loader):
         z.create_group("_build")

     def initialise(self, check_name=True):
-        """Create empty dataset
+        """Create an empty dataset of the right final shape
+
+        Read a small part of the data to get the shape of the data and the resolution and more metadata.
+        """

         self.print("Config loaded ok:")
         LOG.info(self.main_config)
@@ -276,11 +302,10 @@ class InitialiserLoader(Loader):
         metadata["_create_yaml_config"] = self.main_config.get_serialisable_dict()

         metadata["description"] = self.main_config.description
-        metadata["
+        metadata["licence"] = self.main_config["licence"]
+        metadata["attribution"] = self.main_config["attribution"]

-        metadata["data_request"] = self.minimal_input.data_request
         metadata["remapping"] = self.output.remapping
-
         metadata["order_by"] = self.output.order_by_as_list
         metadata["flatten_grid"] = self.output.flatten_grid

@@ -288,26 +313,21 @@ class InitialiserLoader(Loader):
         metadata["variables"] = variables
         metadata["variables_with_nans"] = variables_with_nans
         metadata["resolution"] = resolution
+
+        metadata["data_request"] = self.minimal_input.data_request
         metadata["field_shape"] = self.minimal_input.field_shape
         metadata["proj_string"] = self.minimal_input.proj_string

-        metadata["licence"] = self.main_config["licence"]
-        metadata["attribution"] = self.main_config["attribution"]
-
-        metadata["frequency"] = frequency
         metadata["start_date"] = dates[0].isoformat()
         metadata["end_date"] = dates[-1].isoformat()
+        metadata["frequency"] = frequency
         metadata["missing_dates"] = [_.isoformat() for _ in dates.missing]

+        metadata["version"] = VERSION
+
         if check_name:
             basename, ext = os.path.splitext(os.path.basename(self.path))  # noqa: F841
-            ds_name = DatasetName(
-                basename,
-                resolution,
-                dates[0],
-                dates[-1],
-                frequency,
-            )
+            ds_name = DatasetName(basename, resolution, dates[0], dates[-1], frequency)
             ds_name.raise_if_not_valid(print=self.print)

         if len(dates) != total_shape[0]:
@@ -316,17 +336,12 @@ class InitialiserLoader(Loader):
                 f"does not match data shape {total_shape[0]}. {total_shape=}"
             )

-        dates = normalize_and_check_dates(
-            dates,
-            metadata["start_date"],
-            metadata["end_date"],
-            metadata["frequency"],
-        )
+        dates = normalize_and_check_dates(dates, metadata["start_date"], metadata["end_date"], metadata["frequency"])

         metadata.update(self.main_config.get("force_metadata", {}))

         ###############################################################
-        # write
+        # write metadata
         ###############################################################

         self.initialise_dataset_backend()
@@ -346,10 +361,7 @@ class InitialiserLoader(Loader):
             self.main_config.statistics.get("start"),
             self.main_config.statistics.get("end"),
         )
-        self.update_metadata(
-            statistics_start_date=statistics_start,
-            statistics_end_date=statistics_end,
-        )
+        self.update_metadata(statistics_start_date=statistics_start, statistics_end_date=statistics_end)
         LOG.info(f"Will compute statistics from {statistics_start} to {statistics_end}")

         self.registry.add_to_history("init finished")
@@ -586,37 +598,22 @@ class GenericAdditions(GenericDatasetHandler):

     @property
     def tmp_storage_path(self):
-
+        """This should be implemented in the subclass."""
+        raise NotImplementedError()

     @property
     def final_storage_path(self):
-
+        """This should be implemented in the subclass."""
+        raise NotImplementedError()

     def initialise(self):
         self.tmp_storage.delete()
         self.tmp_storage.create()
         LOG.info(f"Dataset {self.path} additions initialized.")

-
-
-
-        if "variables_with_nans" in z.attrs:
-            return z.attrs["variables_with_nans"]
-        return None
-
-    def allow_nan(self, name):
-        if self._variables_with_nans is not None:
-            return name in self._variables_with_nans
-        warnings.warn(f"❗Cannot find 'variables_with_nans' in {self.path}, Assuming nans allowed for {name}.")
-        return True
-
-    @classmethod
-    def _check_type_equal(cls, a, b):
-        a = list(a)
-        b = list(b)
-        a = a[0] if a else None
-        b = b[0] if b else None
-        assert type(a) is type(b), (type(a), type(b))
+    def run(self, parts):
+        """This should be implemented in the subclass."""
+        raise NotImplementedError()

     def finalise(self):
         shape = (len(self.dates), len(self.variables))
@@ -696,7 +693,7 @@ class GenericAdditions(GenericDatasetHandler):
             variables_names=self.variables,
             has_nans=has_nans,
         )
-        LOG.info(f"Dataset {self.path} additions
+        LOG.info(f"Dataset {self.path} additions finalised.")
         self.check_statistics()
         self._write(self.summary)
         self.tmp_storage.delete()
@@ -711,6 +708,19 @@ class GenericAdditions(GenericDatasetHandler):
     def check_statistics(self):
         pass

+    @cached_property
+    def _variables_with_nans(self):
+        z = zarr.open(self.path, mode="r")
+        if "variables_with_nans" in z.attrs:
+            return z.attrs["variables_with_nans"]
+        return None
+
+    def allow_nan(self, name):
+        if self._variables_with_nans is not None:
+            return name in self._variables_with_nans
+        warnings.warn(f"❗Cannot find 'variables_with_nans' in {self.path}, Assuming nans allowed for {name}.")
+        return True
+

 class StatisticsAddition(GenericAdditions):
     def __init__(self, **kwargs):
@@ -798,7 +808,7 @@ class TendenciesStatisticsAddition(GenericAdditions):
         start = z.attrs["statistics_start_date"]
         end = z.attrs["statistics_end_date"]
         start = datetime.datetime.fromisoformat(start)
-        ds = open_dataset(self.path, start=start
+        ds = open_dataset(self.path, start=start, end=end)
         self.dates = ds.dates
         self.total = len(self.dates)
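Note: the test-mode helpers were hoisted from InitialiserLoader.__init__ to a module-level set_to_test_mode(cfg), with unchanged behaviour: four dates, a 20./20. grid, and at most three ensemble members. A runnable replica of the recursive element rewrite, restricted to plain dicts and lists (the real helper also handles DictObj and rewrites cfg.dates):

    def set_element_to_test(obj):
        if isinstance(obj, (list, tuple)):
            for v in obj:
                set_element_to_test(v)
            return
        if isinstance(obj, dict):
            if "grid" in obj:
                obj["grid"] = "20./20."  # coarsen any grid
            if "number" in obj and isinstance(obj["number"], (list, tuple)):
                obj["number"] = obj["number"][0:3]  # keep three ensemble members
            for v in obj.values():
                set_element_to_test(v)

    cfg = {"input": {"mars": {"grid": "0.25/0.25", "number": [0, 1, 2, 3, 4]}}}
    set_element_to_test(cfg)
    assert cfg["input"]["mars"] == {"grid": "20./20.", "number": [0, 1, 2]}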
anemoi/datasets/create/statistics/__init__.py
CHANGED

@@ -98,6 +98,8 @@ def check_variance(x, variables_names, minimum, maximum, mean, count, sums, squares


 def compute_statistics(array, check_variables_names=None, allow_nan=False):
+    """Compute statistics for a given array, provides minimum, maximum, sum, squares, count and has_nans as a dictionary."""
+
     nvars = array.shape[1]

     LOG.info(f"Stats {nvars}, {array.shape}, {check_variables_names}")
@@ -242,10 +244,7 @@ class StatAggregator:
         offset = 0
         for _, _dates, stats in self.owner._gather_data():
             assert isinstance(stats, dict), stats
-            assert stats["minimum"].shape[0] == len(_dates), (
-                stats["minimum"].shape,
-                len(_dates),
-            )
+            assert stats["minimum"].shape[0] == len(_dates), (stats["minimum"].shape, len(_dates))
             assert stats["minimum"].shape[1] == len(self.variables_names), (
                 stats["minimum"].shape,
                 len(self.variables_names),
@@ -270,19 +269,13 @@ class StatAggregator:
             for k in self.NAMES:
                 stats[k] = stats[k][bitmap]

-            assert stats["minimum"].shape[0] == len(dates), (
-                stats["minimum"].shape,
-                len(dates),
-            )
+            assert stats["minimum"].shape[0] == len(dates), (stats["minimum"].shape, len(dates))

             # store data in self
             found |= set(dates)
             for name in self.NAMES:
                 array = getattr(self, name)
-                assert stats[name].shape[0] == len(dates), (
-                    stats[name].shape,
-                    len(dates),
-                )
+                assert stats[name].shape[0] == len(dates), (stats[name].shape, len(dates))
                 array[offset : offset + len(dates)] = stats[name]
                 offset += len(dates)

@@ -310,133 +303,7 @@ class StatAggregator:
         stdev = np.sqrt(x)

         for j, name in enumerate(self.variables_names):
-            check_data_values(
-                np.array(
-                    [
-                        mean[j],
-                    ]
-                ),
-                name=name,
-                allow_nan=False,
-            )
-
-        return Summary(
-            minimum=minimum,
-            maximum=maximum,
-            mean=mean,
-            count=count,
-            sums=sums,
-            squares=squares,
-            stdev=stdev,
-            variables_names=self.variables_names,
-            has_nans=has_nans,
-        )
-
-
-class SummaryAggregator:
-    NAMES = ["minimum", "maximum", "sums", "squares", "count", "has_nans"]
-
-    def __init__(self, owner, dates, variables_names, allow_nan):
-        dates = sorted(dates)
-        dates = to_datetimes(dates)
-        assert dates, "No dates selected"
-        self.owner = owner
-        self.dates = dates
-        self.variables_names = variables_names
-        self.allow_nan = allow_nan
-
-        self.shape = (len(self.dates), len(self.variables_names))
-        LOG.info(f"Aggregating statistics on shape={self.shape}. Variables : {self.variables_names}")
-
-        self.minimum = np.full(self.shape, np.nan, dtype=np.float64)
-        self.maximum = np.full(self.shape, np.nan, dtype=np.float64)
-        self.sums = np.full(self.shape, np.nan, dtype=np.float64)
-        self.squares = np.full(self.shape, np.nan, dtype=np.float64)
-        self.count = np.full(self.shape, -1, dtype=np.int64)
-        self.has_nans = np.full(self.shape, False, dtype=np.bool_)
-
-        self._read()
-
-    def _read(self):
-        def check_type(a, b):
-            a = list(a)
-            b = list(b)
-            a = a[0] if a else None
-            b = b[0] if b else None
-            assert type(a) is type(b), (type(a), type(b))
-
-        found = set()
-        offset = 0
-        for _, _dates, stats in self.owner._gather_data():
-            for n in self.NAMES:
-                assert n in stats, (n, list(stats.keys()))
-            _dates = to_datetimes(_dates)
-            check_type(_dates, self.dates)
-            if found:
-                check_type(found, self.dates)
-                assert found.isdisjoint(_dates), "Duplicate dates found in precomputed statistics"
-
-            # filter dates
-            dates = set(_dates) & set(self.dates)
-
-            if not dates:
-                # dates have been completely filtered for this chunk
-                continue
-
-            # filter data
-            bitmap = np.isin(_dates, self.dates)
-            for k in self.NAMES:
-                stats[k] = stats[k][bitmap]
-
-            assert stats["minimum"].shape[0] == len(dates), (
-                stats["minimum"].shape,
-                len(dates),
-            )
-
-            # store data in self
-            found |= set(dates)
-            for name in self.NAMES:
-                array = getattr(self, name)
-                assert stats[name].shape[0] == len(dates), (
-                    stats[name].shape,
-                    len(dates),
-                )
-                array[offset : offset + len(dates)] = stats[name]
-                offset += len(dates)
-
-        for d in self.dates:
-            assert d in found, f"Statistics for date {d} not precomputed."
-        assert len(self.dates) == len(found), "Not all dates found in precomputed statistics"
-        assert len(self.dates) == offset, "Not all dates found in precomputed statistics."
-        LOG.info(f"Statistics for {len(found)} dates found.")
-
-    def aggregate(self):
-        minimum = np.nanmin(self.minimum, axis=0)
-        maximum = np.nanmax(self.maximum, axis=0)
-        sums = np.nansum(self.sums, axis=0)
-        squares = np.nansum(self.squares, axis=0)
-        count = np.nansum(self.count, axis=0)
-        has_nans = np.any(self.has_nans, axis=0)
-        mean = sums / count
-
-        assert sums.shape == count.shape == squares.shape == mean.shape == minimum.shape == maximum.shape
-
-        x = squares / count - mean * mean
-        # remove negative variance due to numerical errors
-        # x[- 1e-15 < (x / (np.sqrt(squares / count) + np.abs(mean))) < 0] = 0
-        check_variance(x, self.variables_names, minimum, maximum, mean, count, sums, squares)
-        stdev = np.sqrt(x)
-
-        for j, name in enumerate(self.variables_names):
-            check_data_values(
-                np.array(
-                    [
-                        mean[j],
-                    ]
-                ),
-                name=name,
-                allow_nan=False,
-            )
+            check_data_values(np.array([mean[j]]), name=name, allow_nan=False)

         return Summary(
             minimum=minimum,
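Note: the removed SummaryAggregator was a near-verbatim duplicate of StatAggregator; the surviving class derives summary statistics from running sums. A minimal numeric check of those formulas (mean = sums / count, variance = squares / count - mean * mean, stdev = sqrt of the variance) on synthetic data:

    import numpy as np

    x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # (dates, variables)
    sums, squares, count = x.sum(0), (x * x).sum(0), np.array([3, 3])
    mean = sums / count
    variance = squares / count - mean * mean  # can dip below 0 numerically
    stdev = np.sqrt(variance)
    assert np.allclose(mean, x.mean(0)) and np.allclose(stdev, x.std(0))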
anemoi/datasets/data/__init__.py
CHANGED
@@ -30,3 +30,10 @@ def open_dataset(*args, **kwargs):
     ds.arguments = {"args": args, "kwargs": kwargs}
     ds._check()
     return ds
+
+
+def list_dataset_names(*args, **kwargs):
+    ds = _open_dataset(*args, **kwargs)
+    names = set()
+    ds.get_dataset_names(names)
+    return sorted(names)
anemoi/datasets/data/dataset.py
CHANGED
anemoi/datasets/data/forwards.py
CHANGED
@@ -103,6 +103,9 @@ class Forwards(Dataset):
             f"subclass_metadata_specific() must be implemented in derived class {self.__class__.__name__}"
         )

+    def get_dataset_names(self, names):
+        self.forward.get_dataset_names(names)
+

 class Combined(Forwards):
     def __init__(self, datasets):
@@ -193,6 +196,10 @@ class Combined(Forwards):
             offset += len(d)
         return result

+    def get_dataset_names(self, names):
+        for d in self.datasets:
+            d.get_dataset_names(names)
+

 class GivenAxis(Combined):
     """Given a given axis, combine the datasets along that axis."""
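Note: get_dataset_names() is a simple visitor. Forwarding wrappers delegate to the wrapped dataset, combined datasets fan out to their parts, and the Zarr leaf (in stores.py below) adds its basename. A toy replica of the protocol with illustrative classes:

    class Leaf:
        def __init__(self, path):
            self.path = path

        def get_dataset_names(self, names):
            names.add(self.path.rsplit("/", 1)[-1].removesuffix(".zarr"))

    class Combined:
        def __init__(self, datasets):
            self.datasets = datasets

        def get_dataset_names(self, names):
            for d in self.datasets:
                d.get_dataset_names(names)

    names = set()
    Combined([Leaf("/data/a.zarr"), Leaf("/data/b.zarr")]).get_dataset_names(names)
    assert sorted(names) == ["a", "b"]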
anemoi/datasets/data/masked.py
CHANGED
@@ -70,8 +70,12 @@ class Thinning(Masked):
         self.thinning = thinning
         self.method = method

-
-
+        shape = forward.field_shape
+        if len(shape) != 2:
+            raise ValueError("Thinning only works latitude/longitude fields")
+
+        latitudes = forward.latitudes.reshape(shape)
+        longitudes = forward.longitudes.reshape(shape)
         latitudes = latitudes[::thinning, ::thinning].flatten()
         longitudes = longitudes[::thinning, ::thinning].flatten()

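Note: Thinning now derives 2-D latitude/longitude arrays from forward.field_shape instead of assuming them, and rejects non-2-D fields. The subsampling itself is unchanged; a small numpy illustration on a synthetic 4x6 grid:

    import numpy as np

    shape = (4, 6)  # stand-in for forward.field_shape
    latitudes = np.arange(24.0).reshape(shape)
    thinning = 2
    thinned = latitudes[::thinning, ::thinning].flatten()
    assert thinned.shape == (6,)  # a 2x3 subgrid survives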
anemoi/datasets/data/misc.py
CHANGED
@@ -49,7 +49,9 @@ def load_config():
     if CONFIG is not None:
         return CONFIG

-    conf = os.path.expanduser("~/.anemoi.toml")
+    conf = os.path.expanduser("~/.config/anemoi/settings.toml")
+    if not os.path.exists(conf):
+        conf = os.path.expanduser("~/.anemoi.toml")

     if os.path.exists(conf):
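Note: the user configuration now lives at ~/.config/anemoi/settings.toml, with the old ~/.anemoi.toml kept as a fallback. A sketch of the lookup order in isolation:

    import os

    def config_path():
        conf = os.path.expanduser("~/.config/anemoi/settings.toml")  # preferred
        if not os.path.exists(conf):
            conf = os.path.expanduser("~/.anemoi.toml")  # legacy fallback
        return conf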
anemoi/datasets/data/select.py
CHANGED
@@ -23,15 +23,19 @@ LOG = logging.getLogger(__name__)
 class Select(Forwards):
     """Select a subset of the variables."""

-    def __init__(self, dataset, indices,
+    def __init__(self, dataset, indices, reason):
+
+        reason = reason.copy()
+
         while isinstance(dataset, Select):
             indices = [dataset.indices[i] for i in indices]
+            reason.update(dataset.reason)
             dataset = dataset.dataset

         self.dataset = dataset
         self.indices = list(indices)
         assert len(self.indices) > 0
-        self.
+        self.reason = reason or {"indices": self.indices}

         # Forward other properties to the main dataset
         super().__init__(dataset)
@@ -86,11 +90,11 @@ class Select(Forwards):
         return Source(self, index, self.dataset.source(self.indices[index]))

     def tree(self):
-        return Node(self, [self.dataset.tree()], **self.
+        return Node(self, [self.dataset.tree()], **self.reason)

     def subclass_metadata_specific(self):
         # return dict(indices=self.indices)
-        return
+        return self.reason


 class Rename(Forwards):
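Note: Select now carries a "reason" dict describing why the selection exists; when nested Select wrappers are collapsed, their reasons are merged so that tree() and subclass_metadata_specific() can report the combined provenance. A toy illustration of the merge semantics (keys are illustrative):

    reason = {"select": ["2t", "10u"]}  # outer Select
    inner = {"drop": ["z"]}             # inner Select being collapsed
    reason = reason.copy()
    reason.update(inner)
    assert reason == {"select": ["2t", "10u"], "drop": ["z"]}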
anemoi/datasets/data/stores.py
CHANGED
@@ -6,6 +6,7 @@
 # nor does it submit to any jurisdiction.

 import logging
+import os
 import warnings
 from functools import cached_property

@@ -235,7 +236,11 @@ class Zarr(Dataset):

     @property
     def field_shape(self):
-
+        try:
+            return tuple(self.z.attrs["field_shape"])
+        except KeyError:
+            LOG.warning("No 'field_shape' in %r, assuming 1D fields", self)
+            return (self.shape[-1],)

     @property
     def frequency(self):
@@ -288,6 +293,10 @@ class Zarr(Dataset):
     def tree(self):
         return Node(self, [], path=self.path)

+    def get_dataset_names(self, names):
+        name, _ = os.path.splitext(os.path.basename(self.path))
+        names.add(name)
+

 class ZarrWithMissingDates(Zarr):
     def __init__(self, path):
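Note: Zarr.field_shape no longer fails on datasets written before the attribute existed; it falls back to treating fields as flat 1-D arrays. The fallback logic in isolation (the attrs and shape values are illustrative):

    attrs = {}  # dataset predating the "field_shape" attribute
    shape = (1460, 98, 1, 40320)  # (dates, variables, ensemble, grid points)
    try:
        field_shape = tuple(attrs["field_shape"])
    except KeyError:
        field_shape = (shape[-1],)  # assume flat 1-D fields
    assert field_shape == (40320,)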
{anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: anemoi-datasets
-Version: 0.3.6
+Version: 0.3.8
 Summary: A package to hold various functions to support training of ML models on ECMWF data.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License: Apache License
@@ -223,7 +223,7 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Requires-Python: >=3.9
 License-File: LICENSE
-Requires-Dist: anemoi-utils[provenance] >=0.3
+Requires-Dist: anemoi-utils[provenance] >=0.3.4
 Requires-Dist: numpy
 Requires-Dist: pyyaml
 Requires-Dist: semantic-version
{anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/RECORD
CHANGED

@@ -1,21 +1,21 @@
-anemoi/datasets/__init__.py,sha256=
+anemoi/datasets/__init__.py,sha256=Z1gqZWhecLcT0RZQqYBLlz01MUlUZd0kWEj_RavbITM,782
 anemoi/datasets/__main__.py,sha256=cLA2PidDTOUHaDGzd0_E5iioKYNe-PSTv567Y2fuwQk,723
-anemoi/datasets/_version.py,sha256=
+anemoi/datasets/_version.py,sha256=FAr0t5Ub_Olk5Ke3Xi4Oeu5jcLPAvKpdk9naAtMuou8,411
 anemoi/datasets/grids.py,sha256=3YBMMJodgYhavarXPAlMZHaMtDT9v2IbTmAXZTqf8Qo,8481
 anemoi/datasets/commands/__init__.py,sha256=qAybFZPBBQs0dyx7dZ3X5JsLpE90pwrqt1vSV7cqEIw,706
 anemoi/datasets/commands/compare.py,sha256=p2jQOAC3JhScCLF0GjTCO8goYLWLN8p7vzy_gf5fFcI,1473
-anemoi/datasets/commands/copy.py,sha256=
+anemoi/datasets/commands/copy.py,sha256=SxAeN51owyN5gwtwpt30xhJSIJRlJb9YOUt_4K4m-D8,11780
 anemoi/datasets/commands/create.py,sha256=POdOsVDlvRrHFFkI3SNXNgNIbSxkVUUPMoo660x7Ma0,987
 anemoi/datasets/commands/inspect.py,sha256=G3fzcgiLaU8jln7GKvgamN7Y06-qC_JnFw2SbNn1_E4,18646
 anemoi/datasets/commands/scan.py,sha256=HxsLdCgBMSdEXjlJfPq5M_9LxXHHQIoZ1ZEHO_AoPgA,2881
 anemoi/datasets/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 anemoi/datasets/compute/recentre.py,sha256=GRxI6rY_KyXJnZGPxU_UO9YDb-rY_raK70Fiwv1mjhs,4792
-anemoi/datasets/create/__init__.py,sha256=
+anemoi/datasets/create/__init__.py,sha256=Q8uXUdbE-SRYYaZd5cPQ2RVbSoHnGX7-eKdOJHYVhDk,5951
 anemoi/datasets/create/check.py,sha256=DLjw-eyaCNxPhoKFsP4Yn_l3SIr57YHdyPR-tE5vx80,5791
 anemoi/datasets/create/chunks.py,sha256=YEDcr0K2KiiceSTiBuZzj0TbRbzZ9J546XO7rrrTFQw,2441
 anemoi/datasets/create/config.py,sha256=uLIp1WHg3hbqwwMV9EepMwJQsXJAGImkbo0okBeEVd4,7683
 anemoi/datasets/create/input.py,sha256=3G7sqdn7R1pLBeeswXwwi8VRAHrBnjq1PdRYHJBe594,27741
-anemoi/datasets/create/loaders.py,sha256
+anemoi/datasets/create/loaders.py,sha256=-fJ9qKjsCd8Wvnobn34WsQpE9uAjon5M4REgCpW5q_w,30594
 anemoi/datasets/create/patch.py,sha256=xjCLhvIQKRqmypsKInRU1CvFh1uoaB3YGSQP1UVZZik,3682
 anemoi/datasets/create/persistent.py,sha256=nT8gvhVPdI1H3zW_F7uViGKIlQQ94jCDrMSWTmhQ2_A,4290
 anemoi/datasets/create/size.py,sha256=A1w6RkaL0L9IlwIdmYsCTJTecmY_QtvbkGf__jvQle0,1068
@@ -31,43 +31,43 @@ anemoi/datasets/create/functions/filters/rename.py,sha256=cGoHr-IS-PhYEtZvXDpH03
 anemoi/datasets/create/functions/filters/rotate_winds.py,sha256=fUdh8ILcMzMzckGlvwzdgG-c7w5R9NnWfaijp28Bf5M,4092
 anemoi/datasets/create/functions/filters/unrotate_winds.py,sha256=nsa3EHly8ppWd2WH4ROoMczM8WFu5qKaIhO_UFcL9TY,3502
 anemoi/datasets/create/functions/sources/__init__.py,sha256=Xe9G54CKvCI3ji-7k0R5l0WZZdhlydRgawsXuBcX_hg,379
-anemoi/datasets/create/functions/sources/accumulations.py,sha256=
+anemoi/datasets/create/functions/sources/accumulations.py,sha256=klbp-akoZlOk9jByDFsgPfHRCdfLvpatTLMxDPZaNZc,12943
 anemoi/datasets/create/functions/sources/constants.py,sha256=aqquu6HDc8t-zsF9KRFLaj0eV4S0UPZ59BVna8E3bU8,785
 anemoi/datasets/create/functions/sources/empty.py,sha256=SBuAfC33imbfcRnFnnOR44y8Q3KSQcqx3juIcXfCa3c,481
 anemoi/datasets/create/functions/sources/forcings.py,sha256=EVcdu8puMSW451qj3LKCWWXaSf2LlmF8YXVs8hSMxkU,643
 anemoi/datasets/create/functions/sources/grib.py,sha256=YQNuGnlh2EYb2NIHYpzlipwUTmOhrmyQtP3zgk8MAUU,1661
 anemoi/datasets/create/functions/sources/hindcasts.py,sha256=0Psnsx2J0cRLMpJuNN-gESm1xJFC1gmQzI8sdnXCoYE,13042
-anemoi/datasets/create/functions/sources/mars.py,sha256=
+anemoi/datasets/create/functions/sources/mars.py,sha256=JWsbzyoXF95HPk2VWzmX53f_SJwXhKkaJvXtXJMGLig,5285
 anemoi/datasets/create/functions/sources/netcdf.py,sha256=kic6PH7SAK3gseXChD38IDXw6Zcg2zhF4SeDXB2LQ8Q,2084
 anemoi/datasets/create/functions/sources/opendap.py,sha256=T0CPinscfafrVLaye5ue-PbiCNbcNqf_3m6pphN9rCU,543
 anemoi/datasets/create/functions/sources/recentre.py,sha256=t07LIXG3Hp9gmPkPriILVt86TxubsHyS1EL1lzwgtXY,1810
 anemoi/datasets/create/functions/sources/source.py,sha256=hPQnV_6UIxFw97uRKcTA8TplcgG1kC8NlFHoEaaLet4,1418
 anemoi/datasets/create/functions/sources/tendencies.py,sha256=kwS_GZt8R9kpfs5RrvxPb0Gj-5nDP0sgJgfSRCAwwww,4057
-anemoi/datasets/create/statistics/__init__.py,sha256=
+anemoi/datasets/create/statistics/__init__.py,sha256=eXyOdlgXBt6QdVWM7ZVyUWdFMv6iNsFefkjvOVvZAlQ,11010
 anemoi/datasets/create/statistics/summary.py,sha256=sgmhA24y3VRyjmDUgTnPIqcHSlWBbFA0qynx6gJ9Xw8,3370
-anemoi/datasets/data/__init__.py,sha256=
+anemoi/datasets/data/__init__.py,sha256=to9L_RZVQ4OgyHUpX6lcvt4GqJdZjBa5HCTaWx1aGKo,1046
 anemoi/datasets/data/concat.py,sha256=AkpyOs16OjW7X0cdyYFQfWSCV6dteXBp-x9WlokO-DI,3550
-anemoi/datasets/data/dataset.py,sha256=
+anemoi/datasets/data/dataset.py,sha256=LBUwWhwcAcovLv0FOLT-rA-yNZhcBFUMc03BfVh7UFc,7465
 anemoi/datasets/data/debug.css,sha256=z2X_ZDSnZ9C3pyZPWnQiEyAxuMxUaxJxET4oaCImTAQ,211
 anemoi/datasets/data/debug.py,sha256=PcyrjgxaLzeb_vf12pvUtPPVvBRHNm1SimythZvqsP4,6303
 anemoi/datasets/data/ensemble.py,sha256=AsP7Xx0ZHLoZs6a4EC0jtyGYIcOvZvvKXhgNsIvqIN8,1137
-anemoi/datasets/data/forwards.py,sha256=
+anemoi/datasets/data/forwards.py,sha256=UZOOMUblGS21aaPoFfQa0ONSUaxkqlZQF3KGRhlCr9I,7899
 anemoi/datasets/data/grids.py,sha256=rooOeR6rvjl4U8B4LO3N23fcgxvGE7ZUmhVryk1QS4M,7493
 anemoi/datasets/data/indexing.py,sha256=625m__JG5m_tDMrkz1hB6Vydenwt0oHuyAlc-o3Zwos,4799
 anemoi/datasets/data/join.py,sha256=dtCBbMTicqrRPxfBULi3RwEcQBLhQpIcvCjdN5A3XUU,4892
-anemoi/datasets/data/masked.py,sha256=
-anemoi/datasets/data/misc.py,sha256=
-anemoi/datasets/data/select.py,sha256=
-anemoi/datasets/data/statistics.py,sha256=
-anemoi/datasets/data/stores.py,sha256=
+anemoi/datasets/data/masked.py,sha256=czAv1ZfZ9q6Wr4RqI2Xj8SEm7yoCgJrwMl-CPDs_wSI,3857
+anemoi/datasets/data/misc.py,sha256=tuNsUY06nWh3Raf_RCi8bzCXsMB4t2hOuIkNGV4epj8,10501
+anemoi/datasets/data/select.py,sha256=Fqek_bAI0A53n7EeEgGxn5wFL76r7Eb7H2hN5L2YNx8,3768
+anemoi/datasets/data/statistics.py,sha256=lZCcKw9s7ttMBEp6ANyxtbXoZZvchhE7SClq-D4AUR8,1645
+anemoi/datasets/data/stores.py,sha256=yy914zMHIYKm5q6mHOqGeK0dC_26VFeqKLXyb7x9NXE,11190
 anemoi/datasets/data/subset.py,sha256=9urVTXdnwCgqn0_BRYquMi8oiXn4ubAf0n4586hWfKw,3814
 anemoi/datasets/data/unchecked.py,sha256=xhdMg-ToI1UfBWHNsWyn1y2meZWngZtHx-33L0KqKp8,4037
 anemoi/datasets/dates/__init__.py,sha256=4ItowfLLh90T8L_JOjtv98lE6M7gAaWt7dV3niUrFvk,4473
 anemoi/datasets/dates/groups.py,sha256=iq310Pi7ullglOhcNblv14MmcT8FPgYCD5s45qAfV_s,3383
 anemoi/datasets/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-anemoi_datasets-0.3.
-anemoi_datasets-0.3.
-anemoi_datasets-0.3.
-anemoi_datasets-0.3.
-anemoi_datasets-0.3.
-anemoi_datasets-0.3.
+anemoi_datasets-0.3.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+anemoi_datasets-0.3.8.dist-info/METADATA,sha256=4VLO7xHj8yzd7nJpncoRsix8l95gXOXfTrk4A-UX2X0,16019
+anemoi_datasets-0.3.8.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
+anemoi_datasets-0.3.8.dist-info/entry_points.txt,sha256=yR-o-4uiPEA_GLBL81SkMYnUoxq3CAV3hHulQiRtGG0,66
+anemoi_datasets-0.3.8.dist-info/top_level.txt,sha256=DYn8VPs-fNwr7fNH9XIBqeXIwiYYd2E2k5-dUFFqUz0,7
+anemoi_datasets-0.3.8.dist-info/RECORD,,

{anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/LICENSE
File without changes

{anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/entry_points.txt
File without changes

{anemoi_datasets-0.3.6.dist-info → anemoi_datasets-0.3.8.dist-info}/top_level.txt
File without changes