climate_ref_core-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- climate_ref_core/__init__.py +7 -0
- climate_ref_core/constraints.py +363 -0
- climate_ref_core/dataset_registry.py +158 -0
- climate_ref_core/datasets.py +157 -0
- climate_ref_core/diagnostics.py +549 -0
- climate_ref_core/env.py +35 -0
- climate_ref_core/exceptions.py +48 -0
- climate_ref_core/executor.py +96 -0
- climate_ref_core/logging.py +146 -0
- climate_ref_core/providers.py +418 -0
- climate_ref_core/py.typed +0 -0
- climate_ref_core/pycmec/README.md +1 -0
- climate_ref_core/pycmec/__init__.py +3 -0
- climate_ref_core/pycmec/controlled_vocabulary.py +175 -0
- climate_ref_core/pycmec/cv_cmip7_aft.yaml +44 -0
- climate_ref_core/pycmec/metric.py +437 -0
- climate_ref_core/pycmec/output.py +207 -0
- climate_ref_core-0.5.0.dist-info/METADATA +63 -0
- climate_ref_core-0.5.0.dist-info/RECORD +22 -0
- climate_ref_core-0.5.0.dist-info/WHEEL +4 -0
- climate_ref_core-0.5.0.dist-info/licenses/LICENCE +201 -0
- climate_ref_core-0.5.0.dist-info/licenses/NOTICE +3 -0
climate_ref_core/constraints.py
@@ -0,0 +1,363 @@
+"""
+Dataset selection constraints
+"""
+
+import sys
+import warnings
+from collections import defaultdict
+from collections.abc import Mapping
+from typing import Protocol, runtime_checkable
+
+if sys.version_info < (3, 11):
+    from typing_extensions import Self
+else:
+    from typing import Self
+
+import numpy as np
+import pandas as pd
+from attrs import frozen
+from loguru import logger
+
+from climate_ref_core.datasets import SourceDatasetType
+from climate_ref_core.exceptions import ConstraintNotSatisfied
+
+
+@runtime_checkable
+class GroupValidator(Protocol):
+    """
+    A constraint that must be satisfied when executing a given diagnostic run.
+
+    All constraints must be satisfied for a given group to be run.
+    """
+
+    def validate(self, group: pd.DataFrame) -> bool:
+        """
+        Validate if the constraint is satisfied by the dataset.
+
+        This is executed after the apply method to determine if the constraint is satisfied.
+        If the constraint is not satisfied, the group will not be executed.
+
+        Parameters
+        ----------
+        group
+            A group of datasets that is being validated.
+
+        Returns
+        -------
+        :
+            Whether the constraint is satisfied
+        """
+        ...
+
+
+@runtime_checkable
+class GroupOperation(Protocol):
+    """
+    An operation to perform on a group of datasets resulting in a new group of datasets.
+
+    !! warning
+
+        Operations should not mutate the input group, but instead return a new group.
+        Mutating the input group may result in unexpected behaviour.
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+        """
+        Perform an operation on the group of datasets.
+
+        A new group of datasets should be returned if modifications are required,
+        and the input group should not be modified. If no modifications are required,
+        return the input group unchanged.
+        If this operation fails, a ConstraintNotSatisfied exception should be raised.
+
+        Parameters
+        ----------
+        group
+            A group of datasets that is being validated.
+        data_catalog
+            The data catalog of datasets
+
+        Raises
+        ------
+        ConstraintNotSatisfied
+            The operation was not successful
+
+        Returns
+        -------
+        :
+            The updated group of datasets
+        """
+        ...
+
+
+GroupConstraint = GroupOperation | GroupValidator
+"""
+A constraint that must be satisfied for a group of datasets to be executed.
+
+This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
+The group must satisfy all constraints to be processed.
+
+This can include operations that are applied to a group of datasets which may modify the group,
+but may also include validators that check if the group satisfies a certain condition.
+"""
+
+
+def apply_constraint(
+    dataframe: pd.DataFrame,
+    constraint: GroupConstraint,
+    data_catalog: pd.DataFrame,
+) -> pd.DataFrame | None:
+    """
+    Apply a constraint to a group of datasets
+
+    Parameters
+    ----------
+    dataframe:
+        The group of datasets to apply the constraint to.
+    constraint
+        The constraint to apply.
+    data_catalog
+        The data catalog of all datasets.
+
+    Returns
+    -------
+    :
+        The updated group of datasets or None if the constraint was not satisfied
+    """
+    try:
+        updated_group = (
+            constraint.apply(dataframe, data_catalog) if isinstance(constraint, GroupOperation) else dataframe
+        )
+
+        valid = constraint.validate(updated_group) if isinstance(constraint, GroupValidator) else True
+        if not valid:
+            logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
+            raise ConstraintNotSatisfied(f"Constraint {constraint} not satisfied for {dataframe}")
+    except ConstraintNotSatisfied:
+        logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
+        return None
+
+    return updated_group
+
+
+@frozen
+class RequireFacets:
+    """
+    A constraint that requires a dataset to have certain facets.
+    """
+
+    dimension: str
+    required_facets: tuple[str, ...]
+
+    def validate(self, group: pd.DataFrame) -> bool:
+        """
+        Check that the required facets are present in the group
+        """
+        if self.dimension not in group:
+            logger.warning(f"Dimension {self.dimension} not present in group {group}")
+            return False
+        return all(value in group[self.dimension].values for value in self.required_facets)
+
+
+@frozen
+class AddSupplementaryDataset:
+    """
+    Include e.g. a cell measure or ancillary variable in the selection.
+    """
+
+    supplementary_facets: Mapping[str, str | tuple[str, ...]]
+    """
+    Facets describing the supplementary dataset.
+    """
+
+    matching_facets: tuple[str, ...]
+    """
+    Facets that must match with datasets in the selection.
+    """
+
+    optional_matching_facets: tuple[str, ...]
+    """
+    Select only the best matching datasets based on similarity with these facets.
+    """
+
+    def apply(
+        self,
+        group: pd.DataFrame,
+        data_catalog: pd.DataFrame,
+    ) -> pd.DataFrame:
+        """
+        Add a supplementary dataset to the group.
+        """
+        supplementary_facets: defaultdict[str, tuple[str, ...]] = defaultdict(tuple)
+        for facet, values in self.supplementary_facets.items():
+            supplementary_facets[facet] = values if isinstance(values, tuple) else (values,)
+
+        for facet in self.matching_facets:
+            values = tuple(group[facet].unique())
+            supplementary_facets[facet] += values
+
+        supplementary_group = data_catalog
+        for facet, values in supplementary_facets.items():
+            mask = supplementary_group[facet].isin(values)
+            supplementary_group = supplementary_group[mask]
+
+        if not supplementary_group.empty and self.optional_matching_facets:
+            facets = list(self.matching_facets + self.optional_matching_facets)
+            datasets = group[facets].drop_duplicates()
+            indices = set()
+            for i in range(len(datasets)):
+                scores = (supplementary_group[facets] == datasets.iloc[i]).sum(axis=1)
+                matches = supplementary_group[scores == scores.max()]
+                # Select the latest version if there are multiple matches
+                matches = matches[matches["version"] == matches["version"].max()]
+                indices.add(matches.index[0])
+            supplementary_group = supplementary_group.loc[list(indices)].drop_duplicates()
+
+        return pd.concat([group, supplementary_group])
+
+    @classmethod
+    def from_defaults(
+        cls,
+        variable: str,
+        source_type: SourceDatasetType,
+    ) -> Self:
+        """
+        Include e.g. a cell measure or ancillary variable in the selection.
+
+        The constraint is created using the defaults for the source_type.
+
+        Parameters
+        ----------
+        variable:
+            The name of the variable to add.
+        source_type:
+            The source_type of the variable to add.
+
+        Returns
+        -------
+        :
+            A constraint to include a supplementary variable.
+
+        """
+        kwargs = {
+            SourceDatasetType.CMIP6: {
+                "matching_facets": (
+                    "source_id",
+                    "grid_label",
+                ),
+                "optional_matching_facets": (
+                    "table_id",
+                    "experiment_id",
+                    "member_id",
+                    "version",
+                ),
+            }
+        }
+        variable_facet = {
+            SourceDatasetType.CMIP6: "variable_id",
+        }
+
+        supplementary_facets = {variable_facet[source_type]: variable}
+        return cls(supplementary_facets, **kwargs[source_type])
+
+
+@frozen
+class RequireContiguousTimerange:
+    """
+    A constraint that requires datasets to have a contiguous timerange.
+    """
+
+    group_by: tuple[str, ...]
+    """
+    The fields to group the datasets by. Each group must be contiguous in time
+    to fulfill the constraint.
+    """
+
+    def validate(self, group: pd.DataFrame) -> bool:
+        """
+        Check that all subgroups of the group have a contiguous timerange.
+        """
+        # Maximum allowed time difference between the end of one file and the
+        # start of the next file.
+        max_timedelta = pd.Timedelta(
+            days=31,  # Maximum number of days in a month.
+            hours=1,  # Allow for potential rounding errors.
+        )
+        group = group.dropna(subset=["start_time", "end_time"])
+        if len(group) < 2:  # noqa: PLR2004
+            return True
+
+        for _, subgroup in group.groupby(list(self.group_by)):
+            if len(subgroup) < 2:  # noqa: PLR2004
+                continue
+            sorted_group = subgroup.sort_values("start_time", kind="stable")
+            start_series = sorted_group["start_time"]
+            end_series = sorted_group["end_time"]
+            # Sometimes the elements of start_series.values are of type datetime64[ns]
+            # and sometimes its elements are of type datetime.datetime.
+            # Convert both arrays to datetime.datetime objects to make sure they
+            # can be subtracted.
+            with warnings.catch_warnings():
+                # We have already mitigated the future change in behaviour of DatetimeProperties.to_pydatetime
+                warnings.simplefilter("ignore", FutureWarning)
+
+                if hasattr(start_series, "dt"):
+                    start_array = np.array(start_series.dt.to_pydatetime())
+                else:
+                    start_array = start_series.values  # type: ignore[assignment]
+                if hasattr(end_series, "dt"):
+                    end_array = np.array(end_series.dt.to_pydatetime())
+                else:
+                    end_array = end_series.values  # type: ignore[assignment]
+            diff = start_array[1:] - end_array[:-1]
+            gap_indices = diff > max_timedelta
+            if gap_indices.any():
+                paths = sorted_group["path"]
+                for gap_idx in np.flatnonzero(gap_indices):
+                    logger.debug(
+                        f"Constraint {self.__class__.__name__} not satisfied "
+                        f"because gap larger than {max_timedelta} found between "
+                        f"{paths.iloc[gap_idx]} and {paths.iloc[gap_idx + 1]}"
+                    )
+                return False
+        return True
+
+
+@frozen
+class RequireOverlappingTimerange:
+    """
+    A constraint that requires datasets to have an overlapping timerange.
+    """
+
+    group_by: tuple[str, ...]
+    """
+    The fields to group the datasets by. There must be overlap in time between
+    the groups to fulfill the constraint.
+    """
+
+    def validate(self, group: pd.DataFrame) -> bool:
+        """
+        Check that all subgroups of the group have an overlapping timerange.
+        """
+        group = group.dropna(subset=["start_time", "end_time"])
+        if len(group) < 2:  # noqa: PLR2004
+            return True
+
+        starts = group.groupby(list(self.group_by))["start_time"].min()
+        ends = group.groupby(list(self.group_by))["end_time"].max()
+        return starts.max() < ends.min()  # type: ignore[no-any-return]
+
+
+@frozen
+class SelectParentExperiment:
+    """
+    Include a dataset's parent experiment in the selection
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+        """
+        Include a dataset's parent experiment in the selection
+
+        Not yet implemented
+        """
+        raise NotImplementedError("This is not implemented yet")  # pragma: no cover
climate_ref_core/dataset_registry.py
@@ -0,0 +1,158 @@
+"""
+Data registries for non-published reference data
+
+These data are placeholders until these data have been added to obs4MIPs.
+The CMIP7 Assessment Fast Track REF requires that reference datasets are openly licensed
+before they are included in any published data catalogs.
+"""
+
+import importlib.resources
+import os
+import pathlib
+import shutil
+
+import pooch
+from loguru import logger
+from rich.progress import track
+
+
+def fetch_all_files(
+    registry: pooch.Pooch,
+    name: str,
+    output_dir: pathlib.Path | None,
+    symlink: bool = False,
+) -> None:
+    """
+    Fetch all files associated with a pooch registry and write them to an output directory.
+
+    Pooch fetches, caches and validates the downloaded files.
+    Subsequent calls to this function will not refetch any previously downloaded files.
+
+    Parameters
+    ----------
+    registry
+        Pooch registry containing a set of files that should be fetched.
+    name
+        Name of the registry.
+    output_dir
+        The root directory to write the files to.
+
+        The directory will be created if it doesn't exist,
+        and matching files will be overwritten.
+
+        If no directory is provided, the files will be fetched from the remote server,
+        but not copied anywhere.
+    symlink
+        If True, symlink all files to this directory.
+        Otherwise, perform a copy.
+    """
+    if output_dir:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    for key in track(registry.registry.keys(), description=f"Fetching {name} data"):
+        fetch_file = registry.fetch(key)
+
+        if output_dir is None:
+            # Just warm the cache and move onto the next file
+            continue
+
+        linked_file = output_dir / key
+        linked_file.parent.mkdir(parents=True, exist_ok=True)
+        if not linked_file.exists():  # pragma: no cover
+            if symlink:
+                logger.info(f"Linking {key} to {linked_file}")
+
+                os.symlink(fetch_file, linked_file)
+            else:
+                logger.info(f"Copying {key} to {linked_file}")
+                shutil.copy(fetch_file, linked_file)
+        else:
+            logger.info(f"File {linked_file} already exists. Skipping.")
+
+
+class DatasetRegistryManager:
+    """
+    A collection of reference dataset registries
+
+    The REF requires additional reference datasets
+    in addition to obs4MIPs data which can be downloaded via ESGF.
+    Each provider may have different sets of reference data that are needed.
+    These provider-specific datasets are not yet available in obs4MIPs,
+    or are post-processed from obs4MIPs.
+
+    A dataset registry consists of a file that contains a list of files and checksums,
+    in combination with a base URL that is used to fetch the files.
+    [Pooch](https://www.fatiando.org/pooch/latest/) is used within the DataRegistry
+    to manage the caching, downloading and validation of the files.
+
+    All datasets that are registered here are expected to be openly licensed and freely available.
+    """
+
+    def __init__(self) -> None:
+        self._registries: dict[str, pooch.Pooch] = {}
+
+    def __getitem__(self, item: str) -> pooch.Pooch:
+        """
+        Get a registry by name
+        """
+        return self._registries[item]
+
+    def keys(self) -> list[str]:
+        """
+        Get the list of registry names
+        """
+        return list(self._registries.keys())
+
+    def register(  # noqa: PLR0913
+        self,
+        name: str,
+        base_url: str,
+        package: str,
+        resource: str,
+        cache_name: str | None = None,
+        version: str | None = None,
+    ) -> None:
+        """
+        Register a new dataset registry
+
+        This will create a new Pooch registry and add it to the list of registries.
+        This is typically used by a provider to register a new collection of datasets at runtime.
+
+        Parameters
+        ----------
+        name
+            Name of the registry
+
+            This is used to identify the registry
+        base_url
+            Common URL prefix for the files
+        package
+            Name of the package containing the registry resource.
+        resource
+            Name of the resource in the package that contains a list of files and checksums.
+
+            This must be formatted in a way that is expected by pooch.
+        version
+            The version of the data.
+
+            Changing the version will invalidate the cache and force a re-download of the data.
+        cache_name
+            Name to use to generate the cache directory.
+
+            This defaults to the value of `name` if not provided.
+        """
+        if cache_name is None:
+            cache_name = "ref"
+
+        registry = pooch.create(
+            path=pooch.os_cache(cache_name),
+            base_url=base_url,
+            version=version,
+            retry_if_failed=5,
+            env="REF_DATASET_CACHE_DIR",
+        )
+        registry.load_registry(str(importlib.resources.files(package) / resource))
+        self._registries[name] = registry
+
+
+dataset_registry_manager = DatasetRegistryManager()
climate_ref_core/datasets.py
@@ -0,0 +1,157 @@
+"""
+Dataset management and filtering
+"""
+
+import enum
+import functools
+import hashlib
+from collections.abc import Iterable
+from typing import Any, Self
+
+import pandas as pd
+from attrs import field, frozen
+
+
+class SourceDatasetType(enum.Enum):
+    """
+    Types of supported source datasets
+    """
+
+    CMIP6 = "cmip6"
+    CMIP7 = "cmip7"
+    obs4MIPs = "obs4mips"
+    PMPClimatology = "pmp-climatology"
+
+    @classmethod
+    @functools.lru_cache(maxsize=1)
+    def ordered(
+        cls,
+    ) -> list[Self]:
+        """
+        Order in alphabetical order according to their value
+
+        Returns
+        -------
+        :
+            Ordered list of dataset types
+        """
+        return sorted(cls, key=lambda x: x.value)
+
+
+def _clean_facets(raw_values: dict[str, str | tuple[str, ...] | list[str]]) -> dict[str, tuple[str, ...]]:
+    """
+    Clean the value of a facet filter to a tuple of strings
+    """
+    result = {}
+
+    for key, value in raw_values.items():
+        if isinstance(value, list):
+            result[key] = tuple(value)
+        elif isinstance(value, str):
+            result[key] = (value,)
+        elif isinstance(value, tuple):
+            result[key] = value
+    return result
+
+
+@frozen
+class FacetFilter:
+    """
+    A filter to apply to a data catalog of datasets.
+    """
+
+    facets: dict[str, tuple[str, ...]] = field(converter=_clean_facets)
+    """
+    Filters to apply to the data catalog.
+
+    The keys are the metadata fields to filter on, and the values are the values to filter on.
+    The result will only contain datasets where for all fields,
+    the value of the field is one of the given values.
+    """
+    keep: bool = True
+    """
+    Whether to keep or remove datasets that match the filter.
+
+    If true (default), datasets that match the filter will be kept else they will be removed.
+    """
+
+
+@frozen
+class DatasetCollection:
+    """
+    Group of datasets required for a given diagnostic execution for a specific source dataset type.
+    """
+
+    datasets: pd.DataFrame
+    slug_column: str
+    """
+    Column in datasets that contains the unique identifier for the dataset
+    """
+    selector: tuple[tuple[str, str], ...] = ()
+    """
+    Unique key, value pairs that were selected during the initial groupby
+    """
+
+    def __getattr__(self, item: str) -> Any:
+        return getattr(self.datasets, item)
+
+    def __getitem__(self, item: str | list[str]) -> Any:
+        return self.datasets[item]
+
+    def __hash__(self) -> int:
+        # This hashes each item individually and sums them so order doesn't matter
+        return int(pd.util.hash_pandas_object(self.datasets[self.slug_column]).sum())
+
+    def __eq__(self, other: object) -> bool:
+        return self.__hash__() == other.__hash__()
+
+
+class ExecutionDatasetCollection:
+    """
+    The complete set of datasets required for an execution of a diagnostic.
+
+    This may cover multiple source dataset types.
+    """
+
+    def __init__(self, collection: dict[SourceDatasetType | str, DatasetCollection]):
+        self._collection = {SourceDatasetType(k): v for k, v in collection.items()}
+
+    def __contains__(self, key: SourceDatasetType | str) -> bool:
+        if isinstance(key, str):
+            key = SourceDatasetType(key)
+        return key in self._collection
+
+    def __getitem__(self, key: SourceDatasetType | str) -> DatasetCollection:
+        if isinstance(key, str):
+            key = SourceDatasetType(key)
+        return self._collection[key]
+
+    def __hash__(self) -> int:
+        return hash(self.hash)
+
+    def items(self) -> Iterable[tuple[SourceDatasetType, DatasetCollection]]:
+        """
+        Iterate over the datasets in the collection
+        """
+        return self._collection.items()
+
+    @property
+    def hash(self) -> str:
+        """
+        Unique identifier for the collection
+
+        A SHA1 hash is calculated of the combination of the hashes of the individual collections.
+        The value isn't reversible but can be used to uniquely identify the aggregate of the
+        collections.
+
+        Returns
+        -------
+        :
+            SHA1 hash of the collections
+        """
+        # The dataset collection hashes are reproducible,
+        # so we can use them to hash the diagnostic dataset.
+        # This isn't explicitly true for all Python hashes
+        hash_sum = sum(hash(item) for item in self._collection.values())
+        hash_bytes = hash_sum.to_bytes(16, "little", signed=True)
+        return hashlib.sha1(hash_bytes).hexdigest()  # noqa: S324