climate_ref_core-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ """
+ Rapid evaluation of climate data
+ """
+
+ import importlib.metadata
+
+ __version__ = importlib.metadata.version("climate-ref-core")
@@ -0,0 +1,363 @@
+ """
+ Dataset selection constraints
+ """
+
+ import sys
+ import warnings
+ from collections import defaultdict
+ from collections.abc import Mapping
+ from typing import Protocol, runtime_checkable
+
+ if sys.version_info < (3, 11):
+     from typing_extensions import Self
+ else:
+     from typing import Self
+
+ import numpy as np
+ import pandas as pd
+ from attrs import frozen
+ from loguru import logger
+
+ from climate_ref_core.datasets import SourceDatasetType
+ from climate_ref_core.exceptions import ConstraintNotSatisfied
+
+
+ @runtime_checkable
+ class GroupValidator(Protocol):
+     """
+     A constraint that must be satisfied when executing a given diagnostic run.
+
+     All constraints must be satisfied for a given group to be run.
+     """
+
+     def validate(self, group: pd.DataFrame) -> bool:
+         """
+         Validate if the constraint is satisfied by the dataset.
+
+         This is executed after the apply method to determine if the constraint is satisfied.
+         If the constraint is not satisfied, the group will not be executed.
+
+         Parameters
+         ----------
+         group
+             A group of datasets that is being validated.
+
+         Returns
+         -------
+         :
+             Whether the constraint is satisfied
+         """
+         ...
+
+
+ @runtime_checkable
+ class GroupOperation(Protocol):
+     """
+     An operation to perform on a group of datasets resulting in a new group of datasets.
+
+     !!! warning
+
+         Operations should not mutate the input group, but instead return a new group.
+         Mutating the input group may result in unexpected behaviour.
+     """
+
+     def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+         """
+         Perform an operation on the group of datasets.
+
+         A new group of datasets should be returned if modifications are required,
+         and the input group should not be modified. If no modifications are required,
+         return the input group unchanged.
+         If this operation fails, a ConstraintNotSatisfied exception should be raised.
+
+         Parameters
+         ----------
+         group
+             A group of datasets that is being validated.
+         data_catalog
+             The data catalog of datasets
+
+         Raises
+         ------
+         ConstraintNotSatisfied
+             The operation was not successful
+
+         Returns
+         -------
+         :
+             The updated group of datasets
+         """
+         ...
+
+
+ GroupConstraint = GroupOperation | GroupValidator
+ """
+ A constraint that must be satisfied for a group of datasets to be executed.
+
+ This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
+ The group must satisfy all constraints to be processed.
+
+ This can include operations that are applied to a group of datasets which may modify the group,
+ but may also include validators that check if the group satisfies a certain condition.
+ """
+
+
+ def apply_constraint(
+     dataframe: pd.DataFrame,
+     constraint: GroupConstraint,
+     data_catalog: pd.DataFrame,
+ ) -> pd.DataFrame | None:
+     """
+     Apply a constraint to a group of datasets
+
+     Parameters
+     ----------
+     dataframe
+         The group of datasets to apply the constraint to.
+     constraint
+         The constraint to apply.
+     data_catalog
+         The data catalog of all datasets.
+
+     Returns
+     -------
+     :
+         The updated group of datasets or None if the constraint was not satisfied
+     """
+     try:
+         updated_group = (
+             constraint.apply(dataframe, data_catalog) if isinstance(constraint, GroupOperation) else dataframe
+         )
+
+         valid = constraint.validate(updated_group) if isinstance(constraint, GroupValidator) else True
+         if not valid:
+             logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
+             raise ConstraintNotSatisfied(f"Constraint {constraint} not satisfied for {dataframe}")
+     except ConstraintNotSatisfied:
+         logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
+         return None
+
+     return updated_group
+
+
+ @frozen
+ class RequireFacets:
+     """
+     A constraint that requires a dataset to have certain facets.
+     """
+
+     dimension: str
+     required_facets: tuple[str, ...]
+
+     def validate(self, group: pd.DataFrame) -> bool:
+         """
+         Check that the required facets are present in the group
+         """
+         if self.dimension not in group:
+             logger.warning(f"Dimension {self.dimension} not present in group {group}")
+             return False
+         return all(value in group[self.dimension].values for value in self.required_facets)
+
+
+ @frozen
+ class AddSupplementaryDataset:
+     """
+     Include e.g. a cell measure or ancillary variable in the selection.
+     """
+
+     supplementary_facets: Mapping[str, str | tuple[str, ...]]
+     """
+     Facets describing the supplementary dataset.
+     """
+
+     matching_facets: tuple[str, ...]
+     """
+     Facets that must match with datasets in the selection.
+     """
+
+     optional_matching_facets: tuple[str, ...]
+     """
+     Select only the best matching datasets based on similarity with these facets.
+     """
+
+     def apply(
+         self,
+         group: pd.DataFrame,
+         data_catalog: pd.DataFrame,
+     ) -> pd.DataFrame:
+         """
+         Add a supplementary dataset to the group.
+         """
+         supplementary_facets: defaultdict[str, tuple[str, ...]] = defaultdict(tuple)
+         for facet, values in self.supplementary_facets.items():
+             supplementary_facets[facet] = values if isinstance(values, tuple) else (values,)
+
+         for facet in self.matching_facets:
+             values = tuple(group[facet].unique())
+             supplementary_facets[facet] += values
+
+         supplementary_group = data_catalog
+         for facet, values in supplementary_facets.items():
+             mask = supplementary_group[facet].isin(values)
+             supplementary_group = supplementary_group[mask]
+
+         if not supplementary_group.empty and self.optional_matching_facets:
+             facets = list(self.matching_facets + self.optional_matching_facets)
+             datasets = group[facets].drop_duplicates()
+             indices = set()
+             for i in range(len(datasets)):
+                 scores = (supplementary_group[facets] == datasets.iloc[i]).sum(axis=1)
+                 matches = supplementary_group[scores == scores.max()]
+                 # Select the latest version if there are multiple matches
+                 matches = matches[matches["version"] == matches["version"].max()]
+                 indices.add(matches.index[0])
+             supplementary_group = supplementary_group.loc[list(indices)].drop_duplicates()
+
+         return pd.concat([group, supplementary_group])
+
+     @classmethod
+     def from_defaults(
+         cls,
+         variable: str,
+         source_type: SourceDatasetType,
+     ) -> Self:
+         """
+         Include e.g. a cell measure or ancillary variable in the selection.
+
+         The constraint is created using the defaults for the source_type.
+
+         Parameters
+         ----------
+         variable
+             The name of the variable to add.
+         source_type
+             The source_type of the variable to add.
+
+         Returns
+         -------
+         :
+             A constraint to include a supplementary variable.
+         """
+         kwargs = {
+             SourceDatasetType.CMIP6: {
+                 "matching_facets": (
+                     "source_id",
+                     "grid_label",
+                 ),
+                 "optional_matching_facets": (
+                     "table_id",
+                     "experiment_id",
+                     "member_id",
+                     "version",
+                 ),
+             }
+         }
+         variable_facet = {
+             SourceDatasetType.CMIP6: "variable_id",
+         }
+
+         supplementary_facets = {variable_facet[source_type]: variable}
+         return cls(supplementary_facets, **kwargs[source_type])
+
+
+ @frozen
+ class RequireContiguousTimerange:
+     """
+     A constraint that requires datasets to have a contiguous timerange.
+     """
+
+     group_by: tuple[str, ...]
+     """
+     The fields to group the datasets by. Each group must be contiguous in time
+     to fulfill the constraint.
+     """
+
+     def validate(self, group: pd.DataFrame) -> bool:
+         """
+         Check that all subgroups of the group have a contiguous timerange.
+         """
+         # Maximum allowed time difference between the end of one file and the
+         # start of the next file.
+         max_timedelta = pd.Timedelta(
+             days=31,  # Maximum number of days in a month.
+             hours=1,  # Allow for potential rounding errors.
+         )
+         group = group.dropna(subset=["start_time", "end_time"])
+         if len(group) < 2:  # noqa: PLR2004
+             return True
+
+         for _, subgroup in group.groupby(list(self.group_by)):
+             if len(subgroup) < 2:  # noqa: PLR2004
+                 continue
+             sorted_group = subgroup.sort_values("start_time", kind="stable")
+             start_series = sorted_group["start_time"]
+             end_series = sorted_group["end_time"]
+             # Sometimes the elements of start_series.values are of type datetime64[ns]
+             # and sometimes its elements are of type datetime.datetime.
+             # Convert both arrays to datetime.datetime objects to make sure they
+             # can be subtracted.
+             with warnings.catch_warnings():
+                 # We have already mitigated the future change in behaviour of DatetimeProperties.to_pydatetime
+                 warnings.simplefilter("ignore", FutureWarning)
+
+                 if hasattr(start_series, "dt"):
+                     start_array = np.array(start_series.dt.to_pydatetime())
+                 else:
+                     start_array = start_series.values  # type: ignore[assignment]
+                 if hasattr(end_series, "dt"):
+                     end_array = np.array(end_series.dt.to_pydatetime())
+                 else:
+                     end_array = end_series.values  # type: ignore[assignment]
+             diff = start_array[1:] - end_array[:-1]
+             gap_indices = diff > max_timedelta
+             if gap_indices.any():
+                 paths = sorted_group["path"]
+                 for gap_idx in np.flatnonzero(gap_indices):
+                     logger.debug(
+                         f"Constraint {self.__class__.__name__} not satisfied "
+                         f"because gap larger than {max_timedelta} found between "
+                         f"{paths.iloc[gap_idx]} and {paths.iloc[gap_idx + 1]}"
+                     )
+                 return False
+         return True
+
+
+ @frozen
+ class RequireOverlappingTimerange:
+     """
+     A constraint that requires datasets to have an overlapping timerange.
+     """
+
+     group_by: tuple[str, ...]
+     """
+     The fields to group the datasets by. There must be overlap in time between
+     the groups to fulfill the constraint.
+     """
+
+     def validate(self, group: pd.DataFrame) -> bool:
+         """
+         Check that all subgroups of the group have an overlapping timerange.
+         """
+         group = group.dropna(subset=["start_time", "end_time"])
+         if len(group) < 2:  # noqa: PLR2004
+             return True
+
+         starts = group.groupby(list(self.group_by))["start_time"].min()
+         ends = group.groupby(list(self.group_by))["end_time"].max()
+         return starts.max() < ends.min()  # type: ignore[no-any-return]
+
+
+ @frozen
+ class SelectParentExperiment:
+     """
+     Include a dataset's parent experiment in the selection
+     """
+
+     def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+         """
+         Include a dataset's parent experiment in the selection
+
+         Not yet implemented
+         """
+         raise NotImplementedError("This is not implemented yet")  # pragma: no cover
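
The `GroupOperation` and `GroupValidator` protocols above are structural (`runtime_checkable`), so any object exposing a matching `apply` or `validate` method can be passed to `apply_constraint`. A minimal usage sketch, assuming `climate-ref-core` is installed and importable as `climate_ref_core.constraints`; the `RequireSingleGrid` validator and the toy catalog are illustrative only, not part of the package:

```python
import pandas as pd

from climate_ref_core.constraints import RequireFacets, apply_constraint


class RequireSingleGrid:
    """Illustrative custom validator: all datasets in the group must share one grid_label."""

    def validate(self, group: pd.DataFrame) -> bool:
        return group["grid_label"].nunique() == 1


# Toy data catalog containing only the facets used below
catalog = pd.DataFrame(
    {
        "variable_id": ["tas", "areacella"],
        "grid_label": ["gn", "gn"],
    }
)

# apply_constraint returns the (possibly modified) group, or None if a constraint fails
group = apply_constraint(catalog, RequireFacets("variable_id", ("tas", "areacella")), catalog)
if group is not None:
    group = apply_constraint(group, RequireSingleGrid(), catalog)
print(group)
```

Because both checks pass for this toy catalog, the original group is returned unchanged; a failing check would instead log the violation and yield `None`.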
@@ -0,0 +1,158 @@
+ """
+ Data registries for non-published reference data
+
+ These data are placeholders until these data have been added to obs4MIPs.
+ The CMIP7 Assessment Fast Track REF requires that reference datasets are openly licensed
+ before they are included in any published data catalogs.
+ """
+
+ import importlib.resources
+ import os
+ import pathlib
+ import shutil
+
+ import pooch
+ from loguru import logger
+ from rich.progress import track
+
+
+ def fetch_all_files(
+     registry: pooch.Pooch,
+     name: str,
+     output_dir: pathlib.Path | None,
+     symlink: bool = False,
+ ) -> None:
+     """
+     Fetch all files associated with a pooch registry and write them to an output directory.
+
+     Pooch fetches, caches and validates the downloaded files.
+     Subsequent calls to this function will not refetch any previously downloaded files.
+
+     Parameters
+     ----------
+     registry
+         Pooch registry containing a set of files that should be fetched.
+     name
+         Name of the registry.
+     output_dir
+         The root directory to write the files to.
+
+         The directory will be created if it doesn't exist,
+         and matching files will be overwritten.
+
+         If no directory is provided, the files will be fetched from the remote server,
+         but not copied anywhere.
+     symlink
+         If True, symlink all files to this directory.
+         Otherwise, perform a copy.
+     """
+     if output_dir:
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+     for key in track(registry.registry.keys(), description=f"Fetching {name} data"):
+         fetch_file = registry.fetch(key)
+
+         if output_dir is None:
+             # Just warm the cache and move onto the next file
+             continue
+
+         linked_file = output_dir / key
+         linked_file.parent.mkdir(parents=True, exist_ok=True)
+         if not linked_file.exists():  # pragma: no cover
+             if symlink:
+                 logger.info(f"Linking {key} to {linked_file}")
+
+                 os.symlink(fetch_file, linked_file)
+             else:
+                 logger.info(f"Copying {key} to {linked_file}")
+                 shutil.copy(fetch_file, linked_file)
+         else:
+             logger.info(f"File {linked_file} already exists. Skipping.")
+
+
+ class DatasetRegistryManager:
+     """
+     A collection of reference dataset registries
+
+     The REF requires additional reference datasets
+     in addition to obs4MIPs data which can be downloaded via ESGF.
+     Each provider may have different sets of reference data that are needed.
+     These provider-specific datasets are either not yet available in obs4MIPs
+     or have been post-processed from obs4MIPs.
+
+     A dataset registry consists of a file that contains a list of files and checksums,
+     in combination with a base URL that is used to fetch the files.
+     [Pooch](https://www.fatiando.org/pooch/latest/) is used within the DatasetRegistryManager
+     to manage the caching, downloading and validation of the files.
+
+     All datasets that are registered here are expected to be openly licensed and freely available.
+     """
+
+     def __init__(self) -> None:
+         self._registries: dict[str, pooch.Pooch] = {}
+
+     def __getitem__(self, item: str) -> pooch.Pooch:
+         """
+         Get a registry by name
+         """
+         return self._registries[item]
+
+     def keys(self) -> list[str]:
+         """
+         Get the list of registry names
+         """
+         return list(self._registries.keys())
+
+     def register(  # noqa: PLR0913
+         self,
+         name: str,
+         base_url: str,
+         package: str,
+         resource: str,
+         cache_name: str | None = None,
+         version: str | None = None,
+     ) -> None:
+         """
+         Register a new dataset registry
+
+         This will create a new Pooch registry and add it to the list of registries.
+         This is typically used by a provider to register a new collection of datasets at runtime.
+
+         Parameters
+         ----------
+         name
+             Name of the registry
+
+             This is used to identify the registry
+         base_url
+             Common URL prefix for the files
+         package
+             Name of the package containing the registry resource.
+         resource
+             Name of the resource in the package that contains a list of files and checksums.
+
+             This must be formatted in a way that is expected by pooch.
+         cache_name
+             Name used to generate the cache directory.
+
+             This defaults to `"ref"` if not provided.
+         version
+             The version of the data.
+
+             Changing the version will invalidate the cache and force a re-download of the data.
+         """
+         if cache_name is None:
+             cache_name = "ref"
+
+         registry = pooch.create(
+             path=pooch.os_cache(cache_name),
+             base_url=base_url,
+             version=version,
+             retry_if_failed=5,
+             env="REF_DATASET_CACHE_DIR",
+         )
+         registry.load_registry(str(importlib.resources.files(package) / resource))
+         self._registries[name] = registry
+
+
+ dataset_registry_manager = DatasetRegistryManager()
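
A registration sketch for the `DatasetRegistryManager`, assuming the module above is importable as `climate_ref_core.dataset_registry`; the registry name, base URL, package and resource below are placeholders and would need to point at a real pooch registry file (lines of `<relative/path> <checksum>`) to actually download anything:

```python
import pathlib

from climate_ref_core.dataset_registry import dataset_registry_manager, fetch_all_files

# Hypothetical provider-side registration: "my_provider.data" must be an installed
# package that ships the registry text file "example_registry.txt".
dataset_registry_manager.register(
    "example-obs",
    base_url="https://example.org/reference-data/",
    package="my_provider.data",
    resource="example_registry.txt",
    version="v1",
)

# Fetch (or reuse cached copies of) every file in the registry and
# symlink them under a local output directory.
fetch_all_files(
    dataset_registry_manager["example-obs"],
    "example-obs",
    output_dir=pathlib.Path("reference-data"),
    symlink=True,
)
```

Setting the `REF_DATASET_CACHE_DIR` environment variable overrides the pooch cache location used by the registries.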
@@ -0,0 +1,157 @@
+ """
+ Dataset management and filtering
+ """
+
+ import enum
+ import functools
+ import hashlib
+ from collections.abc import Iterable
+ from typing import Any, Self
+
+ import pandas as pd
+ from attrs import field, frozen
+
+
+ class SourceDatasetType(enum.Enum):
+     """
+     Types of supported source datasets
+     """
+
+     CMIP6 = "cmip6"
+     CMIP7 = "cmip7"
+     obs4MIPs = "obs4mips"
+     PMPClimatology = "pmp-climatology"
+
+     @classmethod
+     @functools.lru_cache(maxsize=1)
+     def ordered(
+         cls,
+     ) -> list[Self]:
+         """
+         Return the dataset types ordered alphabetically by value
+
+         Returns
+         -------
+         :
+             Ordered list of dataset types
+         """
+         return sorted(cls, key=lambda x: x.value)
+
+
+ def _clean_facets(raw_values: dict[str, str | tuple[str, ...] | list[str]]) -> dict[str, tuple[str, ...]]:
+     """
+     Clean the value of a facet filter to a tuple of strings
+     """
+     result = {}
+
+     for key, value in raw_values.items():
+         if isinstance(value, list):
+             result[key] = tuple(value)
+         elif isinstance(value, str):
+             result[key] = (value,)
+         elif isinstance(value, tuple):
+             result[key] = value
+     return result
+
+
+ @frozen
+ class FacetFilter:
+     """
+     A filter to apply to a data catalog of datasets.
+     """
+
+     facets: dict[str, tuple[str, ...]] = field(converter=_clean_facets)
+     """
+     Filters to apply to the data catalog.
+
+     The keys are the metadata fields to filter on, and the values are the values to filter on.
+     The result will only contain datasets where for all fields,
+     the value of the field is one of the given values.
+     """
+     keep: bool = True
+     """
+     Whether to keep or remove datasets that match the filter.
+
+     If true (default), datasets that match the filter will be kept, else they will be removed.
+     """
+
+
+ @frozen
+ class DatasetCollection:
+     """
+     Group of datasets required for a given diagnostic execution for a specific source dataset type.
+     """
+
+     datasets: pd.DataFrame
+     slug_column: str
+     """
+     Column in datasets that contains the unique identifier for the dataset
+     """
+     selector: tuple[tuple[str, str], ...] = ()
+     """
+     Unique key, value pairs that were selected during the initial groupby
+     """
+
+     def __getattr__(self, item: str) -> Any:
+         return getattr(self.datasets, item)
+
+     def __getitem__(self, item: str | list[str]) -> Any:
+         return self.datasets[item]
+
+     def __hash__(self) -> int:
+         # This hashes each item individually and sums them so order doesn't matter
+         return int(pd.util.hash_pandas_object(self.datasets[self.slug_column]).sum())
+
+     def __eq__(self, other: object) -> bool:
+         return self.__hash__() == other.__hash__()
+
+
+ class ExecutionDatasetCollection:
+     """
+     The complete set of datasets required for an execution of a diagnostic.
+
+     This may cover multiple source dataset types.
+     """
+
+     def __init__(self, collection: dict[SourceDatasetType | str, DatasetCollection]):
+         self._collection = {SourceDatasetType(k): v for k, v in collection.items()}
+
+     def __contains__(self, key: SourceDatasetType | str) -> bool:
+         if isinstance(key, str):
+             key = SourceDatasetType(key)
+         return key in self._collection
+
+     def __getitem__(self, key: SourceDatasetType | str) -> DatasetCollection:
+         if isinstance(key, str):
+             key = SourceDatasetType(key)
+         return self._collection[key]
+
+     def __hash__(self) -> int:
+         return hash(self.hash)
+
+     def items(self) -> Iterable[tuple[SourceDatasetType, DatasetCollection]]:
+         """
+         Iterate over the datasets in the collection
+         """
+         return self._collection.items()
+
+     @property
+     def hash(self) -> str:
+         """
+         Unique identifier for the collection
+
+         A SHA1 hash is calculated of the combination of the hashes of the individual collections.
+         The value isn't reversible but can be used to uniquely identify the aggregate of the
+         collections.
+
+         Returns
+         -------
+         :
+             SHA1 hash of the collections
+         """
+         # The dataset collection hashes are reproducible,
+         # so we can use them to hash the diagnostic dataset.
+         # This isn't explicitly true for all Python hashes
+         hash_sum = sum(hash(item) for item in self._collection.values())
+         hash_bytes = hash_sum.to_bytes(16, "little", signed=True)
+         return hashlib.sha1(hash_bytes).hexdigest()  # noqa: S324
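
To illustrate how these pieces fit together, a small sketch building a `DatasetCollection` from a toy catalog and wrapping it in an `ExecutionDatasetCollection`; the column names (`instance_id`, `variable_id`) are assumptions for the example, not a fixed schema:

```python
import pandas as pd

from climate_ref_core.datasets import DatasetCollection, ExecutionDatasetCollection, FacetFilter

# Toy data catalog
catalog = pd.DataFrame(
    {
        "instance_id": ["CMIP6.ssp585.tas", "CMIP6.ssp585.pr"],
        "variable_id": ["tas", "pr"],
    }
)

# The converter normalises scalar or list facet values into tuples
keep_tas = FacetFilter(facets={"variable_id": "tas"})
assert keep_tas.facets == {"variable_id": ("tas",)}

# String keys are coerced to SourceDatasetType members
collection = ExecutionDatasetCollection(
    {"cmip6": DatasetCollection(datasets=catalog, slug_column="instance_id")}
)
print("cmip6" in collection)  # True
print(collection.hash)        # reproducible SHA1 derived from the slug column
```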