climate-ref-core 0.6.6__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/PKG-INFO +1 -1
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/pyproject.toml +1 -1
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/constraints.py +223 -79
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/datasets.py +3 -6
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/diagnostics.py +13 -6
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/logging.py +7 -3
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/metric_values/typing.py +29 -7
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/providers.py +51 -1
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/metric.py +12 -4
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/output.py +12 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/test_output_json_schema.yml +8 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/test_cmec_metric.py +1 -2
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/test_cmec_output.py +1 -4
- climate_ref_core-0.8.0/tests/unit/test_constraints.py +818 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_dataset_registry/test_dataset_registry.py +2 -2
- climate_ref_core-0.8.0/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +2 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_datasets.py +25 -19
- climate_ref_core-0.6.6/tests/unit/test_metrics.py → climate_ref_core-0.8.0/tests/unit/test_diagnostics.py +39 -76
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_logging.py +4 -7
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_providers.py +74 -6
- climate_ref_core-0.6.6/tests/unit/test_constraints.py +0 -438
- climate_ref_core-0.6.6/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +0 -2
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/.gitignore +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/LICENCE +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/NOTICE +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/README.md +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/__init__.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/dataset_registry.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/env.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/exceptions.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/executor.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/metric_values/__init__.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/py.typed +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/README.md +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/__init__.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/controlled_vocabulary.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/cv_cmip7_aft.yaml +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/metric_values/test_typing.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/cmec_metric_sample.json +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/cmec_output_sample.json +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/cv_sample.yaml +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/test_metric_json_schema.yml +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/conftest.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/test_controlled_vocabulary.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_datasets/dataset_collection_hash.yml +0 -0
- /climate_ref_core-0.6.6/tests/unit/test_datasets/metric_dataset_hash.yml → /climate_ref_core-0.8.0/tests/unit/test_datasets/execution_dataset_hash.yml +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_exceptions.py +0 -0
- {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_executor.py +0 -0

{climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climate-ref-core
-Version: 0.6.6
+Version: 0.8.0
 Summary: Core library for the CMIP Rapid Evaluation Framework
 Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
 License-Expression: Apache-2.0

{climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/constraints.py

@@ -6,6 +6,8 @@ import sys
 import warnings
 from collections import defaultdict
 from collections.abc import Mapping
+from datetime import datetime
+from functools import total_ordering
 from typing import Literal, Protocol, runtime_checkable

 if sys.version_info < (3, 11):
@@ -15,45 +17,21 @@ else:

 import numpy as np
 import pandas as pd
-from attrs import frozen
+from attrs import field, frozen
 from loguru import logger

 from climate_ref_core.datasets import SourceDatasetType
-from climate_ref_core.exceptions import ConstraintNotSatisfied


 @runtime_checkable
-class GroupValidator(Protocol):
+class GroupConstraint(Protocol):
     """
-
-
-    All constraints must be satisfied for a given group to be run.
-    """
-
-    def validate(self, group: pd.DataFrame) -> bool:
-        """
-        Validate if the constraint is satisfied by the dataset.
-
-        This is executed after the apply method to determine if the constraint is satisfied.
-        If the constraint is not satisfied, the group will not be executed.
-
-        Parameters
-        ----------
-        group
-            A group of datasets that is being validated.
-
-        Returns
-        -------
-        :
-            Whether the constraint is satisfied
-        """
-        ...
+    An operation to perform on a group of datasets resulting in a new group of datasets.

+    This is applied to a group of datasets representing the inputs to a potential diagnostic execution.

-
-
-    """
-    An operation to perform on a group of datasets resulting in a new group of datasets.
+    If the operation results in an empty group, the constraint is considered not satisfied.
+    The group must satisfy all constraints to be processed.

     !! warning

@@ -90,18 +68,6 @@ class GroupOperation(Protocol):
         ...


-GroupConstraint = GroupOperation | GroupValidator
-"""
-A constraint that must be satisfied for a group of datasets to be executed.
-
-This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
-The group must satisfy all constraints to be processed.
-
-This can include operations that are applied to a group of datasets which may modify the group,
-but may also include validators that check if the group satisfies a certain condition.
-"""
-
-
 def apply_constraint(
     dataframe: pd.DataFrame,
     constraint: GroupConstraint,
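
In 0.8.0 the separate operation/validator protocols are folded into the single GroupConstraint protocol: a constraint is any object with an apply method, and an empty result marks the group as not satisfied. A minimal sketch of a conforming constraint (not from the package; the "frequency" column name is an assumption):

import pandas as pd
from attrs import frozen


@frozen
class RequireMonthlyFrequency:
    """Example constraint: keep only datasets reported at monthly frequency."""

    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
        # Returning an empty DataFrame signals that the constraint is not satisfied.
        return group[group["frequency"] == "mon"]
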
@@ -124,41 +90,95 @@ def apply_constraint(
     :
         The updated group of datasets or None if the constraint was not satisfied
     """
-
-
-            constraint.apply(dataframe, data_catalog) if isinstance(constraint, GroupOperation) else dataframe
-        )
-
-        valid = constraint.validate(updated_group) if isinstance(constraint, GroupValidator) else True
-        if not valid:
-            logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
-            raise ConstraintNotSatisfied(f"Constraint {constraint} not satisfied for {dataframe}")
-    except ConstraintNotSatisfied:
+    updated_group = constraint.apply(dataframe, data_catalog)
+    if updated_group.empty:
         logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
         return None

     return updated_group


+def _to_tuple(value: None | str | tuple[str, ...]) -> tuple[str, ...]:
+    """
+    Normalize value to a tuple of strings.
+    """
+    if value is None:
+        return ()
+    if isinstance(value, str):
+        return (value,)
+    return tuple(value)
+
+
+def _to_tuple_dict(value: dict[str, str | tuple[str, ...]]) -> dict[str, tuple[str, ...]]:
+    """
+    Normalize value to a dict of tuples of strings.
+    """
+    return {k: _to_tuple(v) for k, v in value.items()}
+
+
 @frozen
 class RequireFacets:
     """
-    A constraint that requires
+    A constraint that requires datasets to have certain facet values.
     """

     dimension: str
-
+    """The name of the facet to filter on."""
+
+    required_facets: tuple[str, ...] = field(converter=_to_tuple)
+    "The required facet values."
+
     operator: Literal["all", "any"] = "all"
+    """Whether all or any of the required facets must be present."""

-
+    group_by: tuple[str, ...] | None = field(converter=_to_tuple, default=None)
+    """
+    The facets to group the datasets by.
+
+    Each group created by `group_by` must contain at least one dataset where the
+    value of the given dimension is in the list of required facet values.
+
+    For example, if there are multiple models and variables in the selection,
+    `group_by` can be used to make sure that only those models are selected that
+    provide all required variables.
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
-
+        Filter out groups of datasets that do not provide the required facets
         """
-        if self.dimension not in group:
-            logger.warning(f"Dimension {self.dimension} not present in group {group}")
-            return False
         op = all if self.operator == "all" else any
-
+        select = pd.Series(True, index=group.index)
+        groups = [group] if not self.group_by else (g[1] for g in group.groupby(list(self.group_by)))
+        for subgroup in groups:
+            if not op(value in subgroup[self.dimension].values for value in self.required_facets):
+                logger.debug(
+                    f"Constraint {self} not satisfied because required facet values "
+                    f"not found for group {', '.join(subgroup['path'])}"
+                )
+                select.loc[subgroup.index] = False
+        return group[select]
+
+
+@frozen
+class IgnoreFacets:
+    """
+    A constraint that ignores certain facet values.
+
+    Datasets with these facet values are removed from the selection.
+    """
+
+    facets: dict[str, str | tuple[str, ...]] = field(converter=_to_tuple_dict)
+    """The facet values to ignore."""
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+        """
+        Filter out datasets with the ignored facets.
+        """
+        mask = group[list(self.facets)].isin(self.facets).all(axis="columns")
+        if mask.any():
+            logger.debug(f"Ignoring files {', '.join(group.loc[mask, 'path'])} becauseof {self}")
+        return group[~mask]


 @frozen
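
A minimal usage sketch (not part of the diff) of the reworked RequireFacets and the new IgnoreFacets; the column names ("source_id", "variable_id", "path") and values are assumptions:

import pandas as pd
from climate_ref_core.constraints import IgnoreFacets, RequireFacets

group = pd.DataFrame(
    {
        "source_id": ["ModelA", "ModelA", "ModelB"],
        "variable_id": ["tas", "pr", "tas"],
        "path": ["a_tas.nc", "a_pr.nc", "b_tas.nc"],
    }
)

# Keep only models that provide both required variables; ModelB lacks "pr".
require = RequireFacets(
    dimension="variable_id", required_facets=("tas", "pr"), group_by=("source_id",)
)
print(require.apply(group, group)["source_id"].unique())  # ['ModelA']

# Drop every dataset matching an ignored facet value.
ignore = IgnoreFacets(facets={"variable_id": "pr"})
print(ignore.apply(group, group)["path"].tolist())  # ['a_tas.nc', 'b_tas.nc']
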
@@ -273,6 +293,123 @@ class AddSupplementaryDataset:
         return cls(supplementary_facets, **kwargs[source_type])


+@frozen
+@total_ordering
+class PartialDateTime:  # noqa: PLW1641
+    """
+    A partial datetime object that can be used to compare datetimes.
+
+    Only the specified fields are used for comparison.
+    """
+
+    year: int | None = None
+    month: int | None = None
+    day: int | None = None
+    hour: int | None = None
+    minute: int | None = None
+    second: int | None = None
+
+    @property
+    def _attrs(self) -> dict[str, int]:
+        """The attributes that are set."""
+        return {
+            a: v
+            for a in self.__slots__  # type: ignore[attr-defined]
+            if not a.startswith("_") and (v := getattr(self, a)) is not None
+        }
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({', '.join(f'{a}={v}' for a, v in self._attrs.items())})"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, datetime):
+            msg = (
+                f"Can only compare PartialDateTime with `datetime.datetime` "
+                f"objects, got object {other} of type {type(other)}"
+            )
+            raise TypeError(msg)
+
+        for attr, value in self._attrs.items():
+            other_value = getattr(other, attr)
+            if value != other_value:
+                return False
+        return True
+
+    def __lt__(self, other: object) -> bool:
+        if not isinstance(other, datetime):
+            msg = (
+                f"Can only compare PartialDateTime with `datetime.datetime` "
+                f"objects, got object {other} of type {type(other)}"
+            )
+            raise TypeError(msg)
+
+        for attr, value in self._attrs.items():
+            other_value = getattr(other, attr)
+            if value != other_value:
+                return value < other_value  # type: ignore[no-any-return]
+        return False
+
+
+@frozen
+class RequireTimerange:
+    """
+    A constraint that requires datasets to have a specific timerange.
+
+    Specify the start and/or end of the required timerange using a precision
+    that matches the frequency of the datasets.
+
+    For example, to ensure that datasets at monthly frequency cover the period
+    from 2000 to 2010, use start=PartialDateTime(year=2000, month=1) and
+    end=PartialDateTime(year=2010, month=12).
+    """
+
+    group_by: tuple[str, ...]
+    """
+    The fields to group the datasets by. Groups that do not cover the timerange
+    will be removed.
+    """
+
+    start: PartialDateTime | None = None
+    """
+    The start time of the required timerange. If None, no start time is required.
+    """
+
+    end: PartialDateTime | None = None
+    """
+    The end time of the required timerange. If None, no end time is required.
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+        """
+        Check that all subgroups of the group have a contiguous timerange.
+        """
+        select = pd.Series(True, index=group.index)
+        for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
+            start = subgroup["start_time"].min()
+            end = subgroup["end_time"].max()
+            result = True
+            if self.start is not None and start > self.start:
+                logger.debug(
+                    f"Constraint {self} not satisfied because start time {start} "
+                    f"is after required start time for {', '.join(subgroup['path'])}"
+                )
+                result = False
+            if self.end is not None and end < self.end:
+                logger.debug(
+                    f"Constraint {self} not satisfied because end time {end} "
+                    f"is before required end time for {', '.join(subgroup['path'])}"
+                )
+                result = False
+            if result:
+                contiguous_subgroup = RequireContiguousTimerange(group_by=self.group_by).apply(
+                    subgroup, data_catalog
+                )
+                result = len(contiguous_subgroup) == len(subgroup)
+            if not result:
+                select.loc[subgroup.index] = False
+        return group[select]
+
+
 @frozen
 class RequireContiguousTimerange:
     """
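
A minimal sketch (not part of the diff) of how PartialDateTime and RequireTimerange combine; the column names and example values are assumptions:

from datetime import datetime

import pandas as pd
from climate_ref_core.constraints import PartialDateTime, RequireTimerange

# Only the fields that are set take part in the comparison.
print(PartialDateTime(year=2000) == datetime(2000, 7, 1))  # True
print(PartialDateTime(year=2000, month=6) < datetime(2000, 7, 1))  # True

constraint = RequireTimerange(
    group_by=("source_id",),
    start=PartialDateTime(year=2000, month=1),
    end=PartialDateTime(year=2010, month=12),
)
group = pd.DataFrame(
    {
        "source_id": ["ModelA"],
        "start_time": [datetime(2001, 1, 16)],  # starts a year too late
        "end_time": [datetime(2010, 12, 16)],
        "path": ["a_tas.nc"],
    }
)
print(constraint.apply(group, group).empty)  # True: the group is removed
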
@@ -281,11 +418,11 @@ class RequireContiguousTimerange:

     group_by: tuple[str, ...]
     """
-    The fields to group the datasets by.
-
+    The fields to group the datasets by. Groups that are not be contiguous in time
+    are removed.
     """

-    def
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
         Check that all subgroups of the group have a contiguous timerange.
         """
@@ -295,11 +432,10 @@ class RequireContiguousTimerange:
             days=31,  # Maximum number of days in a month.
             hours=1,  # Allow for potential rounding errors.
         )
-        group = group.dropna(subset=["start_time", "end_time"])
-        if len(group) < 2:  # noqa: PLR2004
-            return True

-
+        select = pd.Series(True, index=group.index)
+
+        for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
             if len(subgroup) < 2:  # noqa: PLR2004
                 continue
             sorted_group = subgroup.sort_values("start_time", kind="stable")
@@ -327,12 +463,13 @@ class RequireContiguousTimerange:
             paths = sorted_group["path"]
             for gap_idx in np.flatnonzero(gap_indices):
                 logger.debug(
-                    f"Constraint {self
-                    f"
+                    f"Constraint {self} not satisfied because gap larger "
+                    f"than {max_timedelta} found between "
                     f"{paths.iloc[gap_idx]} and {paths.iloc[gap_idx + 1]}"
                 )
-
-
+                select.loc[subgroup.index] = False
+
+        return group[select]


 @frozen
@@ -347,17 +484,24 @@ class RequireOverlappingTimerange:
     the groups to fulfill the constraint.
     """

-    def
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
         Check that all subgroups of the group have an overlapping timerange.
         """
-
-        if len(
-            return
-
-        starts =
-        ends =
-
+        group_with_time = group.dropna(subset=["start_time", "end_time"])
+        if len(group_with_time) < 2:  # noqa: PLR2004
+            return group
+
+        starts = group_with_time.groupby(list(self.group_by))["start_time"].min()
+        ends = group_with_time.groupby(list(self.group_by))["end_time"].max()
+        result = starts.max() < ends.min()
+        if not result:
+            logger.debug(
+                f"Constraint {self} not satisfied because no overlapping timerange "
+                f"found for groups in {', '.join(group['path'])}"
+            )
+            return group.loc[[]]
+        return group


 @frozen
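
A minimal sketch (not part of the diff) of the overlap test now used by RequireOverlappingTimerange.apply; the group labels and dates are assumptions:

import pandas as pd

# Per-group earliest start and latest end, as produced by the groupby calls above.
starts = pd.Series({"ModelA": pd.Timestamp("2000-01-01"), "ModelB": pd.Timestamp("2003-01-01")})
ends = pd.Series({"ModelA": pd.Timestamp("2002-12-31"), "ModelB": pd.Timestamp("2010-12-31")})

# The whole selection is kept only if the latest start precedes the earliest end.
print(starts.max() < ends.min())  # False: ModelB starts after ModelA has ended
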

{climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/datasets.py

@@ -76,12 +76,6 @@ class FacetFilter:
     The result will only contain datasets where for all fields,
     the value of the field is one of the given values.
     """
-    keep: bool = True
-    """
-    Whether to keep or remove datasets that match the filter.
-
-    If true (default), datasets that match the filter will be kept else they will be removed.
-    """


 def sort_selector(inp: Selector) -> Selector:
@@ -159,6 +153,9 @@ class ExecutionDatasetCollection:
     def __init__(self, collection: dict[SourceDatasetType | str, DatasetCollection]):
         self._collection = {SourceDatasetType(k): v for k, v in collection.items()}

+    def __repr__(self) -> str:
+        return f"ExecutionDatasetCollection({self._collection})"
+
     def __contains__(self, key: SourceDatasetType | str) -> bool:
         if isinstance(key, str):
             key = SourceDatasetType(key)

{climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/diagnostics.py

@@ -321,7 +321,12 @@ class DataRequirement:
     Filters to apply to the data catalog of datasets.

     This is used to reduce the set of datasets to only those that are required by the diagnostic.
-
+
+    Each FacetFilter contains one or more facet values that must all be satisfied
+    for a dataset to match that filter. The overall selection keeps any dataset
+    that matches at least one of the provided filters.
+
+    If no filters are specified, all datasets in the data catalog are used.
     """

     group_by: tuple[str, ...] | None
@@ -361,6 +366,10 @@ class DataRequirement:
         :
             Filtered data catalog
         """
+        if not self.filters or any(not f.facets for f in self.filters):
+            return data_catalog
+
+        select = pd.Series(False, index=data_catalog.index)
         for facet_filter in self.filters:
             values = {}
             for facet, value in facet_filter.facets.items():
@@ -372,11 +381,9 @@ class DataRequirement:
                 )
                 values[facet] = clean_value

-
-
-
-            data_catalog = data_catalog[mask]
-        return data_catalog
+            select |= data_catalog[list(values)].isin(values).all(axis="columns")
+
+        return data_catalog[select]


 @runtime_checkable
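
A minimal sketch (not part of the diff) of the rewritten selection logic above: facet values are ANDed within a single filter and ORed across filters. The column names and values are assumptions:

import pandas as pd

data_catalog = pd.DataFrame(
    {
        "variable_id": ["tas", "pr", "tas"],
        "experiment_id": ["historical", "historical", "ssp585"],
    }
)
# Cleaned facet values, one dict per FacetFilter (values already normalised to tuples).
filters = [
    {"variable_id": ("tas",), "experiment_id": ("historical",)},  # all must match (AND)
    {"variable_id": ("pr",)},
]

select = pd.Series(False, index=data_catalog.index)
for values in filters:
    # A dataset matching any single filter is kept (OR across filters).
    select |= data_catalog[list(values)].isin(values).all(axis="columns")
print(data_catalog[select].index.tolist())  # [0, 1]
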

{climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/logging.py

@@ -9,6 +9,7 @@ import contextlib
 import inspect
 import logging
 import multiprocessing
+import os
 import sys
 from collections.abc import Generator
 from pathlib import Path
@@ -94,7 +95,10 @@ def initialise_logging(level: int | str, format: str, log_directory: str | Path)
     logger.info("Starting REF logging")
     logger.info(f"arguments: {sys.argv}")

-
+    # LOGURU_COLORIZE is the default env var used by loguru to determine if color should be used
+    # We override this to use NO_COLOR which is more widely supported
+    no_color = os.environ.get("NO_COLOR") is not None
+    add_log_handler(level=level, format=format, colorize=not no_color)


 def capture_logging() -> None:
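
A small sketch (not part of the diff) of the behaviour introduced above: any value for NO_COLOR, including an empty string, disables colourised console logging:

import os

os.environ["NO_COLOR"] = ""  # presence alone is enough
no_color = os.environ.get("NO_COLOR") is not None
print(not no_color)  # False -> the handler is added with colorize=False
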
@@ -150,12 +154,12 @@ def remove_log_handler() -> None:
     """
     if hasattr(logger, "default_handler_id"):
         try:
-            logger.remove(logger.default_handler_id)
+            logger.remove(logger.default_handler_id)  # pyright: ignore[reportAttributeAccessIssue]
         except ValueError:
             # This can happen if the handler has already been removed
             # or if the logger was never configured
             pass
-        del logger.default_handler_id
+        del logger.default_handler_id  # pyright: ignore[reportAttributeAccessIssue]
     else:
         raise AssertionError("No default log handler to remove.")

{climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/metric_values/typing.py
RENAMED

@@ -3,7 +3,8 @@ from collections.abc import Sequence
 from pathlib import Path
 from typing import Any, Self

-
+import numpy as np
+from pydantic import BaseModel, field_validator, model_validator

 Value = float | int

@@ -64,20 +65,35 @@ class SeriesMetricValue(BaseModel):
     This is used for presentation purposes and is not used in the controlled vocabulary.
     """

-    attributes: dict[str, str | Value] | None = None
+    attributes: dict[str, str | Value | None] | None = None
     """
     Additional unstructured attributes associated with the metric value
     """

     @model_validator(mode="after")
-    def
-        """Validate that index has the same length as values"""
+    def validate_index(self) -> Self:
+        """Validate that index has the same length as values and contains no NaNs"""
         if len(self.index) != len(self.values):
             raise ValueError(
                 f"Index length ({len(self.index)}) must match values length ({len(self.values)})"
             )
+        for v in self.index:
+            if isinstance(v, float) and not np.isfinite(v):
+                raise ValueError("NaN or Inf values are not allowed in the index")
         return self

+    @field_validator("values", mode="before")
+    @classmethod
+    def validate_values(cls, value: Any) -> Any:
+        """
+        Transform None values to NaN in the values field
+        """
+        if not isinstance(value, (list, tuple)):
+            raise ValueError("`values` must be a list or tuple.")
+
+        # Transform None values to NaN
+        return [float("nan") if v is None else v for v in value]
+
     @classmethod
     def dump_to_json(cls, path: Path, series: Sequence["SeriesMetricValue"]) -> None:
         """
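
A minimal sketch (not part of the diff) of what the new "values" pre-validator does before the model is built:

values = [13.9, None, 14.1]

if not isinstance(values, (list, tuple)):
    raise ValueError("`values` must be a list or tuple.")

# None entries become NaN; NaN/Inf remain forbidden in `index` by validate_index.
cleaned = [float("nan") if v is None else v for v in values]
print(cleaned)  # [13.9, nan, 14.1]
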
@@ -94,7 +110,13 @@ class SeriesMetricValue(BaseModel):
         The series values to dump.
         """
         with open(path, "w") as f:
-            json.dump(
+            json.dump(
+                [s.model_dump(mode="json") for s in series],
+                f,
+                indent=2,
+                allow_nan=False,
+                sort_keys=True,
+            )

     @classmethod
     def load_from_json(
@@ -102,7 +124,7 @@ class SeriesMetricValue(BaseModel):
         path: Path,
     ) -> list["SeriesMetricValue"]:
         """
-
+        Load a sequence of SeriesMetricValue from a JSON file.

         Parameters
         ----------
@@ -115,7 +137,7 @@ class SeriesMetricValue(BaseModel):
         if not isinstance(data, list):
             raise ValueError(f"Expected a list of series values, got {type(data)}")

-        return [cls.model_validate(s) for s in data]
+        return [cls.model_validate(s, strict=True) for s in data]


 class ScalarMetricValue(BaseModel):

{climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/providers.py

@@ -16,14 +16,18 @@ import os
 import stat
 import subprocess
 from abc import abstractmethod
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from contextlib import AbstractContextManager
 from pathlib import Path
 from typing import TYPE_CHECKING

 import requests
+import yaml
+from attrs import evolve
 from loguru import logger

+from climate_ref_core.constraints import IgnoreFacets
+from climate_ref_core.datasets import SourceDatasetType
 from climate_ref_core.diagnostics import Diagnostic
 from climate_ref_core.exceptions import InvalidDiagnosticException, InvalidProviderException

@@ -74,6 +78,51 @@ class DiagnosticProvider:
         config :
             A configuration.
         """
+        logger.debug(
+            f"Configuring provider {self.slug} using ignore_datasets_file {config.ignore_datasets_file}"
+        )
+        # The format of the configuration file is:
+        # provider:
+        #   diagnostic:
+        #     source_type:
+        #       - facet: value
+        #       - other_facet: [other_value1, other_value2]
+        ignore_datasets_all = yaml.safe_load(config.ignore_datasets_file.read_text(encoding="utf-8")) or {}
+        ignore_datasets = ignore_datasets_all.get(self.slug, {})
+        if unknown_slugs := {slug for slug in ignore_datasets} - {d.slug for d in self.diagnostics()}:
+            logger.warning(
+                f"Unknown diagnostics found in {config.ignore_datasets_file} "
+                f"for provider {self.slug}: {', '.join(sorted(unknown_slugs))}"
+            )
+
+        known_source_types = {s.value for s in iter(SourceDatasetType)}
+        for diagnostic in self.diagnostics():
+            if diagnostic.slug in ignore_datasets:
+                if unknown_source_types := set(ignore_datasets[diagnostic.slug]) - known_source_types:
+                    logger.warning(
+                        f"Unknown source types found in {config.ignore_datasets_file} for "
+                        f"diagnostic '{diagnostic.slug}' by provider {self.slug}: "
+                        f"{', '.join(sorted(unknown_source_types))}"
+                    )
+                data_requirements = (
+                    r if isinstance(r, Sequence) else (r,) for r in diagnostic.data_requirements
+                )
+                diagnostic.data_requirements = tuple(
+                    tuple(
+                        evolve(
+                            data_requirement,
+                            constraints=tuple(
+                                IgnoreFacets(facets)
+                                for facets in ignore_datasets[diagnostic.slug].get(
+                                    data_requirement.source_type.value, []
+                                )
+                            )
+                            + data_requirement.constraints,
+                        )
+                        for data_requirement in requirement_collection
+                    )
+                    for requirement_collection in data_requirements
+                )

     def diagnostics(self) -> list[Diagnostic]:
         """
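
A minimal sketch (not part of the diff) of an ignore_datasets_file in the format documented in the comment above; the provider/diagnostic slugs, source type and facet values are invented for illustration:

import yaml

ignore_datasets_all = yaml.safe_load(
    """
example-provider:
  example-diagnostic:
    cmip6:
      - source_id: BadModel
      - variable_id: [tas, pr]
"""
)
print(ignore_datasets_all["example-provider"]["example-diagnostic"]["cmip6"])
# [{'source_id': 'BadModel'}, {'variable_id': ['tas', 'pr']}]
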
@@ -287,6 +336,7 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):

     def configure(self, config: Config) -> None:
         """Configure the provider."""
+        super().configure(config)
         self.prefix = config.paths.software / "conda"

     def _install_conda(self, update: bool) -> Path: