climate-ref-core 0.6.5__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/PKG-INFO +2 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/pyproject.toml +2 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/constraints.py +198 -80
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/datasets.py +20 -8
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/diagnostics.py +38 -14
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/executor.py +4 -1
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/logging.py +7 -3
- climate_ref_core-0.7.0/src/climate_ref_core/metric_values/typing.py +139 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/providers.py +25 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/controlled_vocabulary.py +33 -15
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/output.py +12 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/metric_values/test_typing.py +32 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/test_output_json_schema.yml +8 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/test_cmec_metric.py +1 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/test_cmec_output.py +1 -4
- climate_ref_core-0.7.0/tests/unit/test_constraints.py +765 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_dataset_registry/test_dataset_registry.py +2 -2
- climate_ref_core-0.7.0/tests/unit/test_datasets/dataset_collection_hash.yml +2 -0
- climate_ref_core-0.7.0/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +2 -0
- climate_ref_core-0.7.0/tests/unit/test_datasets/execution_dataset_hash.yml +2 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_datasets.py +29 -11
- climate_ref_core-0.6.5/tests/unit/test_metrics.py → climate_ref_core-0.7.0/tests/unit/test_diagnostics.py +39 -76
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_logging.py +4 -7
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_providers.py +9 -2
- climate_ref_core-0.6.5/src/climate_ref_core/metric_values/typing.py +0 -74
- climate_ref_core-0.6.5/tests/unit/test_constraints.py +0 -438
- climate_ref_core-0.6.5/tests/unit/test_datasets/dataset_collection_hash.yml +0 -2
- climate_ref_core-0.6.5/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +0 -2
- climate_ref_core-0.6.5/tests/unit/test_datasets/metric_dataset_hash.yml +0 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/.gitignore +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/LICENCE +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/NOTICE +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/README.md +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/__init__.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/dataset_registry.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/env.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/exceptions.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/metric_values/__init__.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/py.typed +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/README.md +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/__init__.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/cv_cmip7_aft.yaml +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/metric.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/cmec_metric_sample.json +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/cmec_output_sample.json +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/cv_sample.yaml +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/test_metric_json_schema.yml +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/conftest.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/test_controlled_vocabulary.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_exceptions.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_executor.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climate-ref-core
-Version: 0.6.5
+Version: 0.7.0
 Summary: Core library for the CMIP Rapid Evaluation Framework
 Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
 License-Expression: Apache-2.0
@@ -29,7 +29,7 @@ Requires-Dist: pydantic>=2.10.6
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: requests
 Requires-Dist: rich
-Requires-Dist: setuptools
+Requires-Dist: setuptools<81
 Requires-Dist: typing-extensions
 Description-Content-Type: text/markdown
 
```
pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "climate-ref-core"
-version = "0.6.5"
+version = "0.7.0"
 description = "Core library for the CMIP Rapid Evaluation Framework"
 readme = "README.md"
 authors = [
@@ -39,7 +39,7 @@ dependencies = [
     "environs>=11",
     "pyyaml>=6.0.2",
     # Not used directly, but required to support some installations
-    "setuptools",
+    "setuptools<81",
 
     # SPEC 0000 constraints
     # We follow [SPEC-0000](https://scientific-python.org/specs/spec-0000/)
```
src/climate_ref_core/constraints.py (truncated old-side fragments in the source view are shown as `…`)

```diff
@@ -6,7 +6,9 @@ import sys
 import warnings
 from collections import defaultdict
 from collections.abc import Mapping
-from typing import Protocol, runtime_checkable
+from datetime import datetime
+from functools import total_ordering
+from typing import Literal, Protocol, runtime_checkable
 
 if sys.version_info < (3, 11):
     from typing_extensions import Self
@@ -15,45 +17,21 @@ else:
 
 import numpy as np
 import pandas as pd
-from attrs import frozen
+from attrs import field, frozen
 from loguru import logger
 
 from climate_ref_core.datasets import SourceDatasetType
-from climate_ref_core.exceptions import ConstraintNotSatisfied
 
 
 @runtime_checkable
-class GroupValidator(Protocol):
+class GroupConstraint(Protocol):
     """
-    …
-
-    All constraints must be satisfied for a given group to be run.
-    """
-
-    def validate(self, group: pd.DataFrame) -> bool:
-        """
-        Validate if the constraint is satisfied by the dataset.
-
-        This is executed after the apply method to determine if the constraint is satisfied.
-        If the constraint is not satisfied, the group will not be executed.
-
-        Parameters
-        ----------
-        group
-            A group of datasets that is being validated.
-
-        Returns
-        -------
-        :
-            Whether the constraint is satisfied
-        """
-        ...
+    An operation to perform on a group of datasets resulting in a new group of datasets.
 
+    This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
 
-@runtime_checkable
-class GroupOperation(Protocol):
-    """
-    An operation to perform on a group of datasets resulting in a new group of datasets.
+    If the operation results in an empty group, the constraint is considered not satisfied.
+    The group must satisfy all constraints to be processed.
 
     !! warning
 
@@ -90,18 +68,6 @@ class GroupOperation(Protocol):
         ...
 
 
-GroupConstraint = GroupOperation | GroupValidator
-"""
-A constraint that must be satisfied for a group of datasets to be executed.
-
-This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
-The group must satisfy all constraints to be processed.
-
-This can include operations that are applied to a group of datasets which may modify the group,
-but may also include validators that check if the group satisfies a certain condition.
-"""
-
-
 def apply_constraint(
     dataframe: pd.DataFrame,
     constraint: GroupConstraint,
```
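The protocol change collapses the old operation/validator split: a constraint is now anything with an `apply` method, and "not satisfied" is signalled by returning an empty group rather than raising `ConstraintNotSatisfied`. A minimal sketch of a custom constraint against the new protocol — the `institution_id` facet and the toy catalog are illustrative, not part of the package:

```python
import pandas as pd

from climate_ref_core.constraints import apply_constraint


class RequireSingleInstitution:
    """Illustrative constraint: keep a group only if it spans a single institution."""

    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
        # Returning the group unchanged means the constraint is satisfied;
        # returning an empty frame means it is not.
        if group["institution_id"].nunique() == 1:
            return group
        return group.iloc[0:0]


catalog = pd.DataFrame({"institution_id": ["NCAR", "NCAR"], "variable_id": ["tas", "pr"]})

# apply_constraint returns the updated group, or None when the result is empty
assert apply_constraint(catalog, RequireSingleInstitution(), catalog) is not None
```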
src/climate_ref_core/constraints.py (continued)

```diff
@@ -124,39 +90,67 @@ def apply_constraint(
     :
         The updated group of datasets or None if the constraint was not satisfied
     """
-    try:
-        updated_group = (
-            constraint.apply(dataframe, data_catalog) if isinstance(constraint, GroupOperation) else dataframe
-        )
-
-        valid = constraint.validate(updated_group) if isinstance(constraint, GroupValidator) else True
-        if not valid:
-            logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
-            raise ConstraintNotSatisfied(f"Constraint {constraint} not satisfied for {dataframe}")
-    except ConstraintNotSatisfied:
+    updated_group = constraint.apply(dataframe, data_catalog)
+    if updated_group.empty:
         logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
         return None
 
     return updated_group
 
 
+def _to_tuple(value: None | str | tuple[str, ...]) -> tuple[str, ...]:
+    """
+    Clean the value of group_by to a tuple of strings
+    """
+    if value is None:
+        return ()
+    if isinstance(value, str):
+        return (value,)
+    return tuple(value)
+
+
 @frozen
 class RequireFacets:
     """
-    A constraint that requires …
+    A constraint that requires datasets to have certain facet values.
     """
 
     dimension: str
-    …
+    """The name of the facet to filter on."""
 
-    def validate(self, group: pd.DataFrame) -> bool:
+    required_facets: tuple[str, ...] = field(converter=_to_tuple)
+    "The required facet values."
+
+    operator: Literal["all", "any"] = "all"
+    """Whether all or any of the required facets must be present."""
+
+    group_by: tuple[str, ...] | None = field(converter=_to_tuple, default=None)
+    """
+    The facets to group the datasets by.
+
+    Each group created by `group_by` must contain at least one dataset where the
+    value of the given dimension is in the list of required facet values.
+
+    For example, if there are multiple models and variables in the selection,
+    `group_by` can be used to make sure that only those models are selected that
+    provide all required variables.
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
-        …
+        Filter out groups of datasets that do not provide the required facets
         """
-        if self.…
-        …
-        …
-        …
+        op = all if self.operator == "all" else any
+        select = pd.Series(True, index=group.index)
+        groups = [group] if not self.group_by else (g[1] for g in group.groupby(list(self.group_by)))
+        for subgroup in groups:
+            if not op(value in subgroup[self.dimension].values for value in self.required_facets):
+                logger.debug(
+                    f"Constraint {self} not satisfied because required facet values "
+                    f"not found for group {', '.join(subgroup['path'])}"
+                )
+                select.loc[subgroup.index] = False
+        return group[select]
 
 
 @frozen
```
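`RequireFacets` is the first beneficiary of the new shape: instead of validating a group as a whole, it filters out the subgroups (per `group_by`) that lack the required facet values. A rough usage sketch — the catalog below is invented, with CMIP6-style facet names:

```python
import pandas as pd

from climate_ref_core.constraints import RequireFacets

catalog = pd.DataFrame(
    {
        "source_id": ["ModelA", "ModelA", "ModelB"],
        "variable_id": ["tas", "pr", "tas"],
        "path": ["a_tas.nc", "a_pr.nc", "b_tas.nc"],
    }
)

# Keep only models that provide *all* of tas and pr; the string passed to
# group_by is normalised to a tuple by the _to_tuple converter.
constraint = RequireFacets(
    dimension="variable_id",
    required_facets=("tas", "pr"),
    group_by="source_id",
)
filtered = constraint.apply(catalog, catalog)

# ModelB is dropped because it lacks "pr"
assert set(filtered["source_id"]) == {"ModelA"}
```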
src/climate_ref_core/constraints.py (continued)

```diff
@@ -271,6 +265,123 @@ class AddSupplementaryDataset:
         return cls(supplementary_facets, **kwargs[source_type])
 
 
+@frozen
+@total_ordering
+class PartialDateTime:  # noqa: PLW1641
+    """
+    A partial datetime object that can be used to compare datetimes.
+
+    Only the specified fields are used for comparison.
+    """
+
+    year: int | None = None
+    month: int | None = None
+    day: int | None = None
+    hour: int | None = None
+    minute: int | None = None
+    second: int | None = None
+
+    @property
+    def _attrs(self) -> dict[str, int]:
+        """The attributes that are set."""
+        return {
+            a: v
+            for a in self.__slots__  # type: ignore[attr-defined]
+            if not a.startswith("_") and (v := getattr(self, a)) is not None
+        }
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({', '.join(f'{a}={v}' for a, v in self._attrs.items())})"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, datetime):
+            msg = (
+                f"Can only compare PartialDateTime with `datetime.datetime` "
+                f"objects, got object {other} of type {type(other)}"
+            )
+            raise TypeError(msg)
+
+        for attr, value in self._attrs.items():
+            other_value = getattr(other, attr)
+            if value != other_value:
+                return False
+        return True
+
+    def __lt__(self, other: object) -> bool:
+        if not isinstance(other, datetime):
+            msg = (
+                f"Can only compare PartialDateTime with `datetime.datetime` "
+                f"objects, got object {other} of type {type(other)}"
+            )
+            raise TypeError(msg)
+
+        for attr, value in self._attrs.items():
+            other_value = getattr(other, attr)
+            if value != other_value:
+                return value < other_value  # type: ignore[no-any-return]
+        return False
+
+
+@frozen
+class RequireTimerange:
+    """
+    A constraint that requires datasets to have a specific timerange.
+
+    Specify the start and/or end of the required timerange using a precision
+    that matches the frequency of the datasets.
+
+    For example, to ensure that datasets at monthly frequency cover the period
+    from 2000 to 2010, use start=PartialDateTime(year=2000, month=1) and
+    end=PartialDateTime(year=2010, month=12).
+    """
+
+    group_by: tuple[str, ...]
+    """
+    The fields to group the datasets by. Groups that do not cover the timerange
+    will be removed.
+    """
+
+    start: PartialDateTime | None = None
+    """
+    The start time of the required timerange. If None, no start time is required.
+    """
+
+    end: PartialDateTime | None = None
+    """
+    The end time of the required timerange. If None, no end time is required.
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+        """
+        Check that all subgroups of the group have a contiguous timerange.
+        """
+        select = pd.Series(True, index=group.index)
+        for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
+            start = subgroup["start_time"].min()
+            end = subgroup["end_time"].max()
+            result = True
+            if self.start is not None and start > self.start:
+                logger.debug(
+                    f"Constraint {self} not satisfied because start time {start} "
+                    f"is after required start time for {', '.join(subgroup['path'])}"
+                )
+                result = False
+            if self.end is not None and end < self.end:
+                logger.debug(
+                    f"Constraint {self} not satisfied because end time {end} "
+                    f"is before required end time for {', '.join(subgroup['path'])}"
+                )
+                result = False
+            if result:
+                contiguous_subgroup = RequireContiguousTimerange(group_by=self.group_by).apply(
+                    subgroup, data_catalog
+                )
+                result = len(contiguous_subgroup) == len(subgroup)
+            if not result:
+                select.loc[subgroup.index] = False
+        return group[select]
+
+
 @frozen
 class RequireContiguousTimerange:
     """
```
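`PartialDateTime` compares only the fields that are set, so a year/month bound matches any day or time inside that month, and comparing against anything but a `datetime` raises `TypeError`. A short illustration of these semantics (values invented):

```python
from datetime import datetime

from climate_ref_core.constraints import PartialDateTime

bound = PartialDateTime(year=2000, month=1)

# Unset fields (day, hour, ...) are ignored in the comparison
assert datetime(2000, 1, 15) == bound

# Ordering works via the reflected operators that @total_ordering derives
assert datetime(1999, 12, 31) < bound
assert datetime(2000, 2, 1) > bound

# Comparing against anything other than a datetime raises TypeError:
# bound == "2000-01"  # would raise
```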
src/climate_ref_core/constraints.py (continued)

```diff
@@ -279,11 +390,11 @@ class RequireContiguousTimerange:
 
     group_by: tuple[str, ...]
     """
-    The fields to group the datasets by.
-    …
+    The fields to group the datasets by. Groups that are not be contiguous in time
+    are removed.
     """
 
-    def validate(self, group: pd.DataFrame) -> bool:
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
         Check that all subgroups of the group have a contiguous timerange.
         """
@@ -293,11 +404,10 @@ class RequireContiguousTimerange:
             days=31,  # Maximum number of days in a month.
            hours=1,  # Allow for potential rounding errors.
         )
-        group = group.dropna(subset=["start_time", "end_time"])
-        if len(group) < 2:  # noqa: PLR2004
-            return True
 
-        for _, subgroup in group.groupby(list(self.group_by)):
+        select = pd.Series(True, index=group.index)
+
+        for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
             if len(subgroup) < 2:  # noqa: PLR2004
                 continue
             sorted_group = subgroup.sort_values("start_time", kind="stable")
@@ -325,12 +435,13 @@ class RequireContiguousTimerange:
             paths = sorted_group["path"]
             for gap_idx in np.flatnonzero(gap_indices):
                 logger.debug(
-                    f"Constraint {self…
-                    f"…
+                    f"Constraint {self} not satisfied because gap larger "
+                    f"than {max_timedelta} found between "
                     f"{paths.iloc[gap_idx]} and {paths.iloc[gap_idx + 1]}"
                 )
-                return False
-        return True
+                select.loc[subgroup.index] = False
+
+        return group[select]
 
 
 @frozen
@@ -345,17 +456,24 @@ class RequireOverlappingTimerange:
     the groups to fulfill the constraint.
     """
 
-    def validate(self, group: pd.DataFrame) -> bool:
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
         Check that all subgroups of the group have an overlapping timerange.
         """
-        group = group.dropna(subset=["start_time", "end_time"])
-        if len(group) < 2:  # noqa: PLR2004
-            return True
-
-        starts = group.groupby(list(self.group_by))["start_time"].min()
-        ends = group.groupby(list(self.group_by))["end_time"].max()
-        return starts.max() < ends.min()
+        group_with_time = group.dropna(subset=["start_time", "end_time"])
+        if len(group_with_time) < 2:  # noqa: PLR2004
+            return group
+
+        starts = group_with_time.groupby(list(self.group_by))["start_time"].min()
+        ends = group_with_time.groupby(list(self.group_by))["end_time"].max()
+        result = starts.max() < ends.min()
+        if not result:
+            logger.debug(
+                f"Constraint {self} not satisfied because no overlapping timerange "
+                f"found for groups in {', '.join(group['path'])}"
+            )
+            return group.loc[[]]
+        return group
 
 
 @frozen
```
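All three timerange constraints now share the apply-and-filter shape, so they compose with `apply_constraint` like any other `GroupConstraint`. A sketch of requiring monthly data to cover 2000–2010 — the catalog below is invented; `start_time`/`end_time` are the columns these constraints inspect:

```python
import pandas as pd

from climate_ref_core.constraints import (
    PartialDateTime,
    RequireTimerange,
    apply_constraint,
)

catalog = pd.DataFrame(
    {
        "source_id": ["ModelA", "ModelA"],
        "path": ["a_1.nc", "a_2.nc"],
        "start_time": pd.to_datetime(["2000-01-16", "2005-01-16"]),
        "end_time": pd.to_datetime(["2004-12-16", "2010-12-16"]),
    }
)

constraint = RequireTimerange(
    group_by=("source_id",),
    start=PartialDateTime(year=2000, month=1),
    end=PartialDateTime(year=2010, month=12),
)

# The group covers 2000-01 through 2010-12 with no gap larger than a month,
# so the contiguity sub-check passes and apply_constraint does not return None.
assert apply_constraint(catalog, constraint, catalog) is not None
```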
src/climate_ref_core/datasets.py

```diff
@@ -5,7 +5,7 @@ Dataset management and filtering
 import enum
 import functools
 import hashlib
-from collections.abc import Collection, Iterable
+from collections.abc import Collection, Iterable, Iterator
 from typing import Any, Self
 
 import pandas as pd
@@ -76,12 +76,6 @@ class FacetFilter:
     The result will only contain datasets where for all fields,
     the value of the field is one of the given values.
     """
-    keep: bool = True
-    """
-    Whether to keep or remove datasets that match the filter.
-
-    If true (default), datasets that match the filter will be kept else they will be removed.
-    """
 
 
 def sort_selector(inp: Selector) -> Selector:
@@ -159,6 +153,9 @@ class ExecutionDatasetCollection:
     def __init__(self, collection: dict[SourceDatasetType | str, DatasetCollection]):
         self._collection = {SourceDatasetType(k): v for k, v in collection.items()}
 
+    def __repr__(self) -> str:
+        return f"ExecutionDatasetCollection({self._collection})"
+
     def __contains__(self, key: SourceDatasetType | str) -> bool:
         if isinstance(key, str):
             key = SourceDatasetType(key)
@@ -172,9 +169,24 @@ class ExecutionDatasetCollection:
     def __hash__(self) -> int:
         return hash(self.hash)
 
+    def __iter__(self) -> Iterator[SourceDatasetType]:
+        return iter(self._collection)
+
+    def keys(self) -> Iterable[SourceDatasetType]:
+        """
+        Iterate over the source types in the collection.
+        """
+        return self._collection.keys()
+
+    def values(self) -> Iterable[DatasetCollection]:
+        """
+        Iterate over the datasets in the collection.
+        """
+        return self._collection.values()
+
     def items(self) -> Iterable[tuple[SourceDatasetType, DatasetCollection]]:
         """
-        Iterate over the …
+        Iterate over the items in the collection.
         """
         return self._collection.items()
 
```
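With `__iter__`, `keys`, `values` and the clarified `items`, an `ExecutionDatasetCollection` can now be traversed like a read-only mapping from source type to `DatasetCollection`. A hedged sketch — the `"cmip6"` key assumes that value exists in `SourceDatasetType`:

```python
from climate_ref_core.datasets import ExecutionDatasetCollection


def summarise(collection: ExecutionDatasetCollection) -> None:
    # __contains__ accepts either a SourceDatasetType or its string value
    if "cmip6" in collection:
        print("has CMIP6 inputs")

    # Iterating yields the source types, mirroring dict semantics
    for source_type in collection:
        print(source_type)

    # keys(), values() and items() expose the underlying dict views
    for source_type, datasets in collection.items():
        print(source_type, datasets)
```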
src/climate_ref_core/diagnostics.py (truncated old-side fragments are shown as `…`)

```diff
@@ -14,6 +14,7 @@ from attrs import field, frozen
 from climate_ref_core.constraints import GroupConstraint
 from climate_ref_core.datasets import ExecutionDatasetCollection, FacetFilter, SourceDatasetType
 from climate_ref_core.metric_values import SeriesMetricValue
+from climate_ref_core.metric_values.typing import SeriesDefinition
 from climate_ref_core.pycmec.metric import CMECMetric
 from climate_ref_core.pycmec.output import CMECOutput
 
@@ -182,9 +183,11 @@ class ExecutionResult:
     Whether the diagnostic execution ran successfully.
     """
 
-    …
+    series_filename: pathlib.Path | None = None
     """
     A collection of series metric values that were extracted from the execution.
+
+    These are written to a CSV file in the output directory.
     """
 
     @staticmethod
@@ -193,6 +196,7 @@ class ExecutionResult:
         *,
         cmec_output_bundle: CMECOutput | dict[str, Any],
         cmec_metric_bundle: CMECMetric | dict[str, Any],
+        series: Sequence[SeriesMetricValue] = tuple(),
     ) -> ExecutionResult:
         """
         Build a ExecutionResult from a CMEC output bundle.
@@ -205,6 +209,8 @@ class ExecutionResult:
             An output bundle in the CMEC format.
         cmec_metric_bundle
             An diagnostic bundle in the CMEC format.
+        series
+            Series metric values extracted from the execution.
 
         Returns
         -------
@@ -223,17 +229,21 @@ class ExecutionResult:
             cmec_metric = cmec_metric_bundle
 
         definition.to_output_path(filename=None).mkdir(parents=True, exist_ok=True)
-        bundle_path = definition.to_output_path("output.json")
-        cmec_output.dump_to_json(bundle_path)
 
-        …
-        …
-        …
+        output_filename = "output.json"
+        metric_filename = "diagnostic.json"
+        series_filename = "series.json"
+
+        cmec_output.dump_to_json(definition.to_output_path(output_filename))
+        cmec_metric.dump_to_json(definition.to_output_path(metric_filename))
+        SeriesMetricValue.dump_to_json(definition.to_output_path(series_filename), series)
 
+        # We are using relative paths for the output files for portability of the results
         return ExecutionResult(
             definition=definition,
-            output_bundle_filename=pathlib.Path(…
-            metric_bundle_filename=pathlib.Path(…
+            output_bundle_filename=pathlib.Path(output_filename),
+            metric_bundle_filename=pathlib.Path(metric_filename),
+            series_filename=pathlib.Path(series_filename),
             successful=True,
         )
 
```
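The builder now writes three files — `output.json`, `diagnostic.json` and `series.json` — and records their relative paths on the result. A sketch of how a diagnostic might return its result under 0.7.0, assuming the static method modified above is the package's `build_from_output_bundle` builder (its name is not visible in this hunk):

```python
from climate_ref_core.diagnostics import ExecutionResult


def build_result(definition, output_bundle, metric_bundle):
    # output.json, diagnostic.json and series.json are written into the
    # execution's output directory; the returned result stores the
    # relative filenames for portability.
    return ExecutionResult.build_from_output_bundle(
        definition,
        cmec_output_bundle=output_bundle,  # dict or CMECOutput
        cmec_metric_bundle=metric_bundle,  # dict or CMECMetric
        series=(),  # optional sequence of SeriesMetricValue
    )
```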
src/climate_ref_core/diagnostics.py (continued)

```diff
@@ -311,7 +321,12 @@ class DataRequirement:
     Filters to apply to the data catalog of datasets.
 
     This is used to reduce the set of datasets to only those that are required by the diagnostic.
-    …
+
+    Each FacetFilter contains one or more facet values that must all be satisfied
+    for a dataset to match that filter. The overall selection keeps any dataset
+    that matches at least one of the provided filters.
+
+    If no filters are specified, all datasets in the data catalog are used.
     """
 
     group_by: tuple[str, ...] | None
@@ -351,6 +366,10 @@ class DataRequirement:
         :
             Filtered data catalog
         """
+        if not self.filters or any(not f.facets for f in self.filters):
+            return data_catalog
+
+        select = pd.Series(False, index=data_catalog.index)
         for facet_filter in self.filters:
             values = {}
             for facet, value in facet_filter.facets.items():
@@ -362,11 +381,9 @@ class DataRequirement:
                 )
                 values[facet] = clean_value
 
-            mask = data_catalog[list(values)].isin(values).all(axis="columns")
-            if not facet_filter.keep:
-                mask = ~mask
-            data_catalog = data_catalog[mask]
-        return data_catalog
+            select |= data_catalog[list(values)].isin(values).all(axis="columns")
+
+        return data_catalog[select]
 
 
 @runtime_checkable
```
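The rewritten `apply_filters` makes the selection semantics explicit: facets inside one `FacetFilter` are ANDed, separate filters are ORed, and an empty filter list keeps the whole catalog (the old `keep` inversion is gone). A small illustration with an invented catalog; the `facets` mapping is the filter's only remaining field:

```python
import pandas as pd

from climate_ref_core.datasets import FacetFilter

catalog = pd.DataFrame(
    {
        "variable_id": ["tas", "pr", "tas"],
        "experiment_id": ["historical", "historical", "ssp585"],
    }
)

filters = (
    # matches variable_id == "tas" AND experiment_id == "historical"
    FacetFilter(facets={"variable_id": "tas", "experiment_id": "historical"}),
    # OR any dataset with variable_id == "pr"
    FacetFilter(facets={"variable_id": "pr"}),
)
# A DataRequirement built with these filters keeps rows 0 and 1 but drops
# the ssp585 "tas" dataset; with filters=() the whole catalog is kept.
```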
src/climate_ref_core/diagnostics.py (continued)

```diff
@@ -432,6 +449,11 @@ class AbstractDiagnostic(Protocol):
     is raised.
     """
 
+    series: Sequence[SeriesDefinition]
+    """
+    Definition of the series that are produced by the diagnostic.
+    """
+
     provider: DiagnosticProvider
     """
     The provider that provides the diagnostic.
@@ -493,6 +515,8 @@ class Diagnostic(AbstractDiagnostic):
     See (climate_ref_example.example.ExampleDiagnostic)[] for an example implementation.
     """
 
+    series: Sequence[SeriesDefinition] = tuple()
+
     def __init__(self) -> None:
         super().__init__()
         self._provider: DiagnosticProvider | None = None
```
src/climate_ref_core/executor.py

```diff
@@ -160,12 +160,15 @@ def import_executor_cls(fqn: str) -> type[Executor]:
         imp = importlib.import_module(module)
         executor: type[Executor] = getattr(imp, attribute_name)
 
+        if isinstance(executor, Exception):
+            raise executor
+
         # We can't really check if the executor is a subclass of Executor here
         # Protocols can't be used with issubclass if they have non-method members
         # We have to check this at class instantiation time
 
         return executor
-    except ModuleNotFoundError:
+    except (ModuleNotFoundError, ImportError):
         logger.error(f"Package '{fqn}' not found")
         raise InvalidExecutorException(fqn, f"Module '{module}' not found")
     except AttributeError:
```
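`import_executor_cls` resolves a dotted path, now re-raises an exception object stored on the attribute, and treats a failed `ImportError` like a missing module. A sketch of the calling convention — the executor path is illustrative, and the example assumes `InvalidExecutorException` lives in `climate_ref_core.exceptions`:

```python
from climate_ref_core.exceptions import InvalidExecutorException
from climate_ref_core.executor import import_executor_cls

try:
    # The fqn is split into a module ("my_package.executors")
    # and an attribute name ("MyExecutor")
    executor_cls = import_executor_cls("my_package.executors.MyExecutor")
except InvalidExecutorException:
    ...  # unknown module/attribute, or the import itself raised
```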
src/climate_ref_core/logging.py

```diff
@@ -9,6 +9,7 @@ import contextlib
 import inspect
 import logging
 import multiprocessing
+import os
 import sys
 from collections.abc import Generator
 from pathlib import Path
@@ -94,7 +95,10 @@ def initialise_logging(level: int | str, format: str, log_directory: str | Path)
     logger.info("Starting REF logging")
     logger.info(f"arguments: {sys.argv}")
 
-    add_log_handler(level=level, format=format)
+    # LOGURU_COLORIZE is the default env var used by loguru to determine if color should be used
+    # We override this to use NO_COLOR which is more widely supported
+    no_color = os.environ.get("NO_COLOR") is not None
+    add_log_handler(level=level, format=format, colorize=not no_color)
 
 
 def capture_logging() -> None:
@@ -150,12 +154,12 @@ def remove_log_handler() -> None:
     """
     if hasattr(logger, "default_handler_id"):
         try:
-            logger.remove(logger.default_handler_id)
+            logger.remove(logger.default_handler_id)  # pyright: ignore[reportAttributeAccessIssue]
         except ValueError:
             # This can happen if the handler has already been removed
             # or if the logger was never configured
             pass
-        del logger.default_handler_id
+        del logger.default_handler_id  # pyright: ignore[reportAttributeAccessIssue]
     else:
         raise AssertionError("No default log handler to remove.")
 
```
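Colorized output is now opt-out through the widely supported NO_COLOR convention instead of loguru's own LOGURU_COLORIZE. A sketch of disabling color before initialising REF logging — the level, format, and directory values are placeholders:

```python
import os

from climate_ref_core.logging import initialise_logging

# Setting NO_COLOR to any value, even an empty string, disables colorized
# output: only the variable's presence is checked (`is not None`).
os.environ["NO_COLOR"] = "1"
initialise_logging(level="INFO", format="{time} {level} {message}", log_directory="logs")
```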