climate-ref-core 0.6.6__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/PKG-INFO +1 -1
  2. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/pyproject.toml +1 -1
  3. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/constraints.py +223 -79
  4. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/datasets.py +3 -6
  5. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/diagnostics.py +13 -6
  6. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/logging.py +7 -3
  7. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/metric_values/typing.py +29 -7
  8. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/providers.py +51 -1
  9. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/metric.py +12 -4
  10. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/output.py +12 -0
  11. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/test_output_json_schema.yml +8 -0
  12. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/test_cmec_metric.py +1 -2
  13. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/test_cmec_output.py +1 -4
  14. climate_ref_core-0.8.0/tests/unit/test_constraints.py +818 -0
  15. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_dataset_registry/test_dataset_registry.py +2 -2
  16. climate_ref_core-0.8.0/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +2 -0
  17. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_datasets.py +25 -19
  18. climate_ref_core-0.6.6/tests/unit/test_metrics.py → climate_ref_core-0.8.0/tests/unit/test_diagnostics.py +39 -76
  19. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_logging.py +4 -7
  20. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_providers.py +74 -6
  21. climate_ref_core-0.6.6/tests/unit/test_constraints.py +0 -438
  22. climate_ref_core-0.6.6/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +0 -2
  23. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/.gitignore +0 -0
  24. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/LICENCE +0 -0
  25. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/NOTICE +0 -0
  26. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/README.md +0 -0
  27. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/__init__.py +0 -0
  28. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/dataset_registry.py +0 -0
  29. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/env.py +0 -0
  30. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/exceptions.py +0 -0
  31. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/executor.py +0 -0
  32. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/metric_values/__init__.py +0 -0
  33. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/py.typed +0 -0
  34. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/README.md +0 -0
  35. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/__init__.py +0 -0
  36. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/controlled_vocabulary.py +0 -0
  37. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/src/climate_ref_core/pycmec/cv_cmip7_aft.yaml +0 -0
  38. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/metric_values/test_typing.py +0 -0
  39. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/cmec_metric_sample.json +0 -0
  40. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/cmec_output_sample.json +0 -0
  41. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/cv_sample.yaml +0 -0
  42. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/cmec_testdata/test_metric_json_schema.yml +0 -0
  43. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/conftest.py +0 -0
  44. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/pycmec/test_controlled_vocabulary.py +0 -0
  45. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_datasets/dataset_collection_hash.yml +0 -0
  46. /climate_ref_core-0.6.6/tests/unit/test_datasets/metric_dataset_hash.yml → /climate_ref_core-0.8.0/tests/unit/test_datasets/execution_dataset_hash.yml +0 -0
  47. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_exceptions.py +0 -0
  48. {climate_ref_core-0.6.6 → climate_ref_core-0.8.0}/tests/unit/test_executor.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: climate-ref-core
- Version: 0.6.6
+ Version: 0.8.0
  Summary: Core library for the CMIP Rapid Evaluation Framework
  Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
  License-Expression: Apache-2.0
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "climate-ref-core"
- version = "0.6.6"
+ version = "0.8.0"
  description = "Core library for the CMIP Rapid Evaluation Framework"
  readme = "README.md"
  authors = [
src/climate_ref_core/constraints.py
@@ -6,6 +6,8 @@ import sys
  import warnings
  from collections import defaultdict
  from collections.abc import Mapping
+ from datetime import datetime
+ from functools import total_ordering
  from typing import Literal, Protocol, runtime_checkable

  if sys.version_info < (3, 11):
@@ -15,45 +17,21 @@ else:

  import numpy as np
  import pandas as pd
- from attrs import frozen
+ from attrs import field, frozen
  from loguru import logger

  from climate_ref_core.datasets import SourceDatasetType
- from climate_ref_core.exceptions import ConstraintNotSatisfied


  @runtime_checkable
- class GroupValidator(Protocol):
+ class GroupConstraint(Protocol):
  """
- A constraint that must be satisfied when executing a given diagnostic run.
-
- All constraints must be satisfied for a given group to be run.
- """
-
- def validate(self, group: pd.DataFrame) -> bool:
- """
- Validate if the constraint is satisfied by the dataset.
-
- This is executed after the apply method to determine if the constraint is satisfied.
- If the constraint is not satisfied, the group will not be executed.
-
- Parameters
- ----------
- group
- A group of datasets that is being validated.
-
- Returns
- -------
- :
- Whether the constraint is satisfied
- """
- ...
+ An operation to perform on a group of datasets resulting in a new group of datasets.

+ This is applied to a group of datasets representing the inputs to a potential diagnostic execution.

- @runtime_checkable
- class GroupOperation(Protocol):
- """
- An operation to perform on a group of datasets resulting in a new group of datasets.
+ If the operation results in an empty group, the constraint is considered not satisfied.
+ The group must satisfy all constraints to be processed.

  !! warning

@@ -90,18 +68,6 @@ class GroupOperation:
  ...


- GroupConstraint = GroupOperation | GroupValidator
- """
- A constraint that must be satisfied for a group of datasets to be executed.
-
- This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
- The group must satisfy all constraints to be processed.
-
- This can include operations that are applied to a group of datasets which may modify the group,
- but may also include validators that check if the group satisfies a certain condition.
- """
-
-
  def apply_constraint(
  dataframe: pd.DataFrame,
  constraint: GroupConstraint,
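Note: as of 0.8.0 there is no separate GroupValidator; every constraint is a GroupConstraint with a single apply() method, and apply_constraint() (next hunk) treats an empty result as "not satisfied". A minimal sketch of a custom constraint under the new protocol; the class name and threshold below are invented for illustration and are not part of the package.

    import pandas as pd
    from attrs import frozen


    @frozen
    class RequireMinimumCount:
        """Hypothetical constraint: keep the group only if it contains at least `count` datasets."""

        count: int = 2

        def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
            if len(group) >= self.count:
                return group
            # An empty frame signals "not satisfied" to apply_constraint(), which then returns None
            return group.iloc[0:0]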
@@ -124,41 +90,95 @@ def apply_constraint(
  :
  The updated group of datasets or None if the constraint was not satisfied
  """
- try:
- updated_group = (
- constraint.apply(dataframe, data_catalog) if isinstance(constraint, GroupOperation) else dataframe
- )
-
- valid = constraint.validate(updated_group) if isinstance(constraint, GroupValidator) else True
- if not valid:
- logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
- raise ConstraintNotSatisfied(f"Constraint {constraint} not satisfied for {dataframe}")
- except ConstraintNotSatisfied:
+ updated_group = constraint.apply(dataframe, data_catalog)
+ if updated_group.empty:
  logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
  return None

  return updated_group


+ def _to_tuple(value: None | str | tuple[str, ...]) -> tuple[str, ...]:
+ """
+ Normalize value to a tuple of strings.
+ """
+ if value is None:
+ return ()
+ if isinstance(value, str):
+ return (value,)
+ return tuple(value)
+
+
+ def _to_tuple_dict(value: dict[str, str | tuple[str, ...]]) -> dict[str, tuple[str, ...]]:
+ """
+ Normalize value to a dict of tuples of strings.
+ """
+ return {k: _to_tuple(v) for k, v in value.items()}
+
+
  @frozen
  class RequireFacets:
  """
- A constraint that requires a dataset to have certain facets.
+ A constraint that requires datasets to have certain facet values.
  """

  dimension: str
- required_facets: tuple[str, ...]
+ """The name of the facet to filter on."""
+
+ required_facets: tuple[str, ...] = field(converter=_to_tuple)
+ "The required facet values."
+
  operator: Literal["all", "any"] = "all"
+ """Whether all or any of the required facets must be present."""

- def validate(self, group: pd.DataFrame) -> bool:
+ group_by: tuple[str, ...] | None = field(converter=_to_tuple, default=None)
+ """
+ The facets to group the datasets by.
+
+ Each group created by `group_by` must contain at least one dataset where the
+ value of the given dimension is in the list of required facet values.
+
+ For example, if there are multiple models and variables in the selection,
+ `group_by` can be used to make sure that only those models are selected that
+ provide all required variables.
+ """
+
+ def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
  """
- Check that the required facets are present in the group
+ Filter out groups of datasets that do not provide the required facets
  """
- if self.dimension not in group:
- logger.warning(f"Dimension {self.dimension} not present in group {group}")
- return False
  op = all if self.operator == "all" else any
- return op(value in group[self.dimension].values for value in self.required_facets)
+ select = pd.Series(True, index=group.index)
+ groups = [group] if not self.group_by else (g[1] for g in group.groupby(list(self.group_by)))
+ for subgroup in groups:
+ if not op(value in subgroup[self.dimension].values for value in self.required_facets):
+ logger.debug(
+ f"Constraint {self} not satisfied because required facet values "
+ f"not found for group {', '.join(subgroup['path'])}"
+ )
+ select.loc[subgroup.index] = False
+ return group[select]
+
+
+ @frozen
+ class IgnoreFacets:
+ """
+ A constraint that ignores certain facet values.
+
+ Datasets with these facet values are removed from the selection.
+ """
+
+ facets: dict[str, str | tuple[str, ...]] = field(converter=_to_tuple_dict)
+ """The facet values to ignore."""
+
+ def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+ """
+ Filter out datasets with the ignored facets.
+ """
+ mask = group[list(self.facets)].isin(self.facets).all(axis="columns")
+ if mask.any():
+ logger.debug(f"Ignoring files {', '.join(group.loc[mask, 'path'])} becauseof {self}")
+ return group[~mask]


  @frozen
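For reference, a small usage sketch of the reworked RequireFacets and the new IgnoreFacets. The catalog columns and facet values here (path, source_id, variable_id, MODEL-A, MODEL-B, tas, pr) are invented for the example; only the constraint API comes from the code above.

    import pandas as pd

    from climate_ref_core.constraints import IgnoreFacets, RequireFacets

    catalog = pd.DataFrame(
        {
            "path": ["a.nc", "b.nc", "c.nc"],
            "source_id": ["MODEL-A", "MODEL-A", "MODEL-B"],
            "variable_id": ["tas", "pr", "tas"],
        }
    )

    # Keep only datasets from models that provide both "tas" and "pr"
    require = RequireFacets(
        dimension="variable_id",
        required_facets=("tas", "pr"),
        group_by="source_id",  # the _to_tuple converter also accepts a single string
    )
    print(require.apply(catalog, catalog)["path"].tolist())  # ['a.nc', 'b.nc']

    # Drop every dataset from MODEL-B
    ignore = IgnoreFacets(facets={"source_id": "MODEL-B"})
    print(ignore.apply(catalog, catalog)["path"].tolist())  # ['a.nc', 'b.nc']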
@@ -273,6 +293,123 @@ class AddSupplementaryDataset:
  return cls(supplementary_facets, **kwargs[source_type])


+ @frozen
+ @total_ordering
+ class PartialDateTime: # noqa: PLW1641
+ """
+ A partial datetime object that can be used to compare datetimes.
+
+ Only the specified fields are used for comparison.
+ """
+
+ year: int | None = None
+ month: int | None = None
+ day: int | None = None
+ hour: int | None = None
+ minute: int | None = None
+ second: int | None = None
+
+ @property
+ def _attrs(self) -> dict[str, int]:
+ """The attributes that are set."""
+ return {
+ a: v
+ for a in self.__slots__ # type: ignore[attr-defined]
+ if not a.startswith("_") and (v := getattr(self, a)) is not None
+ }
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}({', '.join(f'{a}={v}' for a, v in self._attrs.items())})"
+
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, datetime):
+ msg = (
+ f"Can only compare PartialDateTime with `datetime.datetime` "
+ f"objects, got object {other} of type {type(other)}"
+ )
+ raise TypeError(msg)
+
+ for attr, value in self._attrs.items():
+ other_value = getattr(other, attr)
+ if value != other_value:
+ return False
+ return True
+
+ def __lt__(self, other: object) -> bool:
+ if not isinstance(other, datetime):
+ msg = (
+ f"Can only compare PartialDateTime with `datetime.datetime` "
+ f"objects, got object {other} of type {type(other)}"
+ )
+ raise TypeError(msg)
+
+ for attr, value in self._attrs.items():
+ other_value = getattr(other, attr)
+ if value != other_value:
+ return value < other_value # type: ignore[no-any-return]
+ return False
+
+
+ @frozen
+ class RequireTimerange:
+ """
+ A constraint that requires datasets to have a specific timerange.
+
+ Specify the start and/or end of the required timerange using a precision
+ that matches the frequency of the datasets.
+
+ For example, to ensure that datasets at monthly frequency cover the period
+ from 2000 to 2010, use start=PartialDateTime(year=2000, month=1) and
+ end=PartialDateTime(year=2010, month=12).
+ """
+
+ group_by: tuple[str, ...]
+ """
+ The fields to group the datasets by. Groups that do not cover the timerange
+ will be removed.
+ """
+
+ start: PartialDateTime | None = None
+ """
+ The start time of the required timerange. If None, no start time is required.
+ """
+
+ end: PartialDateTime | None = None
+ """
+ The end time of the required timerange. If None, no end time is required.
+ """
+
+ def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+ """
+ Check that all subgroups of the group have a contiguous timerange.
+ """
+ select = pd.Series(True, index=group.index)
+ for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
+ start = subgroup["start_time"].min()
+ end = subgroup["end_time"].max()
+ result = True
+ if self.start is not None and start > self.start:
+ logger.debug(
+ f"Constraint {self} not satisfied because start time {start} "
+ f"is after required start time for {', '.join(subgroup['path'])}"
+ )
+ result = False
+ if self.end is not None and end < self.end:
+ logger.debug(
+ f"Constraint {self} not satisfied because end time {end} "
+ f"is before required end time for {', '.join(subgroup['path'])}"
+ )
+ result = False
+ if result:
+ contiguous_subgroup = RequireContiguousTimerange(group_by=self.group_by).apply(
+ subgroup, data_catalog
+ )
+ result = len(contiguous_subgroup) == len(subgroup)
+ if not result:
+ select.loc[subgroup.index] = False
+ return group[select]
+
+
  @frozen
  class RequireContiguousTimerange:
  """
@@ -281,11 +418,11 @@ class RequireContiguousTimerange:

  group_by: tuple[str, ...]
  """
- The fields to group the datasets by. Each group must be contiguous in time
- to fulfill the constraint.
+ The fields to group the datasets by. Groups that are not be contiguous in time
+ are removed.
  """

- def validate(self, group: pd.DataFrame) -> bool:
+ def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
  """
  Check that all subgroups of the group have a contiguous timerange.
  """
@@ -295,11 +432,10 @@
  days=31, # Maximum number of days in a month.
  hours=1, # Allow for potential rounding errors.
  )
- group = group.dropna(subset=["start_time", "end_time"])
- if len(group) < 2: # noqa: PLR2004
- return True

- for _, subgroup in group.groupby(list(self.group_by)):
+ select = pd.Series(True, index=group.index)
+
+ for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
  if len(subgroup) < 2: # noqa: PLR2004
  continue
  sorted_group = subgroup.sort_values("start_time", kind="stable")
@@ -327,12 +463,13 @@
  paths = sorted_group["path"]
  for gap_idx in np.flatnonzero(gap_indices):
  logger.debug(
- f"Constraint {self.__class__.__name__} not satisfied "
- f"because gap larger than {max_timedelta} found between "
+ f"Constraint {self} not satisfied because gap larger "
+ f"than {max_timedelta} found between "
  f"{paths.iloc[gap_idx]} and {paths.iloc[gap_idx + 1]}"
  )
- return False
- return True
+ select.loc[subgroup.index] = False
+
+ return group[select]


  @frozen
@@ -347,17 +484,24 @@
  the groups to fulfill the constraint.
  """

- def validate(self, group: pd.DataFrame) -> bool:
+ def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
  """
  Check that all subgroups of the group have an overlapping timerange.
  """
- group = group.dropna(subset=["start_time", "end_time"])
- if len(group) < 2: # noqa: PLR2004
- return True
-
- starts = group.groupby(list(self.group_by))["start_time"].min()
- ends = group.groupby(list(self.group_by))["end_time"].max()
- return starts.max() < ends.min() # type: ignore[no-any-return]
+ group_with_time = group.dropna(subset=["start_time", "end_time"])
+ if len(group_with_time) < 2: # noqa: PLR2004
+ return group
+
+ starts = group_with_time.groupby(list(self.group_by))["start_time"].min()
+ ends = group_with_time.groupby(list(self.group_by))["end_time"].max()
+ result = starts.max() < ends.min()
+ if not result:
+ logger.debug(
+ f"Constraint {self} not satisfied because no overlapping timerange "
+ f"found for groups in {', '.join(group['path'])}"
+ )
+ return group.loc[[]]
+ return group


  @frozen
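The timerange constraints now return a filtered DataFrame instead of a bool. A sketch of RequireOverlappingTimerange; start_time and end_time are the catalog columns used by the code above, while path, source_type and the dates are invented for the example.

    import pandas as pd

    from climate_ref_core.constraints import RequireOverlappingTimerange

    catalog = pd.DataFrame(
        {
            "path": ["model.nc", "obs.nc"],
            "source_type": ["model", "obs"],
            "start_time": pd.to_datetime(["2000-01-01", "1995-01-01"]),
            "end_time": pd.to_datetime(["2014-12-31", "2005-12-31"]),
        }
    )

    constraint = RequireOverlappingTimerange(group_by=("source_type",))
    # The two groups overlap during 2000-2005, so the whole group is returned;
    # with no overlap an empty frame would be returned instead.
    print(len(constraint.apply(catalog, catalog)))  # 2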
src/climate_ref_core/datasets.py
@@ -76,12 +76,6 @@ class FacetFilter:
  The result will only contain datasets where for all fields,
  the value of the field is one of the given values.
  """
- keep: bool = True
- """
- Whether to keep or remove datasets that match the filter.
-
- If true (default), datasets that match the filter will be kept else they will be removed.
- """


  def sort_selector(inp: Selector) -> Selector:
@@ -159,6 +153,9 @@ class ExecutionDatasetCollection:
  def __init__(self, collection: dict[SourceDatasetType | str, DatasetCollection]):
  self._collection = {SourceDatasetType(k): v for k, v in collection.items()}

+ def __repr__(self) -> str:
+ return f"ExecutionDatasetCollection({self._collection})"
+
  def __contains__(self, key: SourceDatasetType | str) -> bool:
  if isinstance(key, str):
  key = SourceDatasetType(key)
src/climate_ref_core/diagnostics.py
@@ -321,7 +321,12 @@ class DataRequirement:
  Filters to apply to the data catalog of datasets.

  This is used to reduce the set of datasets to only those that are required by the diagnostic.
- The filters are applied iteratively to reduce the set of datasets.
+
+ Each FacetFilter contains one or more facet values that must all be satisfied
+ for a dataset to match that filter. The overall selection keeps any dataset
+ that matches at least one of the provided filters.
+
+ If no filters are specified, all datasets in the data catalog are used.
  """

  group_by: tuple[str, ...] | None
@@ -361,6 +366,10 @@
  :
  Filtered data catalog
  """
+ if not self.filters or any(not f.facets for f in self.filters):
+ return data_catalog
+
+ select = pd.Series(False, index=data_catalog.index)
  for facet_filter in self.filters:
  values = {}
  for facet, value in facet_filter.facets.items():
@@ -372,11 +381,9 @@
  )
  values[facet] = clean_value

- mask = data_catalog[list(values)].isin(values).all(axis="columns")
- if not facet_filter.keep:
- mask = ~mask
- data_catalog = data_catalog[mask]
- return data_catalog
+ select |= data_catalog[list(values)].isin(values).all(axis="columns")
+
+ return data_catalog[select]


  @runtime_checkable
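The filter selection above changed from iteratively narrowing the catalog (together with the removed keep flag) to an OR across filters with an AND over the facets inside each filter. A standalone pandas sketch of that behaviour; it does not construct a DataRequirement, and the facet names and values are illustrative.

    import pandas as pd

    data_catalog = pd.DataFrame(
        {
            "variable_id": ["tas", "pr", "tos"],
            "experiment_id": ["historical", "historical", "ssp585"],
        }
    )

    # Two facet filters: every facet within a filter must match, any filter may match.
    filters = [
        {"variable_id": ("tas",), "experiment_id": ("historical",)},
        {"variable_id": ("tos",)},
    ]

    select = pd.Series(False, index=data_catalog.index)
    for facets in filters:
        select |= data_catalog[list(facets)].isin(facets).all(axis="columns")

    print(data_catalog[select]["variable_id"].tolist())  # ['tas', 'tos']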
src/climate_ref_core/logging.py
@@ -9,6 +9,7 @@ import contextlib
  import inspect
  import logging
  import multiprocessing
+ import os
  import sys
  from collections.abc import Generator
  from pathlib import Path
@@ -94,7 +95,10 @@ def initialise_logging(level: int | str, format: str, log_directory: str | Path)
  logger.info("Starting REF logging")
  logger.info(f"arguments: {sys.argv}")

- add_log_handler(level=level, format=format, colorize=True)
+ # LOGURU_COLORIZE is the default env var used by loguru to determine if color should be used
+ # We override this to use NO_COLOR which is more widely supported
+ no_color = os.environ.get("NO_COLOR") is not None
+ add_log_handler(level=level, format=format, colorize=not no_color)


  def capture_logging() -> None:
@@ -150,12 +154,12 @@ def remove_log_handler() -> None:
  """
  if hasattr(logger, "default_handler_id"):
  try:
- logger.remove(logger.default_handler_id)
+ logger.remove(logger.default_handler_id) # pyright: ignore[reportAttributeAccessIssue]
  except ValueError:
  # This can happen if the handler has already been removed
  # or if the logger was never configured
  pass
- del logger.default_handler_id
+ del logger.default_handler_id # pyright: ignore[reportAttributeAccessIssue]
  else:
  raise AssertionError("No default log handler to remove.")

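The colour handling above follows the NO_COLOR convention: if the variable is present in the environment (with any value, including an empty string), the console handler is added with colorize=False. A minimal illustration of the check:

    import os

    os.environ["NO_COLOR"] = "1"

    no_color = os.environ.get("NO_COLOR") is not None  # mirrors the check in initialise_logging
    print(no_color)  # True -> add_log_handler(..., colorize=False)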
src/climate_ref_core/metric_values/typing.py
@@ -3,7 +3,8 @@ from collections.abc import Sequence
  from pathlib import Path
  from typing import Any, Self

- from pydantic import BaseModel, model_validator
+ import numpy as np
+ from pydantic import BaseModel, field_validator, model_validator

  Value = float | int

@@ -64,20 +65,35 @@ class SeriesMetricValue(BaseModel):
  This is used for presentation purposes and is not used in the controlled vocabulary.
  """

- attributes: dict[str, str | Value] | None = None
+ attributes: dict[str, str | Value | None] | None = None
  """
  Additional unstructured attributes associated with the metric value
  """

  @model_validator(mode="after")
- def validate_index_length(self) -> Self:
- """Validate that index has the same length as values"""
+ def validate_index(self) -> Self:
+ """Validate that index has the same length as values and contains no NaNs"""
  if len(self.index) != len(self.values):
  raise ValueError(
  f"Index length ({len(self.index)}) must match values length ({len(self.values)})"
  )
+ for v in self.index:
+ if isinstance(v, float) and not np.isfinite(v):
+ raise ValueError("NaN or Inf values are not allowed in the index")
  return self

+ @field_validator("values", mode="before")
+ @classmethod
+ def validate_values(cls, value: Any) -> Any:
+ """
+ Transform None values to NaN in the values field
+ """
+ if not isinstance(value, (list, tuple)):
+ raise ValueError("`values` must be a list or tuple.")
+
+ # Transform None values to NaN
+ return [float("nan") if v is None else v for v in value]
+
  @classmethod
  def dump_to_json(cls, path: Path, series: Sequence["SeriesMetricValue"]) -> None:
  """
@@ -94,7 +110,13 @@
  The series values to dump.
  """
  with open(path, "w") as f:
- json.dump([s.model_dump() for s in series], f, indent=2)
+ json.dump(
+ [s.model_dump(mode="json") for s in series],
+ f,
+ indent=2,
+ allow_nan=False,
+ sort_keys=True,
+ )

  @classmethod
  def load_from_json(
@@ -102,7 +124,7 @@
  path: Path,
  ) -> list["SeriesMetricValue"]:
  """
- Dump a sequence of SeriesMetricValue to a JSON file.
+ Load a sequence of SeriesMetricValue from a JSON file.

  Parameters
  ----------
@@ -115,7 +137,7 @@
  if not isinstance(data, list):
  raise ValueError(f"Expected a list of series values, got {type(data)}")

- return [cls.model_validate(s) for s in data]
+ return [cls.model_validate(s, strict=True) for s in data]


  class ScalarMetricValue(BaseModel):
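A simplified stand-in for the validators added above (not the real SeriesMetricValue, which has additional fields): None entries in values become NaN, mismatched lengths raise, and NaN/Inf are rejected in the index.

    import math
    from typing import Any, Self

    from pydantic import BaseModel, field_validator, model_validator


    class MiniSeries(BaseModel):
        """Hypothetical, reduced model mirroring the new validators."""

        index: list[float | int | str]
        values: list[float]

        @field_validator("values", mode="before")
        @classmethod
        def _none_to_nan(cls, value: Any) -> Any:
            if not isinstance(value, (list, tuple)):
                raise ValueError("`values` must be a list or tuple.")
            return [float("nan") if v is None else v for v in value]

        @model_validator(mode="after")
        def _check_index(self) -> Self:
            if len(self.index) != len(self.values):
                raise ValueError("Index length must match values length")
            if any(isinstance(v, float) and not math.isfinite(v) for v in self.index):
                raise ValueError("NaN or Inf values are not allowed in the index")
            return self


    print(MiniSeries(index=[2000, 2001, 2002], values=[1.0, None, 3.0]).values)  # [1.0, nan, 3.0]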
src/climate_ref_core/providers.py
@@ -16,14 +16,18 @@ import os
  import stat
  import subprocess
  from abc import abstractmethod
- from collections.abc import Iterable
+ from collections.abc import Iterable, Sequence
  from contextlib import AbstractContextManager
  from pathlib import Path
  from typing import TYPE_CHECKING

  import requests
+ import yaml
+ from attrs import evolve
  from loguru import logger

+ from climate_ref_core.constraints import IgnoreFacets
+ from climate_ref_core.datasets import SourceDatasetType
  from climate_ref_core.diagnostics import Diagnostic
  from climate_ref_core.exceptions import InvalidDiagnosticException, InvalidProviderException

@@ -74,6 +78,51 @@ class DiagnosticProvider:
  config :
  A configuration.
  """
+ logger.debug(
+ f"Configuring provider {self.slug} using ignore_datasets_file {config.ignore_datasets_file}"
+ )
+ # The format of the configuration file is:
+ # provider:
+ # diagnostic:
+ # source_type:
+ # - facet: value
+ # - other_facet: [other_value1, other_value2]
+ ignore_datasets_all = yaml.safe_load(config.ignore_datasets_file.read_text(encoding="utf-8")) or {}
+ ignore_datasets = ignore_datasets_all.get(self.slug, {})
+ if unknown_slugs := {slug for slug in ignore_datasets} - {d.slug for d in self.diagnostics()}:
+ logger.warning(
+ f"Unknown diagnostics found in {config.ignore_datasets_file} "
+ f"for provider {self.slug}: {', '.join(sorted(unknown_slugs))}"
+ )
+
+ known_source_types = {s.value for s in iter(SourceDatasetType)}
+ for diagnostic in self.diagnostics():
+ if diagnostic.slug in ignore_datasets:
+ if unknown_source_types := set(ignore_datasets[diagnostic.slug]) - known_source_types:
+ logger.warning(
+ f"Unknown source types found in {config.ignore_datasets_file} for "
+ f"diagnostic '{diagnostic.slug}' by provider {self.slug}: "
+ f"{', '.join(sorted(unknown_source_types))}"
+ )
+ data_requirements = (
+ r if isinstance(r, Sequence) else (r,) for r in diagnostic.data_requirements
+ )
+ diagnostic.data_requirements = tuple(
+ tuple(
+ evolve(
+ data_requirement,
+ constraints=tuple(
+ IgnoreFacets(facets)
+ for facets in ignore_datasets[diagnostic.slug].get(
+ data_requirement.source_type.value, []
+ )
+ )
+ + data_requirement.constraints,
+ )
+ for data_requirement in requirement_collection
+ )
+ for requirement_collection in data_requirements
+ )

  def diagnostics(self) -> list[Diagnostic]:
  """
@@ -287,6 +336,7 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):

  def configure(self, config: Config) -> None:
  """Configure the provider."""
+ super().configure(config)
  self.prefix = config.paths.software / "conda"

  def _install_conda(self, update: bool) -> Path:
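The new DiagnosticProvider.configure hook reads config.ignore_datasets_file and prepends one IgnoreFacets constraint per listed entry to every matching data requirement; unknown diagnostic slugs or source types only produce a warning, and CondaDiagnosticProvider.configure now calls super().configure(config) so conda-based providers pick this up as well. A hypothetical file following the format documented in the inline comment above (all slugs and facet values are invented; cmip6 and obs4mips are assumed to be valid SourceDatasetType values):

    example-provider:
      example-diagnostic:
        cmip6:
          - source_id: SOME-MODEL
          - variable_id: [tas, pr]
        obs4mips:
          - source_id: SOME-OBS-PRODUCT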