climate-ref-core 0.6.5__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/PKG-INFO +2 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/pyproject.toml +2 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/constraints.py +198 -80
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/datasets.py +20 -8
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/diagnostics.py +38 -14
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/executor.py +4 -1
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/logging.py +7 -3
- climate_ref_core-0.7.0/src/climate_ref_core/metric_values/typing.py +139 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/providers.py +25 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/controlled_vocabulary.py +33 -15
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/output.py +12 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/metric_values/test_typing.py +32 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/test_output_json_schema.yml +8 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/test_cmec_metric.py +1 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/test_cmec_output.py +1 -4
- climate_ref_core-0.7.0/tests/unit/test_constraints.py +765 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_dataset_registry/test_dataset_registry.py +2 -2
- climate_ref_core-0.7.0/tests/unit/test_datasets/dataset_collection_hash.yml +2 -0
- climate_ref_core-0.7.0/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +2 -0
- climate_ref_core-0.7.0/tests/unit/test_datasets/execution_dataset_hash.yml +2 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_datasets.py +29 -11
- climate_ref_core-0.6.5/tests/unit/test_metrics.py → climate_ref_core-0.7.0/tests/unit/test_diagnostics.py +39 -76
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_logging.py +4 -7
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_providers.py +9 -2
- climate_ref_core-0.6.5/src/climate_ref_core/metric_values/typing.py +0 -74
- climate_ref_core-0.6.5/tests/unit/test_constraints.py +0 -438
- climate_ref_core-0.6.5/tests/unit/test_datasets/dataset_collection_hash.yml +0 -2
- climate_ref_core-0.6.5/tests/unit/test_datasets/dataset_collection_obs4mips_hash.yml +0 -2
- climate_ref_core-0.6.5/tests/unit/test_datasets/metric_dataset_hash.yml +0 -2
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/.gitignore +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/LICENCE +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/NOTICE +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/README.md +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/__init__.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/dataset_registry.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/env.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/exceptions.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/metric_values/__init__.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/py.typed +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/README.md +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/__init__.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/cv_cmip7_aft.yaml +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/src/climate_ref_core/pycmec/metric.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/cmec_metric_sample.json +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/cmec_output_sample.json +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/cv_sample.yaml +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/cmec_testdata/test_metric_json_schema.yml +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/conftest.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/pycmec/test_controlled_vocabulary.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_exceptions.py +0 -0
- {climate_ref_core-0.6.5 → climate_ref_core-0.7.0}/tests/unit/test_executor.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climate-ref-core
-Version: 0.6.5
+Version: 0.7.0
 Summary: Core library for the CMIP Rapid Evaluation Framework
 Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
 License-Expression: Apache-2.0
@@ -29,7 +29,7 @@ Requires-Dist: pydantic>=2.10.6
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: requests
 Requires-Dist: rich
-Requires-Dist: setuptools
+Requires-Dist: setuptools<81
 Requires-Dist: typing-extensions
 Description-Content-Type: text/markdown
 
```
pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "climate-ref-core"
-version = "0.6.5"
+version = "0.7.0"
 description = "Core library for the CMIP Rapid Evaluation Framework"
 readme = "README.md"
 authors = [
@@ -39,7 +39,7 @@ dependencies = [
     "environs>=11",
     "pyyaml>=6.0.2",
     # Not used directly, but required to support some installations
-    "setuptools",
+    "setuptools<81",
 
     # SPEC 0000 constraints
     # We follow [SPEC-0000](https://scientific-python.org/specs/spec-0000/)
```
src/climate_ref_core/constraints.py (truncated old-side fragments in the source view are shown as `…`)

```diff
@@ -6,7 +6,9 @@ import sys
 import warnings
 from collections import defaultdict
 from collections.abc import Mapping
-from typing import Protocol, runtime_checkable
+from datetime import datetime
+from functools import total_ordering
+from typing import Literal, Protocol, runtime_checkable
 
 if sys.version_info < (3, 11):
     from typing_extensions import Self
@@ -15,45 +17,21 @@ else:
 
 import numpy as np
 import pandas as pd
-from attrs import frozen
+from attrs import field, frozen
 from loguru import logger
 
 from climate_ref_core.datasets import SourceDatasetType
-from climate_ref_core.exceptions import ConstraintNotSatisfied
 
 
 @runtime_checkable
-class GroupValidator(Protocol):
+class GroupConstraint(Protocol):
     """
-    …
-
-    All constraints must be satisfied for a given group to be run.
-    """
-
-    def validate(self, group: pd.DataFrame) -> bool:
-        """
-        Validate if the constraint is satisfied by the dataset.
-
-        This is executed after the apply method to determine if the constraint is satisfied.
-        If the constraint is not satisfied, the group will not be executed.
-
-        Parameters
-        ----------
-        group
-            A group of datasets that is being validated.
-
-        Returns
-        -------
-        :
-            Whether the constraint is satisfied
-        """
-        ...
+    An operation to perform on a group of datasets resulting in a new group of datasets.
 
+    This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
 
-@runtime_checkable
-class GroupOperation(Protocol):
-    """
-    An operation to perform on a group of datasets resulting in a new group of datasets.
+    If the operation results in an empty group, the constraint is considered not satisfied.
+    The group must satisfy all constraints to be processed.
 
     !! warning
 
@@ -90,18 +68,6 @@ class GroupOperation(Protocol):
         ...
 
 
-GroupConstraint = GroupOperation | GroupValidator
-"""
-A constraint that must be satisfied for a group of datasets to be executed.
-
-This is applied to a group of datasets representing the inputs to a potential diagnostic execution.
-The group must satisfy all constraints to be processed.
-
-This can include operations that are applied to a group of datasets which may modify the group,
-but may also include validators that check if the group satisfies a certain condition.
-"""
-
-
 def apply_constraint(
     dataframe: pd.DataFrame,
     constraint: GroupConstraint,
```
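The protocol change collapses the old operation/validator split: a constraint is now anything with an `apply` method, and "not satisfied" is signalled by returning an empty group rather than raising `ConstraintNotSatisfied`. A minimal sketch of a custom constraint against the new protocol — the `institution_id` facet and the toy catalog are illustrative, not part of the package:

```python
import pandas as pd

from climate_ref_core.constraints import apply_constraint


class RequireSingleInstitution:
    """Illustrative constraint: keep a group only if it spans a single institution."""

    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
        # Returning the group unchanged means the constraint is satisfied;
        # returning an empty frame means it is not.
        if group["institution_id"].nunique() == 1:
            return group
        return group.iloc[0:0]


catalog = pd.DataFrame({"institution_id": ["NCAR", "NCAR"], "variable_id": ["tas", "pr"]})

# apply_constraint returns the updated group, or None when the result is empty
assert apply_constraint(catalog, RequireSingleInstitution(), catalog) is not None
```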
src/climate_ref_core/constraints.py (continued)

```diff
@@ -124,39 +90,67 @@ def apply_constraint(
     :
         The updated group of datasets or None if the constraint was not satisfied
     """
-    try:
-        updated_group = (
-            constraint.apply(dataframe, data_catalog) if isinstance(constraint, GroupOperation) else dataframe
-        )
-
-        valid = constraint.validate(updated_group) if isinstance(constraint, GroupValidator) else True
-        if not valid:
-            logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
-            raise ConstraintNotSatisfied(f"Constraint {constraint} not satisfied for {dataframe}")
-    except ConstraintNotSatisfied:
+    updated_group = constraint.apply(dataframe, data_catalog)
+    if updated_group.empty:
         logger.debug(f"Constraint {constraint} not satisfied for {dataframe}")
         return None
 
     return updated_group
 
 
+def _to_tuple(value: None | str | tuple[str, ...]) -> tuple[str, ...]:
+    """
+    Clean the value of group_by to a tuple of strings
+    """
+    if value is None:
+        return ()
+    if isinstance(value, str):
+        return (value,)
+    return tuple(value)
+
+
 @frozen
 class RequireFacets:
     """
-    A constraint that requires …
+    A constraint that requires datasets to have certain facet values.
     """
 
     dimension: str
-    …
+    """The name of the facet to filter on."""
 
-    def validate(self, group: pd.DataFrame) -> bool:
+    required_facets: tuple[str, ...] = field(converter=_to_tuple)
+    "The required facet values."
+
+    operator: Literal["all", "any"] = "all"
+    """Whether all or any of the required facets must be present."""
+
+    group_by: tuple[str, ...] | None = field(converter=_to_tuple, default=None)
+    """
+    The facets to group the datasets by.
+
+    Each group created by `group_by` must contain at least one dataset where the
+    value of the given dimension is in the list of required facet values.
+
+    For example, if there are multiple models and variables in the selection,
+    `group_by` can be used to make sure that only those models are selected that
+    provide all required variables.
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
-        …
+        Filter out groups of datasets that do not provide the required facets
         """
-        if self.…
-        …
-        …
-        …
+        op = all if self.operator == "all" else any
+        select = pd.Series(True, index=group.index)
+        groups = [group] if not self.group_by else (g[1] for g in group.groupby(list(self.group_by)))
+        for subgroup in groups:
+            if not op(value in subgroup[self.dimension].values for value in self.required_facets):
+                logger.debug(
+                    f"Constraint {self} not satisfied because required facet values "
+                    f"not found for group {', '.join(subgroup['path'])}"
+                )
+                select.loc[subgroup.index] = False
+        return group[select]
 
 
 @frozen
```
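`RequireFacets` is the first beneficiary of the new shape: instead of validating a group as a whole, it filters out the subgroups (per `group_by`) that lack the required facet values. A rough usage sketch — the catalog below is invented, with CMIP6-style facet names:

```python
import pandas as pd

from climate_ref_core.constraints import RequireFacets

catalog = pd.DataFrame(
    {
        "source_id": ["ModelA", "ModelA", "ModelB"],
        "variable_id": ["tas", "pr", "tas"],
        "path": ["a_tas.nc", "a_pr.nc", "b_tas.nc"],
    }
)

# Keep only models that provide *all* of tas and pr; the string passed to
# group_by is normalised to a tuple by the _to_tuple converter.
constraint = RequireFacets(
    dimension="variable_id",
    required_facets=("tas", "pr"),
    group_by="source_id",
)
filtered = constraint.apply(catalog, catalog)

# ModelB is dropped because it lacks "pr"
assert set(filtered["source_id"]) == {"ModelA"}
```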
src/climate_ref_core/constraints.py (continued)

```diff
@@ -271,6 +265,123 @@ class AddSupplementaryDataset:
         return cls(supplementary_facets, **kwargs[source_type])
 
 
+@frozen
+@total_ordering
+class PartialDateTime:  # noqa: PLW1641
+    """
+    A partial datetime object that can be used to compare datetimes.
+
+    Only the specified fields are used for comparison.
+    """
+
+    year: int | None = None
+    month: int | None = None
+    day: int | None = None
+    hour: int | None = None
+    minute: int | None = None
+    second: int | None = None
+
+    @property
+    def _attrs(self) -> dict[str, int]:
+        """The attributes that are set."""
+        return {
+            a: v
+            for a in self.__slots__  # type: ignore[attr-defined]
+            if not a.startswith("_") and (v := getattr(self, a)) is not None
+        }
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({', '.join(f'{a}={v}' for a, v in self._attrs.items())})"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, datetime):
+            msg = (
+                f"Can only compare PartialDateTime with `datetime.datetime` "
+                f"objects, got object {other} of type {type(other)}"
+            )
+            raise TypeError(msg)
+
+        for attr, value in self._attrs.items():
+            other_value = getattr(other, attr)
+            if value != other_value:
+                return False
+        return True
+
+    def __lt__(self, other: object) -> bool:
+        if not isinstance(other, datetime):
+            msg = (
+                f"Can only compare PartialDateTime with `datetime.datetime` "
+                f"objects, got object {other} of type {type(other)}"
+            )
+            raise TypeError(msg)
+
+        for attr, value in self._attrs.items():
+            other_value = getattr(other, attr)
+            if value != other_value:
+                return value < other_value  # type: ignore[no-any-return]
+        return False
+
+
+@frozen
+class RequireTimerange:
+    """
+    A constraint that requires datasets to have a specific timerange.
+
+    Specify the start and/or end of the required timerange using a precision
+    that matches the frequency of the datasets.
+
+    For example, to ensure that datasets at monthly frequency cover the period
+    from 2000 to 2010, use start=PartialDateTime(year=2000, month=1) and
+    end=PartialDateTime(year=2010, month=12).
+    """
+
+    group_by: tuple[str, ...]
+    """
+    The fields to group the datasets by. Groups that do not cover the timerange
+    will be removed.
+    """
+
+    start: PartialDateTime | None = None
+    """
+    The start time of the required timerange. If None, no start time is required.
+    """
+
+    end: PartialDateTime | None = None
+    """
+    The end time of the required timerange. If None, no end time is required.
+    """
+
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
+        """
+        Check that all subgroups of the group have a contiguous timerange.
+        """
+        select = pd.Series(True, index=group.index)
+        for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
+            start = subgroup["start_time"].min()
+            end = subgroup["end_time"].max()
+            result = True
+            if self.start is not None and start > self.start:
+                logger.debug(
+                    f"Constraint {self} not satisfied because start time {start} "
+                    f"is after required start time for {', '.join(subgroup['path'])}"
+                )
+                result = False
+            if self.end is not None and end < self.end:
+                logger.debug(
+                    f"Constraint {self} not satisfied because end time {end} "
+                    f"is before required end time for {', '.join(subgroup['path'])}"
+                )
+                result = False
+            if result:
+                contiguous_subgroup = RequireContiguousTimerange(group_by=self.group_by).apply(
+                    subgroup, data_catalog
+                )
+                result = len(contiguous_subgroup) == len(subgroup)
+            if not result:
+                select.loc[subgroup.index] = False
+        return group[select]
+
+
 @frozen
 class RequireContiguousTimerange:
     """
```
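`PartialDateTime` compares only the fields that are set, so a year/month bound matches any day or time inside that month, and comparing against anything but a `datetime` raises `TypeError`. A short illustration of these semantics (values invented):

```python
from datetime import datetime

from climate_ref_core.constraints import PartialDateTime

bound = PartialDateTime(year=2000, month=1)

# Unset fields (day, hour, ...) are ignored in the comparison
assert datetime(2000, 1, 15) == bound

# Ordering works via the reflected operators that @total_ordering derives
assert datetime(1999, 12, 31) < bound
assert datetime(2000, 2, 1) > bound

# Comparing against anything other than a datetime raises TypeError:
# bound == "2000-01"  # would raise
```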
src/climate_ref_core/constraints.py (continued)

```diff
@@ -279,11 +390,11 @@ class RequireContiguousTimerange:
 
     group_by: tuple[str, ...]
     """
-    The fields to group the datasets by.
-    …
+    The fields to group the datasets by. Groups that are not be contiguous in time
+    are removed.
     """
 
-    def validate(self, group: pd.DataFrame) -> bool:
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
         Check that all subgroups of the group have a contiguous timerange.
         """
@@ -293,11 +404,10 @@ class RequireContiguousTimerange:
             days=31,  # Maximum number of days in a month.
            hours=1,  # Allow for potential rounding errors.
         )
-        group = group.dropna(subset=["start_time", "end_time"])
-        if len(group) < 2:  # noqa: PLR2004
-            return True
 
-        for _, subgroup in group.groupby(list(self.group_by)):
+        select = pd.Series(True, index=group.index)
+
+        for _, subgroup in group.dropna(subset=["start_time", "end_time"]).groupby(list(self.group_by)):
             if len(subgroup) < 2:  # noqa: PLR2004
                 continue
             sorted_group = subgroup.sort_values("start_time", kind="stable")
@@ -325,12 +435,13 @@ class RequireContiguousTimerange:
             paths = sorted_group["path"]
             for gap_idx in np.flatnonzero(gap_indices):
                 logger.debug(
-                    f"Constraint {self…
-                    f"…
+                    f"Constraint {self} not satisfied because gap larger "
+                    f"than {max_timedelta} found between "
                     f"{paths.iloc[gap_idx]} and {paths.iloc[gap_idx + 1]}"
                 )
-                return False
-        return True
+                select.loc[subgroup.index] = False
+
+        return group[select]
 
 
 @frozen
@@ -345,17 +456,24 @@ class RequireOverlappingTimerange:
     the groups to fulfill the constraint.
     """
 
-    def validate(self, group: pd.DataFrame) -> bool:
+    def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
         """
         Check that all subgroups of the group have an overlapping timerange.
         """
-        group = group.dropna(subset=["start_time", "end_time"])
-        if len(group) < 2:  # noqa: PLR2004
-            return True
-
-        starts = group.groupby(list(self.group_by))["start_time"].min()
-        ends = group.groupby(list(self.group_by))["end_time"].max()
-        return starts.max() < ends.min()
+        group_with_time = group.dropna(subset=["start_time", "end_time"])
+        if len(group_with_time) < 2:  # noqa: PLR2004
+            return group
+
+        starts = group_with_time.groupby(list(self.group_by))["start_time"].min()
+        ends = group_with_time.groupby(list(self.group_by))["end_time"].max()
+        result = starts.max() < ends.min()
+        if not result:
+            logger.debug(
+                f"Constraint {self} not satisfied because no overlapping timerange "
+                f"found for groups in {', '.join(group['path'])}"
+            )
+            return group.loc[[]]
+        return group
 
 
 @frozen
```
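All three timerange constraints now share the apply-and-filter shape, so they compose with `apply_constraint` like any other `GroupConstraint`. A sketch of requiring monthly data to cover 2000–2010 — the catalog below is invented; `start_time`/`end_time` are the columns these constraints inspect:

```python
import pandas as pd

from climate_ref_core.constraints import (
    PartialDateTime,
    RequireTimerange,
    apply_constraint,
)

catalog = pd.DataFrame(
    {
        "source_id": ["ModelA", "ModelA"],
        "path": ["a_1.nc", "a_2.nc"],
        "start_time": pd.to_datetime(["2000-01-16", "2005-01-16"]),
        "end_time": pd.to_datetime(["2004-12-16", "2010-12-16"]),
    }
)

constraint = RequireTimerange(
    group_by=("source_id",),
    start=PartialDateTime(year=2000, month=1),
    end=PartialDateTime(year=2010, month=12),
)

# The group covers 2000-01 through 2010-12 with no gap larger than a month,
# so the contiguity sub-check passes and apply_constraint does not return None.
assert apply_constraint(catalog, constraint, catalog) is not None
```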
src/climate_ref_core/datasets.py

```diff
@@ -5,7 +5,7 @@ Dataset management and filtering
 import enum
 import functools
 import hashlib
-from collections.abc import Collection, Iterable
+from collections.abc import Collection, Iterable, Iterator
 from typing import Any, Self
 
 import pandas as pd
@@ -76,12 +76,6 @@ class FacetFilter:
     The result will only contain datasets where for all fields,
     the value of the field is one of the given values.
     """
-    keep: bool = True
-    """
-    Whether to keep or remove datasets that match the filter.
-
-    If true (default), datasets that match the filter will be kept else they will be removed.
-    """
 
 
 def sort_selector(inp: Selector) -> Selector:
@@ -159,6 +153,9 @@ class ExecutionDatasetCollection:
     def __init__(self, collection: dict[SourceDatasetType | str, DatasetCollection]):
         self._collection = {SourceDatasetType(k): v for k, v in collection.items()}
 
+    def __repr__(self) -> str:
+        return f"ExecutionDatasetCollection({self._collection})"
+
     def __contains__(self, key: SourceDatasetType | str) -> bool:
         if isinstance(key, str):
             key = SourceDatasetType(key)
@@ -172,9 +169,24 @@ class ExecutionDatasetCollection:
     def __hash__(self) -> int:
         return hash(self.hash)
 
+    def __iter__(self) -> Iterator[SourceDatasetType]:
+        return iter(self._collection)
+
+    def keys(self) -> Iterable[SourceDatasetType]:
+        """
+        Iterate over the source types in the collection.
+        """
+        return self._collection.keys()
+
+    def values(self) -> Iterable[DatasetCollection]:
+        """
+        Iterate over the datasets in the collection.
+        """
+        return self._collection.values()
+
     def items(self) -> Iterable[tuple[SourceDatasetType, DatasetCollection]]:
         """
-        Iterate over the …
+        Iterate over the items in the collection.
         """
         return self._collection.items()
 
```
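With `__iter__`, `keys`, `values` and the clarified `items`, an `ExecutionDatasetCollection` can now be traversed like a read-only mapping from source type to `DatasetCollection`. A hedged sketch — the `"cmip6"` key assumes that value exists in `SourceDatasetType`:

```python
from climate_ref_core.datasets import ExecutionDatasetCollection


def summarise(collection: ExecutionDatasetCollection) -> None:
    # __contains__ accepts either a SourceDatasetType or its string value
    if "cmip6" in collection:
        print("has CMIP6 inputs")

    # Iterating yields the source types, mirroring dict semantics
    for source_type in collection:
        print(source_type)

    # keys(), values() and items() expose the underlying dict views
    for source_type, datasets in collection.items():
        print(source_type, datasets)
```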
src/climate_ref_core/diagnostics.py (truncated old-side fragments are shown as `…`)

```diff
@@ -14,6 +14,7 @@ from attrs import field, frozen
 from climate_ref_core.constraints import GroupConstraint
 from climate_ref_core.datasets import ExecutionDatasetCollection, FacetFilter, SourceDatasetType
 from climate_ref_core.metric_values import SeriesMetricValue
+from climate_ref_core.metric_values.typing import SeriesDefinition
 from climate_ref_core.pycmec.metric import CMECMetric
 from climate_ref_core.pycmec.output import CMECOutput
 
@@ -182,9 +183,11 @@ class ExecutionResult:
     Whether the diagnostic execution ran successfully.
     """
 
-    …
+    series_filename: pathlib.Path | None = None
     """
     A collection of series metric values that were extracted from the execution.
+
+    These are written to a CSV file in the output directory.
     """
 
     @staticmethod
@@ -193,6 +196,7 @@ class ExecutionResult:
         *,
         cmec_output_bundle: CMECOutput | dict[str, Any],
         cmec_metric_bundle: CMECMetric | dict[str, Any],
+        series: Sequence[SeriesMetricValue] = tuple(),
     ) -> ExecutionResult:
         """
         Build a ExecutionResult from a CMEC output bundle.
@@ -205,6 +209,8 @@ class ExecutionResult:
             An output bundle in the CMEC format.
         cmec_metric_bundle
             An diagnostic bundle in the CMEC format.
+        series
+            Series metric values extracted from the execution.
 
         Returns
         -------
@@ -223,17 +229,21 @@ class ExecutionResult:
             cmec_metric = cmec_metric_bundle
 
         definition.to_output_path(filename=None).mkdir(parents=True, exist_ok=True)
-        bundle_path = definition.to_output_path("output.json")
-        cmec_output.dump_to_json(bundle_path)
 
-        …
-        …
-        …
+        output_filename = "output.json"
+        metric_filename = "diagnostic.json"
+        series_filename = "series.json"
+
+        cmec_output.dump_to_json(definition.to_output_path(output_filename))
+        cmec_metric.dump_to_json(definition.to_output_path(metric_filename))
+        SeriesMetricValue.dump_to_json(definition.to_output_path(series_filename), series)
 
+        # We are using relative paths for the output files for portability of the results
         return ExecutionResult(
             definition=definition,
-            output_bundle_filename=pathlib.Path(…
-            metric_bundle_filename=pathlib.Path(…
+            output_bundle_filename=pathlib.Path(output_filename),
+            metric_bundle_filename=pathlib.Path(metric_filename),
+            series_filename=pathlib.Path(series_filename),
             successful=True,
         )
 
```
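The builder now writes three files — `output.json`, `diagnostic.json` and `series.json` — and records their relative paths on the result. A sketch of how a diagnostic might return its result under 0.7.0, assuming the static method modified above is the package's `build_from_output_bundle` builder (its name is not visible in this hunk):

```python
from climate_ref_core.diagnostics import ExecutionResult


def build_result(definition, output_bundle, metric_bundle):
    # output.json, diagnostic.json and series.json are written into the
    # execution's output directory; the returned result stores the
    # relative filenames for portability.
    return ExecutionResult.build_from_output_bundle(
        definition,
        cmec_output_bundle=output_bundle,  # dict or CMECOutput
        cmec_metric_bundle=metric_bundle,  # dict or CMECMetric
        series=(),  # optional sequence of SeriesMetricValue
    )
```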
src/climate_ref_core/diagnostics.py (continued)

```diff
@@ -311,7 +321,12 @@ class DataRequirement:
     Filters to apply to the data catalog of datasets.
 
     This is used to reduce the set of datasets to only those that are required by the diagnostic.
-    …
+
+    Each FacetFilter contains one or more facet values that must all be satisfied
+    for a dataset to match that filter. The overall selection keeps any dataset
+    that matches at least one of the provided filters.
+
+    If no filters are specified, all datasets in the data catalog are used.
     """
 
     group_by: tuple[str, ...] | None
@@ -351,6 +366,10 @@ class DataRequirement:
         :
             Filtered data catalog
         """
+        if not self.filters or any(not f.facets for f in self.filters):
+            return data_catalog
+
+        select = pd.Series(False, index=data_catalog.index)
         for facet_filter in self.filters:
             values = {}
             for facet, value in facet_filter.facets.items():
@@ -362,11 +381,9 @@ class DataRequirement:
                 )
                 values[facet] = clean_value
 
-            mask = data_catalog[list(values)].isin(values).all(axis="columns")
-            if not facet_filter.keep:
-                mask = ~mask
-            data_catalog = data_catalog[mask]
-        return data_catalog
+            select |= data_catalog[list(values)].isin(values).all(axis="columns")
+
+        return data_catalog[select]
 
 
 @runtime_checkable
```
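The rewritten `apply_filters` makes the selection semantics explicit: facets inside one `FacetFilter` are ANDed, separate filters are ORed, and an empty filter list keeps the whole catalog (the old `keep` inversion is gone). A small illustration with an invented catalog; the `facets` mapping is the filter's only remaining field:

```python
import pandas as pd

from climate_ref_core.datasets import FacetFilter

catalog = pd.DataFrame(
    {
        "variable_id": ["tas", "pr", "tas"],
        "experiment_id": ["historical", "historical", "ssp585"],
    }
)

filters = (
    # matches variable_id == "tas" AND experiment_id == "historical"
    FacetFilter(facets={"variable_id": "tas", "experiment_id": "historical"}),
    # OR any dataset with variable_id == "pr"
    FacetFilter(facets={"variable_id": "pr"}),
)
# A DataRequirement built with these filters keeps rows 0 and 1 but drops
# the ssp585 "tas" dataset; with filters=() the whole catalog is kept.
```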
src/climate_ref_core/diagnostics.py (continued)

```diff
@@ -432,6 +449,11 @@ class AbstractDiagnostic(Protocol):
     is raised.
     """
 
+    series: Sequence[SeriesDefinition]
+    """
+    Definition of the series that are produced by the diagnostic.
+    """
+
     provider: DiagnosticProvider
     """
     The provider that provides the diagnostic.
@@ -493,6 +515,8 @@ class Diagnostic(AbstractDiagnostic):
     See (climate_ref_example.example.ExampleDiagnostic)[] for an example implementation.
     """
 
+    series: Sequence[SeriesDefinition] = tuple()
+
     def __init__(self) -> None:
         super().__init__()
         self._provider: DiagnosticProvider | None = None
```
src/climate_ref_core/executor.py

```diff
@@ -160,12 +160,15 @@ def import_executor_cls(fqn: str) -> type[Executor]:
         imp = importlib.import_module(module)
         executor: type[Executor] = getattr(imp, attribute_name)
 
+        if isinstance(executor, Exception):
+            raise executor
+
         # We can't really check if the executor is a subclass of Executor here
         # Protocols can't be used with issubclass if they have non-method members
         # We have to check this at class instantiation time
 
         return executor
-    except ModuleNotFoundError:
+    except (ModuleNotFoundError, ImportError):
         logger.error(f"Package '{fqn}' not found")
         raise InvalidExecutorException(fqn, f"Module '{module}' not found")
     except AttributeError:
```
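`import_executor_cls` resolves a dotted path, now re-raises an exception object stored on the attribute, and treats a failed `ImportError` like a missing module. A sketch of the calling convention — the executor path is illustrative, and the example assumes `InvalidExecutorException` lives in `climate_ref_core.exceptions`:

```python
from climate_ref_core.exceptions import InvalidExecutorException
from climate_ref_core.executor import import_executor_cls

try:
    # The fqn is split into a module ("my_package.executors")
    # and an attribute name ("MyExecutor")
    executor_cls = import_executor_cls("my_package.executors.MyExecutor")
except InvalidExecutorException:
    ...  # unknown module/attribute, or the import itself raised
```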
src/climate_ref_core/logging.py

```diff
@@ -9,6 +9,7 @@ import contextlib
 import inspect
 import logging
 import multiprocessing
+import os
 import sys
 from collections.abc import Generator
 from pathlib import Path
@@ -94,7 +95,10 @@ def initialise_logging(level: int | str, format: str, log_directory: str | Path)
     logger.info("Starting REF logging")
     logger.info(f"arguments: {sys.argv}")
 
-    add_log_handler(level=level, format=format)
+    # LOGURU_COLORIZE is the default env var used by loguru to determine if color should be used
+    # We override this to use NO_COLOR which is more widely supported
+    no_color = os.environ.get("NO_COLOR") is not None
+    add_log_handler(level=level, format=format, colorize=not no_color)
 
 
 def capture_logging() -> None:
@@ -150,12 +154,12 @@ def remove_log_handler() -> None:
     """
     if hasattr(logger, "default_handler_id"):
         try:
-            logger.remove(logger.default_handler_id)
+            logger.remove(logger.default_handler_id)  # pyright: ignore[reportAttributeAccessIssue]
         except ValueError:
             # This can happen if the handler has already been removed
             # or if the logger was never configured
             pass
-        del logger.default_handler_id
+        del logger.default_handler_id  # pyright: ignore[reportAttributeAccessIssue]
     else:
         raise AssertionError("No default log handler to remove.")
 
```
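Colorized output is now opt-out through the widely supported NO_COLOR convention instead of loguru's own LOGURU_COLORIZE. A sketch of disabling color before initialising REF logging — the level, format, and directory values are placeholders:

```python
import os

from climate_ref_core.logging import initialise_logging

# Setting NO_COLOR to any value, even an empty string, disables colorized
# output: only the variable's presence is checked (`is not None`).
os.environ["NO_COLOR"] = "1"
initialise_logging(level="INFO", format="{time} {level} {message}", log_directory="logs")
```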