PyPI - climate-ref-core - Versions diffs - 0.5.4__tar.gz → 0.6.0__tar.gz - Mend

climate-ref-core 0.5.4.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: climate-ref-core
-Version: 0.5.4
+Version: 0.6.0
 Summary: Core library for the CMIP Rapid Evaluation Framework
 Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
-License: Apache-2.0
+License-Expression: Apache-2.0
 License-File: LICENCE
 License-File: NOTICE
-Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
@@ -18,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering
 Requires-Python: >=3.11
 Requires-Dist: attrs>=23.2.0
+Requires-Dist: cattrs>=24.1
 Requires-Dist: environs>=11
 Requires-Dist: loguru>=0.7.0
 Requires-Dist: numpy>=1.25.0
@@ -27,6 +29,7 @@ Requires-Dist: pydantic>=2.10.6
 Requires-Dist: requests
 Requires-Dist: rich
 Requires-Dist: ruamel-yaml>=0.18
+Requires-Dist: setuptools>=75.8.0
 Requires-Dist: typing-extensions
 Description-Content-Type: text/markdown

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "climate-ref-core"
-version = "0.5.4"
+version = "0.6.0"
 description = "Core library for the CMIP Rapid Evaluation Framework"
 readme = "README.md"
 authors = [
@@ -12,21 +12,24 @@ authors = [
     { name = "Nathan Collier", email = "collierno@ornl.gov" },
     { name = "Dora Hegedus", email = "dora.hegedus@stfc.ac.uk" },
 ]
+license = "Apache-2.0"
 requires-python = ">=3.11"
 classifiers = [
-    "Development Status :: 2 - Pre-Alpha",
+    "Development Status :: 3 - Alpha",
     "Operating System :: OS Independent",
+    "Intended Audience :: Developers",
     "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
     "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: Apache Software License",
 ]
 dependencies = [
     "attrs>=23.2.0",
+    "cattrs>=24.1",
     "pydantic>=2.10.6",
     "typing_extensions",
     "requests",
@@ -35,6 +38,8 @@ dependencies = [
     "pooch>=1.8.0,<2",
     "ruamel.yaml>=0.18",
     "environs>=11",
+    # Not used directly, but required to support some installations
+    "setuptools>=75.8.0",
     # SPEC 0000 constraints
     # We follow [SPEC-0000](https://scientific-python.org/specs/spec-0000/)
@@ -43,11 +48,8 @@ dependencies = [
     "numpy>=1.25.0"
 ]
-[project.license]
-text = "Apache-2.0"
-[tool.uv]
-dev-dependencies = [
+[dependency-groups]
+dev = [
     "types-requests",
 ]

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/src/climate_ref_core/dataset_registry.py RENAMED Viewed

@@ -12,15 +12,68 @@ import pathlib
 import shutil
 import pooch
+import pooch.hashes
 from loguru import logger
 from rich.progress import track
+from climate_ref_core.env import env
+DATASET_URL = env.str("REF_DATASET_URL", default="https://pub-b093171261094c4ea9adffa01f94ee06.r2.dev")
+def _verify_hash_matches(fname: str | pathlib.Path, known_hash: str) -> bool:
+    """
+    Check if the hash of a file matches a known hash.
+    Converts hashes to lowercase before comparison to avoid system-specific
+    mismatches between hashes in the registry and computed hashes.
+    This is a tweaked version of the `pooch.hashes.hash_matches` function with a custom error message.
+    Parameters
+    ----------
+    fname
+        The path to the file.
+    known_hash
+        The known hash. Optionally, prepend ``alg:`` to the hash to specify the
+        hashing algorithm. Default is SHA256.
+    Raises
+    ------
+    ValueError
+        If the hash does not match.
+    FileNotFoundError
+        If the file does not exist.
+    Returns
+    -------
+    bool
+        True if the hash matches.
+    """
+    fname = pathlib.Path(fname)
+    if not fname.exists():
+        raise FileNotFoundError(f"File {fname!s} does not exist. Cannot verify hash.")
+    algorithm = pooch.hashes.hash_algorithm(known_hash)
+    new_hash = pooch.hashes.file_hash(str(fname), alg=algorithm)
+    matches = new_hash.lower() == known_hash.split(":")[-1].lower()
+    if not matches:
+        raise ValueError(
+            f"{algorithm.upper()} hash of downloaded file ({fname!s}) does not match"
+            f" the known hash: expected {known_hash} but got {new_hash}. "
+            f"The file may have been corrupted or the known hash may be outdated. "
+            f"Delete the file and try again."
+        )
+    return matches
 def fetch_all_files(
     registry: pooch.Pooch,
     name: str,
     output_dir: pathlib.Path | None,
     symlink: bool = False,
+    verify: bool = True,
 ) -> None:
     """
     Fetch all files associated with a pooch registry and write them to an output directory.
@@ -45,12 +98,17 @@ def fetch_all_files(
     symlink
         If True, symlink all files to this directory.
         Otherwise, perform a copy.
+    verify
+        If True, verify the checksums of the local files against the registry.
     """
     if output_dir:
         output_dir.mkdir(parents=True, exist_ok=True)
     for key in track(registry.registry.keys(), description=f"Fetching {name} data"):
         fetch_file = registry.fetch(key)
+        expected_hash = registry.registry[key]
+        if not isinstance(expected_hash, str) or not expected_hash:  # pragma: no cover
+            raise ValueError(f"Expected a hash for {key} but got {expected_hash}")
         if output_dir is None:
             # Just warm the cache and move onto the next file
@@ -68,6 +126,8 @@ def fetch_all_files(
                 shutil.copy(fetch_file, linked_file)
         else:
             logger.info(f"File {linked_file} already exists. Skipping.")
+        if verify:
+            _verify_hash_matches(linked_file, expected_hash)
 class DatasetRegistryManager:

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/src/climate_ref_core/datasets.py RENAMED Viewed

@@ -5,7 +5,7 @@ Dataset management and filtering
 import enum
 import functools
 import hashlib
-from collections.abc import Iterable
+from collections.abc import Collection, Iterable
 from typing import Any, Self
 import pandas as pd
@@ -48,19 +48,17 @@ class SourceDatasetType(enum.Enum):
         return sorted(cls, key=lambda x: x.value)
-def _clean_facets(raw_values: dict[str, str | tuple[str, ...] | list[str]]) -> dict[str, tuple[str, ...]]:
+def _clean_facets(raw_values: dict[str, str | Collection[str]]) -> dict[str, tuple[str, ...]]:
     """
     Clean the value of a facet filter to a tuple of strings
     """
-    result = {}
+    result: dict[str, tuple[str, ...]] = {}
     for key, value in raw_values.items():
-        if isinstance(value, list):
-            result[key] = tuple(value)
-        elif isinstance(value, str):
+        if isinstance(value, str):
             result[key] = (value,)
-        elif isinstance(value, tuple):
-            result[key] = value
+        else:
+            result[key] = tuple(value)
     return result

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/src/climate_ref_core/logging.py RENAMED Viewed

@@ -2,13 +2,16 @@
 Logging utilities
 The REF uses [loguru](https://loguru.readthedocs.io/en/stable/), a simple logging framework.
+The log level and format are configured via the REF configuration file.
 """
 import contextlib
 import inspect
 import logging
+import multiprocessing
 import sys
 from collections.abc import Generator
+from pathlib import Path
 from typing import Any
 import pooch
@@ -24,6 +27,28 @@ Filename for the execution log.
 This file is written via [climate_ref_core.logging.redirect_logs][].
 """
+DEFAULT_LOG_FORMAT = (
+    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS Z}</green> | <level>{level: <8}</level> | "
+    "<cyan>{name}</cyan> - <level>{message}</level>"
+)
+"""
+Default log format used by the REF
+"""
+VERBOSE_LOG_FORMAT = (
+    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS Z} e{elapsed}s</green> | "
+    "<level>{level: <8}</level> | "
+    "{process.name}:{process.id} | "
+    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
+    "<level>{message}</level>"
+)
+"""
+The verbose log format is used for debugging and development.
+This is the format that is used when writing the log messages to file for later debugging.
+It contains information about the process and function that the log message was generated in.
+"""
 class _InterceptHandler(logging.Handler):
     def emit(self, record: logging.LogRecord) -> None:
@@ -43,6 +68,35 @@ class _InterceptHandler(logging.Handler):
         logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
+def initialise_logging(level: int | str, format: str, log_directory: str | Path) -> None:  # noqa: A002 # pragma: no cover
+    """
+    Initialise the logging for the REF
+    This sets up the default log handler and configures the REF logger.
+    """
+    capture_logging()
+    log_directory = Path(log_directory)
+    process_name = multiprocessing.current_process().name
+    # Remove any existing handlers
+    logger.remove()
+    # Write out debug logs to a file
+    log_directory.mkdir(parents=True, exist_ok=True)
+    filename = f"climate-ref_{{time:YYYY-MM-DD_HH-mm}}_{process_name}.log"
+    logger.add(
+        sink=log_directory / filename,
+        retention=10,
+        level="DEBUG",
+        format=VERBOSE_LOG_FORMAT,
+        colorize=False,
+    )
+    logger.info("Starting REF logging")
+    logger.info(f"arguments: {sys.argv}")
+    add_log_handler(level=level, format=format, colorize=True)
 def capture_logging() -> None:
     """
     Capture logging from the standard library and redirect it to Loguru
@@ -56,6 +110,7 @@ def capture_logging() -> None:
     logging.basicConfig(handlers=[_InterceptHandler()], level=0, force=True)
     # Disable some overly verbose logs
+    logger.disable("alembic.runtime.migration")
     logger.disable("matplotlib.colorbar")
     logger.disable("matplotlib.ticker")
     logger.disable("matplotlib.font_manager")
@@ -154,4 +209,4 @@ def redirect_logs(definition: ExecutionDefinition, log_level: str) -> Generator[
             add_log_handler(**logger.default_handler_kwargs)  # type: ignore[attr-defined]
-__all__ = ["EXECUTION_LOG_FILENAME", "add_log_handler", "capture_logging", "logger", "redirect_logs"]
+__all__ = ["EXECUTION_LOG_FILENAME", "capture_logging", "initialise_logging", "redirect_logs"]

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/src/climate_ref_core/pycmec/controlled_vocabulary.py RENAMED Viewed

@@ -28,8 +28,8 @@ class DimensionValue:
     name: str
     long_name: str
-    description: str | None
-    units: str
+    description: str | None = None
+    units: str | None = None
 @frozen

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/src/climate_ref_core/pycmec/cv_cmip7_aft.yaml RENAMED Viewed

@@ -24,6 +24,11 @@ dimensions:
   description: "Variable ID for the reference dataset (e.g., tas, pr, etc.)"
   allow_extra_values: true
   required: false
+- name: grid_label
+  long_name: Grid Label
+  description: "The grid label of the output (e.g gn)"
+  allow_extra_values: true
+  required: false
 - name: member_id
   long_name: Member ID
   description: "Unique identifier for each ensemble member, includes the variant label and sub-experiment if present"
@@ -48,7 +53,15 @@ dimensions:
     - name: global
       long_name: Global
       description: "Global aggregate"
-      units: dimensionless
+    - name: NHEX
+      long_name: Northern Hemisphere Extra-tropics
+      description: "Northern Hemisphere Extra-tropics (30N-90N)"
+    - name: SHEX
+      long_name: Southern Hemisphere Extra-tropics
+      description: "Southern Hemisphere Extra-tropics (30S-90S)"
+    - name: Tropics
+      long_name: Tropics
+      description: "Tropics (30N-30S)"
 - name: season
   long_name: Season
   description: "Parts of the year from which the metric values are calculated"
@@ -58,23 +71,57 @@ dimensions:
     - name: ann
       long_name: Annual
       description: ""
-      units: dimensionless
     - name: djf
       long_name: Dec,Jan,Feb
       description: "December, January, February"
-      units: dimensionless
     - name: mam
       long_name: Mar,Apr,May
       description: "March, April, May"
-      units: dimensionless
     - name: jja
       long_name: Jun,Jul,Aug
       description: "June, July, August"
-      units: dimensionless
     - name: son
       long_name: Sep,Oct,Nov
       description: "September, October, November"
-      units: dimensionless
+- name: mode
+  long_name: Mode of variability
+  description: "Different modes of variability that can be calculated"
+  required: false
+  allow_extra_values: false
+  values:
+    - name: NAM
+      long_name: Northern Annular Mode
+      description: Northern Annular Mode
+    - name: NAO
+      long_name: North Atlantic Oscillation
+      description: North Atlantic Oscillation
+    - name: PNA
+      long_name: Pacific–North America pattern
+      description: Pacific–North America pattern
+    - name: SAM
+      long_name: Southern Annular Mode
+      description: Southern Annular Mode
+    - name: PDO
+      long_name: Pacific decadal oscillation
+      description: Pacific decadal oscillation
+    - name: NPO
+      long_name: North Pacific Oscillation
+      description: North Pacific Oscillation
+    - name: NPGO
+      long_name: North Pacific Gyre Oscillation
+      description: North Pacific Gyre Oscillation
+- name: method
+  long_name: EOF Method
+  description: "Method for calculating the EOFs in PMP's mode of variability diagnostic"
+  required: false
+  allow_extra_values: false
+  values:
+    - name: cbf
+      long_name: Common Basis Function
+      description: "A projection of the leading EOFs of the reference dataset onto the model data"
+    - name: eof1
+      long_name: EOF1
+      description: "The leading EOF of the reference dataset"
 - name: statistic
   long_name: Statistic
   description: ""

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/src/climate_ref_core/pycmec/metric.py RENAMED Viewed

@@ -20,7 +20,7 @@ from copy import deepcopy
 from enum import Enum
 from typing import Any, cast
-from loguru import logger
+import numpy as np
 from pydantic import (
     BaseModel,
     ConfigDict,
@@ -190,7 +190,6 @@ class MetricResults(RootModel[Any]):
             expected_keys = set(metdims[dim_name].keys())
             if not (dict_keys.issubset(expected_keys)):
                 msg = f"Unknown dimension values: {dict_keys - expected_keys} for {dim_name}"
-                logger.error(msg)
                 if not ALLOW_EXTRA_KEYS:  # pragma: no cover
                     raise ValueError(f"{msg}\nExpected keys: {expected_keys}")
                 else:
@@ -228,7 +227,7 @@ class StrNumDict(RootModel[Any]):
     """A class contains string key and numeric value"""
     model_config = ConfigDict(strict=True)
-    root: dict[str, float | int]
+    root: dict[str, float | int | None]
 def remove_dimensions(raw_metric_bundle: dict[str, Any], dimensions: str | list[str]) -> dict[str, Any]:
@@ -542,6 +541,13 @@ def _walk_results(
             yield ScalarMetricValue(
                 dimensions=metadata, value=value, attributes=results.get(MetricCV.ATTRIBUTES.value)
             )
+        elif value is None:
+            # Replace any None values with NaN
+            # This translates null values in JSON to Python NaNs
+            # Missing values are different from NaN values
+            yield ScalarMetricValue(
+                dimensions=metadata, value=np.nan, attributes=results.get(MetricCV.ATTRIBUTES.value)
+            )
         else:
             yield from _walk_results(dimensions[1:], value, {**metadata})

{climate_ref_core-0.5.4 → climate_ref_core-0.6.0}/tests/unit/test_dataset_registry/test_dataset_registry.py RENAMED Viewed

@@ -5,10 +5,13 @@ import pytest
 from climate_ref_core.dataset_registry import (
     DatasetRegistryManager,
+    _verify_hash_matches,
     dataset_registry_manager,
     fetch_all_files,
 )
+NUM_OBS4REF_FILES = 67
 @pytest.fixture
 def fake_registry_file():
@@ -99,28 +102,73 @@ class TestDatasetRegistry:
 @pytest.mark.parametrize("symlink", [True, False])
-def test_fetch_all_files(mocker, tmp_path, symlink):
+@pytest.mark.parametrize("verify", [True, False])
+def test_fetch_all_files(mocker, tmp_path, symlink, verify):
+    mock_verify = mocker.patch("climate_ref_core.dataset_registry._verify_hash_matches")
     downloaded_file = tmp_path / "out.txt"
     downloaded_file.write_text("foo")
     registry = dataset_registry_manager["obs4ref"]
     registry.fetch = mocker.MagicMock(return_value=downloaded_file)
-    fetch_all_files(registry, "obs4ref", tmp_path, symlink=symlink)
-    assert registry.fetch.call_count == 36
+    fetch_all_files(registry, "obs4ref", tmp_path, symlink=symlink, verify=verify)
+    assert registry.fetch.call_count == NUM_OBS4REF_FILES
-    expected_file = (
-        tmp_path / "obs4REF/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc"
-    )
+    key = "obs4REF/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc"
+    expected_file = tmp_path / key
     assert expected_file.exists()
     assert expected_file.is_symlink() == symlink
     assert expected_file.read_text() == "foo"
+    if verify:
+        mock_verify.assert_any_call(expected_file, registry.registry[key])
+    else:
+        mock_verify.assert_not_called()
+def test_verify_hash_matches(mocker, tmp_path):
+    expected_hash = "sha256:expectedhashvalue"
+    mock_hashes = mocker.patch("climate_ref_core.dataset_registry.pooch.hashes")
+    mock_hashes.hash_algorithm.return_value = "sha256"
+    mock_hashes.file_hash.return_value = "expectedhashvalue"
+    file_path = tmp_path / "file.txt"
+    file_path.touch()
+    _verify_hash_matches(file_path, expected_hash)
+def test_verify_hash_missing_file(tmp_path):
+    expected_hash = "sha256:expectedhashvalue"
+    file_path = tmp_path / "file.txt"
+    with pytest.raises(FileNotFoundError, match="file.txt does not exist. Cannot verify hash"):
+        _verify_hash_matches(file_path, expected_hash)
+def test_verify_hash_differs(mocker, tmp_path):
+    expected_hash = "sha256:expectedhashvalue"
+    mock_hashes = mocker.patch("climate_ref_core.dataset_registry.pooch.hashes")
+    mock_hashes.hash_algorithm.return_value = "sha256"
+    mock_hashes.file_hash.return_value = "opps"
+    file_path = tmp_path / "file.txt"
+    file_path.touch()
+    with pytest.raises(
+        ValueError, match=f"does not match the known hash. expected {expected_hash} but got opps."
+    ):
+        _verify_hash_matches(file_path, expected_hash)
 def test_fetch_all_files_no_output(mocker):
     registry = dataset_registry_manager["obs4ref"]
     registry.fetch = mocker.MagicMock()
     fetch_all_files(registry, "obs4ref", None)
-    assert registry.fetch.call_count == 36
+    assert registry.fetch.call_count == NUM_OBS4REF_FILES