climate-ref-core 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
1
+ """
2
+ Registry-based dataset request implementation.
3
+
4
+ This module provides request classes for fetching datasets from pooch registries
5
+ (e.g., pmp-climatology) rather than ESGF.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from collections.abc import Callable
12
+ from typing import Any
13
+
14
+ import pandas as pd
15
+ from loguru import logger
16
+
17
+ from climate_ref_core.dataset_registry import dataset_registry_manager
18
+
19
+ # Number of path parts in PMP climatology registry keys
20
+ _PMP_CLIMATOLOGY_PATH_PARTS = 5
21
+ # Number of path parts in obs4REF registry keys
22
+ _OBS4REF_PATH_PARTS = 8
23
+
24
+
25
def _parse_obs4ref_key(key: str) -> dict[str, Any]:
    """
    Parse an obs4REF registry key to extract metadata.

    Keys follow the pattern:
    obs4REF/{institution_id}/{source_id}/{frequency}/{variable_id}/{grid_label}/{version}/{filename}

    Where filename is:
    {variable_id}_{frequency}_{source_id}_{inst_short}_{grid_label}_{time_range}.nc

    Parameters
    ----------
    key
        The registry key (path) to parse

    Returns
    -------
    Dictionary with parsed metadata, or empty dict if parsing fails
    """
    # Example: obs4REF/MOHC/HadISST-1-1/mon/ts/gn/v20250415/ts_mon_HadISST-1-1_PCMDI_gn_187001-202501.nc
    parts = key.split("/")
    if len(parts) != _OBS4REF_PATH_PARTS:
        # Interpolate the constant so the message stays correct if the layout changes
        logger.debug(f"Unexpected obs4REF key format (expected {_OBS4REF_PATH_PARTS} parts): {key}")
        return {}

    _, institution_id, _source_id, _frequency, _variable_id, _grid_label, version, filename = parts

    # Parse filename: {var}_{freq}_{source_id}_{inst_short}_{grid}_{time_range}.nc
    # Handle source_ids with hyphens (e.g., "HadISST-1-1", "GPCP-Monthly-3-2");
    # the character classes exclude "_", so each group maps to one "_"-separated token.
    filename_pattern = re.compile(
        r"^(?P<variable_id>[a-zA-Z0-9]+)_"
        r"(?P<frequency>[a-z]+)_"
        r"(?P<source_id>[A-Za-z0-9-]+)_"
        r"(?P<institution_short>[A-Za-z0-9-]+)_"
        r"(?P<grid_label>[a-zA-Z]+)_"
        r"(?P<time_range>\d+-\d+)\.nc$"
    )

    match = filename_pattern.match(filename)
    if not match:
        # BUG FIX: the f-string previously had no placeholder, so the offending
        # filename was never included in the log message.
        logger.debug(f"obs4REF filename doesn't match expected pattern: {filename}")
        return {}

    metadata = match.groupdict()

    # Add path-derived metadata (can override filename metadata for consistency)
    metadata["institution_id"] = institution_id
    metadata["version"] = version

    # Parse time range (format: YYYYMM-YYYYMM)
    time_parts = metadata["time_range"].split("-")
    if len(time_parts) == 2:  # noqa: PLR2004
        metadata["time_start"] = time_parts[0]
        metadata["time_end"] = time_parts[1]

    # Add the full key for reference
    metadata["key"] = key

    return metadata
84
+
85
+
86
def _parse_pmp_climatology_key(key: str) -> dict[str, Any]:
    """
    Parse a PMP climatology registry key to extract metadata.

    Keys follow the pattern:
    PMP_obs4MIPsClims/{variable_id}/{grid_label}/{version}/{filename}

    Where filename is:
    {variable_id}_mon_{source_id}_{institution_id}_{grid_label}_{time_range}_AC_{version}_{resolution}.nc

    Parameters
    ----------
    key
        The registry key (path) to parse

    Returns
    -------
    Dictionary with parsed metadata, or empty dict if parsing fails
    """
    # Example: PMP_obs4MIPsClims/psl/gr/v20250224/
    #          psl_mon_ERA-5_PCMDI_gr_198101-200412_AC_v20250224_2.5x2.5.nc
    parts = key.split("/")
    if len(parts) != _PMP_CLIMATOLOGY_PATH_PARTS:
        # Interpolate the constant so the message stays correct if the layout changes
        logger.debug(f"Unexpected key format (expected {_PMP_CLIMATOLOGY_PATH_PARTS} parts): {key}")
        return {}

    _, _variable_id_dir, _grid_label, _version, filename = parts

    # Parse filename: {var}_mon_{source_id}_{inst_id}_{grid}_{time}_AC_{ver}_{res}.nc
    # Handle source_ids with hyphens (e.g., "ERA-5", "GPCP-Monthly-3-2");
    # the character classes exclude "_", so greedy matching cannot cross tokens.
    filename_pattern = re.compile(
        r"^(?P<variable_id>[a-z]+)_mon_"
        r"(?P<source_id>[A-Za-z0-9-]+)_"
        r"(?P<institution_id>[A-Za-z0-9]+)_"
        r"(?P<grid_label>[a-z]+)_"
        r"(?P<time_range>\d+-\d+)_AC_"
        r"(?P<version>v\d+)_"
        r"(?P<resolution>.+)\.nc$"
    )

    match = filename_pattern.match(filename)
    if not match:
        # BUG FIX: the f-string previously had no placeholder, so the offending
        # filename was never included in the log message.
        logger.debug(f"Filename doesn't match expected pattern: {filename}")
        return {}

    metadata = match.groupdict()

    # Parse time range (format: YYYYMM-YYYYMM)
    time_parts = metadata["time_range"].split("-")
    if len(time_parts) == 2:  # noqa: PLR2004
        metadata["time_start"] = time_parts[0]
        metadata["time_end"] = time_parts[1]

    # Add the full key for reference
    metadata["key"] = key

    return metadata
143
+
144
+
145
def _matches_facets(
    metadata: dict[str, Any],
    facets: dict[str, str | tuple[str, ...]],
) -> bool:
    """
    Check whether ``metadata`` satisfies every requested facet.

    Parameters
    ----------
    metadata
        Parsed metadata dictionary
    facets
        Facets to match against. Values can be strings or tuples of strings.

    Returns
    -------
    True if all facets match
    """

    def _allowed(wanted: str | tuple[str, ...]) -> tuple[str, ...]:
        # A bare string means exactly one acceptable value
        return (wanted,) if isinstance(wanted, str) else wanted

    return all(
        name in metadata and metadata[name] in _allowed(wanted)
        for name, wanted in facets.items()
    )
174
+
175
+
176
class RegistryRequest:
    """
    Request for data from a pooch registry (e.g., pmp-climatology).

    These data are fetched from a pooch registry rather than ESGF.
    This is useful for pre-processed datasets like PMP climatologies
    that are hosted externally but not on ESGF.

    Parameters
    ----------
    slug
        Unique identifier for this request
    registry_name
        Name of the registry to fetch from (e.g., "pmp-climatology")
    facets
        Facets to filter datasets (e.g., {"variable_id": "psl", "source_id": "ERA-5"})
    source_type
        Type of dataset source (default: "PMPClimatology")
    time_span
        Optional time range filter (not used for registry filtering, but required for protocol)

    Example
    -------
    ```python
    request = RegistryRequest(
        slug="era5-psl",
        registry_name="pmp-climatology",
        facets={"variable_id": "psl", "source_id": "ERA-5"},
    )
    df = request.fetch_datasets()
    ```
    """

    def __init__(
        self,
        slug: str,
        registry_name: str,
        facets: dict[str, str | tuple[str, ...]],
        source_type: str = "PMPClimatology",
        time_span: tuple[str, str] | None = None,
    ) -> None:
        self.slug = slug
        self.registry_name = registry_name
        self.facets = facets
        self.source_type = source_type
        self.time_span = time_span

    def __repr__(self) -> str:
        return (
            f"RegistryRequest(slug={self.slug!r}, registry_name={self.registry_name!r}, "
            f"facets={self.facets!r}, source_type={self.source_type!r}, time_span={self.time_span!r})"
        )

    def _get_parser(self) -> Callable[[str], dict[str, Any]]:
        """Get the appropriate parser function based on registry name."""
        if self.registry_name == "pmp-climatology":
            return _parse_pmp_climatology_key
        elif self.registry_name == "obs4ref":
            return _parse_obs4ref_key
        else:
            # Default to obs4ref parser as fallback
            logger.warning(f"Unknown registry '{self.registry_name}', using obs4ref parser")
            return _parse_obs4ref_key

    def fetch_datasets(self) -> pd.DataFrame:
        """
        Fetch matching datasets from the registry.

        Returns
        -------
        DataFrame containing dataset metadata and file paths.
        Each row represents one file, with columns for metadata
        and a 'files' column containing a list with the file path.

        Raises
        ------
        ValueError
            If ``registry_name`` is not known to the registry manager.
        """
        logger.info(f"Fetching from registry '{self.registry_name}' for request: {self.slug}")

        try:
            registry = dataset_registry_manager[self.registry_name]
        except KeyError as err:
            # BUG FIX: chain the KeyError so the original lookup failure is
            # preserved in the traceback (PEP 3134 / ruff B904).
            raise ValueError(
                f"Registry '{self.registry_name}' not found. "
                f"Available registries: {list(dataset_registry_manager.keys())}"
            ) from err

        parser = self._get_parser()
        matching_rows: list[dict[str, Any]] = []

        for key in registry.registry:
            # Parse metadata from the registry key
            metadata = parser(key)
            if not metadata:
                continue

            # Check if it matches the requested facets
            if not _matches_facets(metadata, self.facets):
                continue

            # Fetch the file (downloads if not cached)
            try:
                file_path = registry.fetch(key)
                logger.debug(f"Fetched: {key} -> {file_path}")
            except Exception as e:
                # Best-effort: a single failed download should not abort the request
                logger.warning(f"Failed to fetch {key}: {e}")
                continue

            # Build row compatible with ESGFFetcher expectations
            row = {
                **metadata,
                "files": [file_path],
                "path": file_path,
            }
            matching_rows.append(row)

        if not matching_rows:
            logger.warning(f"No datasets found matching facets: {self.facets}")
            return pd.DataFrame()

        result = pd.DataFrame(matching_rows)

        # Filter to only the latest version for each unique dataset.
        # Versions look like "vYYYYMMDD", so lexicographic max picks the newest.
        if "version" in result.columns:
            # Only group by the identity columns that actually exist in the DataFrame
            group_by_cols = [col for col in ("source_id", "variable_id", "grid_label") if col in result.columns]
            if group_by_cols:
                max_version = result.groupby(group_by_cols, sort=False)["version"].transform("max")
                result = result[result["version"] == max_version]

        logger.info(f"Found {len(result)} datasets matching request: {self.slug}")

        return result
@@ -67,3 +67,27 @@ class DiagnosticError(RefException):
67
67
  def __reduce__(self) -> tuple[type["DiagnosticError"], tuple[str, Any]]:
68
68
  # Return a tuple: (callable, args_tuple_for_reconstruction)
69
69
  return (self.__class__, (self.message, self.result))
70
+
71
+
72
class TestCaseError(RefException):
    """Raised when there is an error with a test case."""
    # The docstring is the class body; the previous `pass` was redundant.
76
+
77
+
78
class TestCaseNotFoundError(TestCaseError):
    """Raised when a test case is not found."""
    # The docstring is the class body; the previous `pass` was redundant.
82
+
83
+
84
class NoTestDataSpecError(TestCaseError):
    """Raised when a diagnostic has no test_data_spec."""
    # The docstring is the class body; the previous `pass` was redundant.
88
+
89
+
90
class DatasetResolutionError(TestCaseError):
    """Raised when datasets cannot be resolved for a test case."""
    # The docstring is the class body; the previous `pass` was redundant.
@@ -175,6 +175,105 @@ class DiagnosticProvider:
175
175
  """
176
176
  return self._diagnostics[slug.lower()]
177
177
 
178
+ def setup(
179
+ self,
180
+ config: Config,
181
+ *,
182
+ skip_env: bool = False,
183
+ skip_data: bool = False,
184
+ ) -> None:
185
+ """
186
+ Perform all setup required before offline execution.
187
+
188
+ This calls setup_environment and fetch_data in the correct order.
189
+ Override individual hooks for fine-grained control.
190
+
191
+ This method MUST be idempotent - safe to call multiple times.
192
+
193
+ Parameters
194
+ ----------
195
+ config
196
+ The application configuration
197
+ skip_env
198
+ If True, skip environment setup (e.g., conda)
199
+ skip_data
200
+ If True, skip data fetching
201
+ """
202
+ if not skip_env:
203
+ self.setup_environment(config)
204
+ if not skip_data:
205
+ self.fetch_data(config)
206
+
207
+ def setup_environment(self, config: Config) -> None:
208
+ """
209
+ Set up the execution environment (e.g., conda environment).
210
+
211
+ Default implementation does nothing. Override in subclasses
212
+ that require environment setup.
213
+
214
+ This method MUST be idempotent.
215
+
216
+ Parameters
217
+ ----------
218
+ config
219
+ The application configuration
220
+ """
221
+ pass
222
+
223
+ def fetch_data(self, config: Config) -> None:
224
+ """
225
+ Fetch all data required for offline execution.
226
+
227
+ This includes reference datasets, climatology files, map files,
228
+ recipes, or any other data the provider needs.
229
+
230
+ Default implementation does nothing. Override in subclasses
231
+ that require data fetching. Providers are responsible for
232
+ determining what data they need and how to fetch it.
233
+
234
+ Data should be downloaded to the pooch cache (via `fetch_all_files`
235
+ with `output_dir=None`). Diagnostics can then access data via
236
+ `registry.abspath`.
237
+
238
+ This method MUST be idempotent.
239
+
240
+ Parameters
241
+ ----------
242
+ config
243
+ The application configuration
244
+ """
245
+ pass
246
+
247
+ def validate_setup(self, config: Config) -> bool:
248
+ """
249
+ Validate that the provider is ready for offline execution.
250
+
251
+ Returns True if setup is complete and valid, False otherwise.
252
+ Default implementation returns True.
253
+
254
+ Parameters
255
+ ----------
256
+ config
257
+ The application configuration
258
+
259
+ Returns
260
+ -------
261
+ bool
262
+ True if setup is valid and complete
263
+ """
264
+ return True
265
+
266
+ def get_data_path(self) -> Path | None:
267
+ """
268
+ Get the path where this provider's data is cached.
269
+
270
+ Returns
271
+ -------
272
+ Path | None
273
+ The data cache path, or None if the provider doesn't use cached data.
274
+ """
275
+ return None
276
+
178
277
 
179
278
  def import_provider(fqn: str) -> DiagnosticProvider:
180
279
  """
@@ -316,7 +415,7 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
316
415
  self._conda_exe: Path | None = None
317
416
  self._prefix: Path | None = None
318
417
  self.url = f"git+{repo}@{tag_or_commit}" if repo and tag_or_commit else None
319
- self.env_vars: dict[str, str] = {}
418
+ self.env_vars: dict[str, str] = os.environ.copy()
320
419
 
321
420
  @property
322
421
  def prefix(self) -> Path:
@@ -338,9 +437,26 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
338
437
  """Configure the provider."""
339
438
  super().configure(config)
340
439
  self.prefix = config.paths.software / "conda"
440
+ self.env_vars.setdefault("HOME", str(self.prefix))
441
+
442
+ def _is_stale(self, path: Path) -> bool:
443
+ """Check if a file is older than `MICROMAMBA_MAX_AGE`.
444
+
445
+ Parameters
446
+ ----------
447
+ path
448
+ The path to the file to check.
449
+
450
+ Returns
451
+ -------
452
+ True if the file is older than `MICROMAMBA_MAX_AGE`, False otherwise.
453
+ """
454
+ creation_time = datetime.datetime.fromtimestamp(path.stat().st_ctime)
455
+ age = datetime.datetime.now() - creation_time
456
+ return age > MICROMAMBA_MAX_AGE
341
457
 
342
458
  def _install_conda(self, update: bool) -> Path:
343
- """Install micromamba in a temporary location.
459
+ """Install micromamba in a specific location.
344
460
 
345
461
  Parameters
346
462
  ----------
@@ -354,20 +470,15 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
354
470
  """
355
471
  conda_exe = self.prefix / "micromamba"
356
472
 
357
- if conda_exe.exists() and update:
358
- # Only update if the executable is older than `MICROMAMBA_MAX_AGE`.
359
- creation_time = datetime.datetime.fromtimestamp(conda_exe.stat().st_ctime)
360
- age = datetime.datetime.now() - creation_time
361
- if age < MICROMAMBA_MAX_AGE:
362
- update = False
363
-
364
- if not conda_exe.exists() or update:
473
+ if not conda_exe.exists() or update or self._is_stale(conda_exe):
365
474
  logger.info("Installing conda")
366
475
  self.prefix.mkdir(parents=True, exist_ok=True)
367
- response = requests.get(_get_micromamba_url(), timeout=120)
476
+ response = requests.get(_get_micromamba_url(), timeout=120, stream=True)
368
477
  response.raise_for_status()
369
478
  with conda_exe.open(mode="wb") as file:
370
- file.write(response.content)
479
+ for chunk in response.iter_content(chunk_size=8192):
480
+ if chunk: # Filter out keep-alive new chunks
481
+ file.write(chunk)
371
482
  conda_exe.chmod(stat.S_IRWXU)
372
483
  logger.info("Successfully installed conda.")
373
484
 
@@ -428,7 +539,7 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
428
539
  f"{self.env_path}",
429
540
  ]
430
541
  logger.debug(f"Running {' '.join(cmd)}")
431
- subprocess.run(cmd, check=True) # noqa: S603
542
+ subprocess.run(cmd, check=True, env=self.env_vars) # noqa: S603
432
543
 
433
544
  if self.url is not None:
434
545
  logger.info(f"Installing development version of {self.slug} from {self.url}")
@@ -443,7 +554,7 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
443
554
  self.url,
444
555
  ]
445
556
  logger.debug(f"Running {' '.join(cmd)}")
446
- subprocess.run(cmd, check=True) # noqa: S603
557
+ subprocess.run(cmd, check=True, env=self.env_vars) # noqa: S603
447
558
 
448
559
  def run(self, cmd: Iterable[str]) -> None:
449
560
  """
@@ -476,8 +587,6 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
476
587
  *cmd,
477
588
  ]
478
589
  logger.info(f"Running '{' '.join(cmd)}'")
479
- env_vars = os.environ.copy()
480
- env_vars.update(self.env_vars)
481
590
  try:
482
591
  # This captures the log output until the execution is complete
483
592
  # We could poll using `subprocess.Popen` if we want something more responsive
@@ -487,7 +596,7 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
487
596
  stdout=subprocess.PIPE,
488
597
  stderr=subprocess.STDOUT,
489
598
  text=True,
490
- env=env_vars,
599
+ env=self.env_vars,
491
600
  )
492
601
  logger.info("Command output: \n" + res.stdout)
493
602
  logger.info("Command execution successful")
@@ -495,3 +604,20 @@ class CondaDiagnosticProvider(CommandLineDiagnosticProvider):
495
604
  logger.error(f"Failed to run {cmd}")
496
605
  logger.error(e.stdout)
497
606
  raise e
607
+
608
+ def setup_environment(self, config: Config) -> None:
609
+ """Set up the conda environment."""
610
+ self.create_env()
611
+
612
+ def validate_setup(self, config: Config) -> bool:
613
+ """Validate conda environment exists."""
614
+ env_exists = self.env_path.exists()
615
+ if not env_exists:
616
+ logger.error(
617
+ f"Conda environment for {self.slug} is not available at {self.env_path}. "
618
+ f"Please run `ref providers setup --provider {self.slug}` to install it."
619
+ )
620
+
621
+ # TODO: Could add more validation here (e.g., check packages installed)
622
+
623
+ return env_exists