climate-ref-core 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- climate_ref_core/cmip6_to_cmip7.py +598 -0
- climate_ref_core/dataset_registry.py +43 -0
- climate_ref_core/diagnostics.py +10 -0
- climate_ref_core/env.py +37 -0
- climate_ref_core/esgf/__init__.py +21 -0
- climate_ref_core/esgf/base.py +122 -0
- climate_ref_core/esgf/cmip6.py +119 -0
- climate_ref_core/esgf/fetcher.py +138 -0
- climate_ref_core/esgf/obs4mips.py +94 -0
- climate_ref_core/esgf/registry.py +307 -0
- climate_ref_core/exceptions.py +24 -0
- climate_ref_core/providers.py +143 -17
- climate_ref_core/testing.py +621 -0
- {climate_ref_core-0.8.1.dist-info → climate_ref_core-0.9.0.dist-info}/METADATA +4 -2
- climate_ref_core-0.9.0.dist-info/RECORD +32 -0
- climate_ref_core-0.8.1.dist-info/RECORD +0 -24
- {climate_ref_core-0.8.1.dist-info → climate_ref_core-0.9.0.dist-info}/WHEEL +0 -0
- {climate_ref_core-0.8.1.dist-info → climate_ref_core-0.9.0.dist-info}/licenses/LICENCE +0 -0
- {climate_ref_core-0.8.1.dist-info → climate_ref_core-0.9.0.dist-info}/licenses/NOTICE +0 -0
climate_ref_core/env.py
CHANGED
|
@@ -32,4 +32,41 @@ def get_env() -> Env:
|
|
|
32
32
|
return env
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
def get_available_cpu_count() -> int:
    """
    Detect the number of CPU cores available considering cgroup limitations.

    Checks cgroup v2 (``cpu.max``) first, then falls back to the cgroup v1
    quota/period and cpuset files. If none of these are readable
    (i.e. not running in a cgroup-limited environment), the total CPU count
    is used.

    Returns
    -------
    :
        The number of allocated CPUs or total cpu count if not running in a
        cgroup-limited environment. Always at least 1.
    """
    try:
        # cgroup v2: a single file containing "<quota> <period>" or "max <period>"
        with open("/sys/fs/cgroup/cpu.max") as f:
            quota_str, period_str = f.read().split()
        if quota_str != "max":
            quota, period = int(quota_str), int(period_str)
            if quota > 0 and period > 0:
                # Round down, but never report fewer than one CPU
                return max(1, quota // period)
    except (OSError, ValueError):
        # No cgroup v2 controller (or unreadable/malformed file); try v1 below
        pass

    try:
        # cgroup v1: check for a CPU quota
        with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") as f:
            quota = int(f.read())
        with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us") as f:
            period = int(f.read())

        if quota > 0 and period > 0:
            # Round down, but never report fewer than one CPU
            # (a quota smaller than the period would otherwise yield 0)
            return max(1, quota // period)

        # If no quota (e.g. quota == -1 for "unlimited"), check for cpuset
        with open("/sys/fs/cgroup/cpuset/cpuset.cpus") as f:
            cpuset = f.read().strip()
        # Parse the cpuset string (e.g., "0-3", "0,2")
        count = 0
        for part in cpuset.split(","):
            if "-" in part:
                start, end = map(int, part.split("-"))
                count += end - start + 1
            else:
                count += 1
        return max(1, count)

    except (OSError, ValueError):
        # Not running in a cgroup-limited environment, cgroup files not
        # found/readable, or contents could not be parsed
        return os.cpu_count() or 1
|
|
70
|
+
|
|
71
|
+
|
|
35
72
|
env = get_env()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ESGF dataset fetching
|
|
3
|
+
|
|
4
|
+
This module provides classes for searching and fetching datasets from ESGF
|
|
5
|
+
(Earth System Grid Federation) and other data registries.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from climate_ref_core.esgf.base import ESGFRequest, IntakeESGFMixin
|
|
9
|
+
from climate_ref_core.esgf.cmip6 import CMIP6Request
|
|
10
|
+
from climate_ref_core.esgf.fetcher import ESGFFetcher
|
|
11
|
+
from climate_ref_core.esgf.obs4mips import Obs4MIPsRequest
|
|
12
|
+
from climate_ref_core.esgf.registry import RegistryRequest
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"CMIP6Request",
|
|
16
|
+
"ESGFFetcher",
|
|
17
|
+
"ESGFRequest",
|
|
18
|
+
"IntakeESGFMixin",
|
|
19
|
+
"Obs4MIPsRequest",
|
|
20
|
+
"RegistryRequest",
|
|
21
|
+
]
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base classes and protocols for ESGF data requests.
|
|
3
|
+
|
|
4
|
+
This module provides the infrastructure for fetching datasets from ESGF
|
|
5
|
+
using the intake-esgf package.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Protocol, runtime_checkable
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from intake_esgf import ESGFCatalog
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@runtime_checkable
|
|
15
|
+
class ESGFRequest(Protocol):
|
|
16
|
+
"""
|
|
17
|
+
Protocol for ESGF dataset requests.
|
|
18
|
+
|
|
19
|
+
Implementations provide the logic for searching ESGF and generating
|
|
20
|
+
output paths for downloaded datasets.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
slug: str
|
|
24
|
+
"""Unique identifier for this request."""
|
|
25
|
+
|
|
26
|
+
source_type: str
|
|
27
|
+
"""Type of dataset (e.g., 'CMIP6', 'obs4MIPs')."""
|
|
28
|
+
|
|
29
|
+
time_span: tuple[str, str] | None
|
|
30
|
+
"""Optional time range to filter datasets (start, end)."""
|
|
31
|
+
|
|
32
|
+
def fetch_datasets(self) -> pd.DataFrame:
|
|
33
|
+
"""
|
|
34
|
+
Fetch dataset metadata from ESGF.
|
|
35
|
+
|
|
36
|
+
Returns
|
|
37
|
+
-------
|
|
38
|
+
pd.DataFrame
|
|
39
|
+
DataFrame containing dataset metadata and file paths.
|
|
40
|
+
Must contain at minimum:
|
|
41
|
+
- key: A unique identifier for the dataset
|
|
42
|
+
- files: A list of files for the dataset
|
|
43
|
+
"""
|
|
44
|
+
...
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
|
|
48
|
+
"""
|
|
49
|
+
Deduplicate a dataset collection.
|
|
50
|
+
|
|
51
|
+
Uses the metadata from the first dataset in each group,
|
|
52
|
+
but expands the time range to the min/max timespan of the group.
|
|
53
|
+
|
|
54
|
+
Parameters
|
|
55
|
+
----------
|
|
56
|
+
datasets
|
|
57
|
+
The dataset collection
|
|
58
|
+
|
|
59
|
+
Returns
|
|
60
|
+
-------
|
|
61
|
+
pd.DataFrame
|
|
62
|
+
The deduplicated dataset collection spanning the times requested
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
|
|
66
|
+
first = group.iloc[[0]].copy()
|
|
67
|
+
if "time_start" in first.columns:
|
|
68
|
+
first["time_start"] = group["time_start"].min()
|
|
69
|
+
if "time_end" in first.columns:
|
|
70
|
+
first["time_end"] = group["time_end"].max()
|
|
71
|
+
return first
|
|
72
|
+
|
|
73
|
+
result: pd.DataFrame = (
|
|
74
|
+
datasets.groupby("key")
|
|
75
|
+
.apply(_deduplicate_group, include_groups=False) # type: ignore[call-overload]
|
|
76
|
+
.reset_index()
|
|
77
|
+
)
|
|
78
|
+
return result
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class IntakeESGFMixin:
    """
    Mixin that fetches datasets from ESGF using intake-esgf.

    Subclasses must define:
    - facets: dict[str, str | tuple[str, ...]]
    - remove_ensembles: bool
    - time_span: tuple[str, str] | None
    """

    facets: dict[str, str | tuple[str, ...]]
    remove_ensembles: bool
    time_span: tuple[str, str] | None

    def fetch_datasets(self) -> pd.DataFrame:
        """
        Fetch dataset metadata from ESGF.

        Returns
        -------
        pd.DataFrame
            Dataset metadata with a ``files`` column listing local paths,
            deduplicated by ``key``.

        Raises
        ------
        ValueError
            If the search returned no datasets.
        """
        facets: dict[str, Any] = dict(self.facets)
        if self.time_span:
            facets["file_start"] = self.time_span[0]
            facets["file_end"] = self.time_span[1]

        # Convert tuples to lists for intake-esgf compatibility
        for key, value in facets.items():
            if isinstance(value, tuple):
                facets[key] = list(value)

        cat = ESGFCatalog()  # type: ignore[no-untyped-call]
        cat.search(**facets)

        if self.remove_ensembles:
            cat.remove_ensembles()

        # Fail early with a clear message before attempting any downloads.
        # Previously this check ran after `to_path_dict`, which itself fails
        # on an empty catalog, so the intended ValueError was unreachable.
        if cat.df is None or cat.df.empty:
            raise ValueError("No datasets found for the given ESGF request")

        path_dict = cat.to_path_dict(prefer_streaming=False, minimal_keys=False, quiet=True)
        merged_df = cat.df.merge(pd.Series(path_dict, name="files"), left_on="key", right_index=True)

        if self.time_span:
            merged_df["time_start"] = self.time_span[0]
            merged_df["time_end"] = self.time_span[1]

        return _deduplicate_datasets(merged_df)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CMIP6 dataset request implementation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
from climate_ref_core.esgf.base import IntakeESGFMixin
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import xarray as xr
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def prefix_to_filename(ds: xr.Dataset, filename_prefix: str) -> str:
    """
    Build an output filename for ``ds`` from ``filename_prefix``.

    When the dataset carries a ``time`` dimension, the earliest and latest
    timestamps (formatted as YYYYMM) are appended as a range suffix;
    otherwise the prefix is used as-is.

    Parameters
    ----------
    ds
        Dataset
    filename_prefix
        Prefix for the filename (includes the different facets of the dataset)

    Returns
    -------
    str
        Filename for the dataset
    """
    if "time" not in ds.dims:
        return f"{filename_prefix}.nc"

    first = ds.time.min().dt.strftime("%Y%m").item()
    last = ds.time.max().dt.strftime("%Y%m").item()
    return f"{filename_prefix}_{first}-{last}.nc"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class CMIP6Request(IntakeESGFMixin):
    """
    A request for CMIP6 data, resolved against ESGF using the given facets.
    """

    source_type = "CMIP6"

    # Directory components of the CMIP6 DRS layout, in order.
    cmip6_path_items = (
        "mip_era",
        "activity_drs",
        "institution_id",
        "source_id",
        "experiment_id",
        "member_id",
        "table_id",
        "variable_id",
        "grid_label",
    )

    # Facets that make up a CMIP6 filename, in order.
    cmip6_filename_paths = (
        "variable_id",
        "table_id",
        "source_id",
        "experiment_id",
        "member_id",
        "grid_label",
    )

    # Facets recognised for searching ESGF.
    available_facets = (
        "mip_era",
        "activity_drs",
        "institution_id",
        "source_id",
        "experiment_id",
        "member_id",
        "table_id",
        "variable_id",
        "grid_label",
        "version",
        "data_node",
    )

    def __init__(
        self,
        slug: str,
        facets: dict[str, Any],
        remove_ensembles: bool = False,
        time_span: tuple[str, str] | None = None,
    ):
        """
        Initialize a CMIP6 request.

        Parameters
        ----------
        slug
            Unique identifier for this request
        facets
            ESGF search facets (e.g., source_id, variable_id, experiment_id)
        remove_ensembles
            If True, keep only one ensemble member per model
        time_span
            Optional time range filter (start, end) in YYYY-MM format

        Raises
        ------
        ValueError
            If a path or filename component is not a recognised facet.
        """
        self.slug = slug
        self.facets = facets
        self.remove_ensembles = remove_ensembles
        self.time_span = time_span

        # Sanity-check the class-level path/filename layouts against the
        # facets that ESGF actually exposes.
        for item in self.cmip6_path_items:
            if item not in self.available_facets:
                raise ValueError(f"Path item {item!r} not in available facets")
        for item in self.cmip6_filename_paths:
            if item not in self.available_facets:
                raise ValueError(f"Filename path {item!r} not in available facets")

    def __repr__(self) -> str:
        return f"CMIP6Request(slug={self.slug!r}, facets={self.facets!r})"
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ESGF dataset fetcher for downloading test data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from loguru import logger
|
|
12
|
+
|
|
13
|
+
from climate_ref_core.esgf.base import ESGFRequest
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from climate_ref_core.diagnostics import AbstractDiagnostic as Diagnostic
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ESGFFetcher:
    """
    Resolves ESGF requests into dataframes of locally-available files.

    Search and download are delegated to intake-esgf; any file that has to
    be fetched remotely lands in intake-esgf's cache directory.
    """

    def fetch_request(self, request: ESGFRequest) -> pd.DataFrame:
        """
        Fetch datasets for a single ESGF request.

        Parameters
        ----------
        request
            The ESGF request specifying what to fetch

        Returns
        -------
        pd.DataFrame
            DataFrame containing dataset metadata and file paths.
            Each row represents one file, with a 'path' column pointing
            to the file (either in intake-esgf's cache or one of the root data locations).

            This format is not identical to the DataCatalog, but it is broadly compatible.
        """
        logger.info(f"Fetching datasets for request: {request.slug}")

        # Search ESGF for matching datasets
        datasets_df = request.fetch_datasets()

        if datasets_df.empty:
            logger.warning(f"No datasets found for request: {request.slug}")
            return pd.DataFrame()

        logger.info(f"Found {len(datasets_df)} datasets for request: {request.slug}")

        # Expand each dataset into one output row per existing file,
        # carrying the dataset metadata plus a 'path' column.
        expanded_rows = []
        for _, dataset_row in datasets_df.iterrows():
            dataset_files = dataset_row.get("files", [])
            if not dataset_files:
                logger.warning(f"No files for dataset: {dataset_row.get('key', 'unknown')}")
                continue

            for dataset_file in dataset_files:
                if not Path(dataset_file).exists():
                    logger.warning(f"File not found (may need to download from ESGF): {dataset_file}")
                    continue

                record = dataset_row.to_dict()
                record["path"] = str(dataset_file)
                expanded_rows.append(record)

        if not expanded_rows:
            logger.warning(f"No files found for request: {request.slug}")
            return pd.DataFrame()

        result = pd.DataFrame(expanded_rows)
        result["source_type"] = request.source_type

        logger.info(f"Fetched {len(result)} files for request: {request.slug}")
        return result

    def fetch_for_test_case(
        self,
        requests: tuple[ESGFRequest, ...] | None,
    ) -> pd.DataFrame:
        """
        Fetch all data for a test case's requests.

        Parameters
        ----------
        requests
            The ESGF requests from the test case

        Returns
        -------
        pd.DataFrame
            Combined DataFrame with all datasets, grouped by source_type
        """
        if not requests:
            return pd.DataFrame()

        non_empty = [
            frame for frame in (self.fetch_request(req) for req in requests) if not frame.empty
        ]

        if not non_empty:
            return pd.DataFrame()

        return pd.concat(non_empty, ignore_index=True)

    def list_requests_for_diagnostic(self, diagnostic: Diagnostic) -> list[tuple[str, ESGFRequest]]:
        """
        List all ESGF requests for a diagnostic across all test cases.

        Parameters
        ----------
        diagnostic
            The diagnostic to list requests for

        Returns
        -------
        list[tuple[str, ESGFRequest]]
            List of (test_case_name, request) tuples
        """
        if diagnostic.test_data_spec is None:
            return []

        return [
            (test_case.name, request)
            for test_case in diagnostic.test_data_spec.test_cases
            if test_case.requests
            for request in test_case.requests
        ]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Obs4MIPs dataset request implementation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from climate_ref_core.esgf.base import IntakeESGFMixin
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Obs4MIPsRequest(IntakeESGFMixin):
    """
    Represents an Obs4MIPs dataset request.

    These data are fetched from ESGF based on the provided facets.
    """

    source_type = "obs4MIPs"

    # Directory components of the obs4MIPs DRS layout, in order.
    obs4mips_path_items = (
        "activity_id",
        "institution_id",
        "source_id",
        "variable_id",
        "grid_label",
    )

    # Facets that make up an obs4MIPs filename, in order.
    obs4mips_filename_paths = (
        "variable_id",
        "source_id",
        "grid_label",
    )

    # Facets recognised for searching ESGF.
    avail_facets = (
        "activity_id",
        "institution_id",
        "source_id",
        "frequency",
        "variable_id",
        "grid_label",
        "version",
        "data_node",
        "project",
    )
    # Alias matching the naming used by CMIP6Request.
    available_facets = avail_facets

    def __init__(
        self,
        slug: str,
        facets: dict[str, Any],
        remove_ensembles: bool = False,
        time_span: tuple[str, str] | None = None,
    ):
        """
        Initialize an Obs4MIPs request.

        Parameters
        ----------
        slug
            Unique identifier for this request
        facets
            ESGF search facets (e.g., source_id, variable_id)
        remove_ensembles
            If True, keep only one ensemble member (typically not relevant for obs)
        time_span
            Optional time range filter (start, end) in YYYY-MM format

        Raises
        ------
        ValueError
            If a path or filename component is not a recognised facet.
        """
        self.slug = slug
        # Copy so that the later defaulting of the 'project' facet in
        # fetch_datasets does not mutate the dictionary supplied by the caller
        self.facets = dict(facets)
        self.remove_ensembles = remove_ensembles
        self.time_span = time_span

        for key in self.obs4mips_path_items:
            if key not in self.avail_facets:
                raise ValueError(f"Path item {key!r} not in available facets")
        for key in self.obs4mips_filename_paths:
            if key not in self.avail_facets:
                raise ValueError(f"Filename path {key!r} not in available facets")

    def fetch_datasets(self) -> pd.DataFrame:
        """Fetch dataset metadata from ESGF with project=obs4MIPs."""
        # Ensure project facet is set to obs4MIPs; this only touches our own
        # copy of the facets, never the caller's dictionary
        self.facets.setdefault("project", "obs4MIPs")

        return super().fetch_datasets()

    def __repr__(self) -> str:
        return f"Obs4MIPsRequest(slug={self.slug!r}, facets={self.facets!r})"
|