PyPI - climate-ref - Versions diffs - 0.6.3__tar.gz → 0.6.5__tar.gz - Mend

climate-ref 0.6.3__tar.gz → 0.6.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

{climate_ref-0.6.3 → climate_ref-0.6.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climate-ref
-Version: 0.6.3
+Version: 0.6.5
 Summary: Application which runs the CMIP Rapid Evaluation Framework
 Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
 License-Expression: Apache-2.0

{climate_ref-0.6.3 → climate_ref-0.6.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "climate-ref"
-version = "0.6.3"
+version = "0.6.5"
 description = "Application which runs the CMIP Rapid Evaluation Framework"
 readme = "README.md"
 authors = [

{climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/config.py RENAMED Viewed

@@ -17,7 +17,7 @@ which always take precedence over any other configuration values.
 import importlib.resources
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal
 import tomlkit
 from attr import Factory
@@ -215,17 +215,17 @@ class DiagnosticProviderConfig:
     ```toml
     [[diagnostic_providers]]
-    provider = "climate_ref_esmvaltool.provider"
+    provider = "climate_ref_esmvaltool:provider"
     [diagnostic_providers.config]
     [[diagnostic_providers]]
-    provider = "climate_ref_ilamb.provider"
+    provider = "climate_ref_ilamb:provider"
     [diagnostic_providers.config]
     [[diagnostic_providers]]
-    provider = "climate_ref_pmp.provider"
+    provider = "climate_ref_pmp:provider"
     [diagnostic_providers.config]
     ```
@@ -311,10 +311,12 @@ def default_providers() -> list[DiagnosticProviderConfig]:
     if env_providers:
         return [DiagnosticProviderConfig(provider=provider) for provider in env_providers]
+    # Refer to https://setuptools.pypa.io/en/latest/userguide/entry_point.html#entry-points-for-plugins
+    # and https://packaging.python.org/en/latest/specifications/entry-points/
+    # to learn more about entry points.
     return [
-        DiagnosticProviderConfig(provider="climate_ref_esmvaltool.provider", config={}),
-        DiagnosticProviderConfig(provider="climate_ref_ilamb.provider", config={}),
-        DiagnosticProviderConfig(provider="climate_ref_pmp.provider", config={}),
+        DiagnosticProviderConfig(provider=entry_point.value, config={})
+        for entry_point in importlib.metadata.entry_points(group="climate-ref.providers")
     ]
@@ -352,6 +354,16 @@ class Config:
     [loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html#module-loguru._logger).
     """
+    cmip6_parser: Literal["drs", "complete"] = env_field("CMIP6_PARSER", default="complete")
+    """
+    Parser to use for CMIP6 datasets
+    This can be either `drs` or `complete`.
+    - `drs`: Use the DRS parser, which parses the dataset based on the DRS naming conventions.
+    - `complete`: Use the complete parser, which parses the dataset based on all available metadata.
+    """
     paths: PathConfig = Factory(PathConfig)  # noqa
     db: DbConfig = Factory(DbConfig)  # noqa
     executor: ExecutorConfig = Factory(ExecutorConfig)  # noqa

{climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/base.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Protocol, cast
+from typing import Any, Protocol, cast
 import pandas as pd
 from loguru import logger
@@ -35,6 +35,31 @@ def _log_duplicate_metadata(
         )
+class DatasetParsingFunction(Protocol):
+    """
+    Protocol for a function that parses metadata from a file or directory
+    """
+    def __call__(self, file: str, **kwargs: Any) -> dict[str, Any]:
+        """
+        Parse a file or directory and return metadata for the dataset
+        Parameters
+        ----------
+        file
+            File or directory to parse
+        kwargs
+            Additional keyword arguments to pass to the parsing function.
+        Returns
+        -------
+        :
+            Data catalog containing the metadata for the dataset
+        """
+        ...
 class DatasetAdapter(Protocol):
     """
     An adapter to provide a common interface for different dataset types
@@ -173,7 +198,7 @@ class DatasetAdapter(Protocol):
         slug = unique_slugs[0]
         dataset_metadata = data_catalog_dataset[list(self.dataset_specific_metadata)].iloc[0].to_dict()
-        dataset, created = db.get_or_create(DatasetModel, slug=slug, **dataset_metadata)
+        dataset, created = db.get_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
         if not created:
             logger.warning(f"{dataset} already exists in the database. Skipping")
             return None
@@ -212,6 +237,7 @@ class DatasetAdapter(Protocol):
                 {
                     **{k: getattr(file, k) for k in self.file_specific_metadata},
                     **{k: getattr(file.dataset, k) for k in self.dataset_specific_metadata},
+                    "finalised": file.dataset.finalised,
                 }
                 for file in result
             ],

{climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/cmip6.py RENAMED Viewed

@@ -1,18 +1,17 @@
 from __future__ import annotations
-import traceback
 import warnings
 from datetime import datetime
 from pathlib import Path
 from typing import Any
 import pandas as pd
-import xarray as xr
 from ecgtools import Builder
-from ecgtools.parsers.utilities import extract_attr_with_regex  # type: ignore
 from loguru import logger
-from climate_ref.datasets.base import DatasetAdapter
+from climate_ref.config import Config
+from climate_ref.datasets.base import DatasetAdapter, DatasetParsingFunction
+from climate_ref.datasets.cmip6_parsers import parse_cmip6_complete, parse_cmip6_drs
 from climate_ref.models.dataset import CMIP6Dataset
@@ -22,16 +21,19 @@ def _parse_datetime(dt_str: pd.Series[str]) -> pd.Series[datetime | Any]:
     """
     def _inner(date_string: str | None) -> datetime | None:
-        if not date_string:
+        if not date_string or pd.isnull(date_string):
             return None
         # Try to parse the date string with and without milliseconds
-        try:
-            dt = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
-        except ValueError:
-            dt = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S.%f")
+        for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"):
+            try:
+                return datetime.strptime(date_string, fmt)
+            except ValueError:
+                continue
-        return dt
+        # If all parsing attempts fail, log an error and return None
+        logger.error(f"Failed to parse date string: {date_string}")
+        return None
     return pd.Series(
         [_inner(dt) for dt in dt_str],
@@ -44,15 +46,16 @@ def _apply_fixes(data_catalog: pd.DataFrame) -> pd.DataFrame:
     def _fix_parent_variant_label(group: pd.DataFrame) -> pd.DataFrame:
         if group["parent_variant_label"].nunique() == 1:
             return group
-        group["parent_variant_label"] = group["variant_label"].iloc[0]
+        group["parent_variant_label"] = group["parent_variant_label"].iloc[0]
         return group
-    data_catalog = (
-        data_catalog.groupby("instance_id")
-        .apply(_fix_parent_variant_label, include_groups=False)
-        .reset_index(level="instance_id")
-    )
+    if "parent_variant_label" in data_catalog:
+        data_catalog = (
+            data_catalog.groupby("instance_id")
+            .apply(_fix_parent_variant_label, include_groups=False)
+            .reset_index(level="instance_id")
+        )
     if "branch_time_in_child" in data_catalog:
         data_catalog["branch_time_in_child"] = _clean_branch_time(data_catalog["branch_time_in_child"])
@@ -68,88 +71,6 @@ def _clean_branch_time(branch_time: pd.Series[str]) -> pd.Series[float]:
     return pd.to_numeric(branch_time.astype(str).str.replace("D", ""), errors="coerce")
-def parse_cmip6(file: str) -> dict[str, Any]:
-    """
-    Parser for CMIP6
-    This function parses the CMIP6 dataset and returns a dictionary with the metadata.
-    This was copied from the ecgtools package, but we want to log the exception when it fails.
-    """
-    keys = sorted(
-        {
-            "activity_id",
-            "branch_method",
-            "branch_time_in_child",
-            "branch_time_in_parent",
-            "experiment",
-            "experiment_id",
-            "frequency",
-            "grid",
-            "grid_label",
-            "institution_id",
-            "nominal_resolution",
-            "parent_activity_id",
-            "parent_experiment_id",
-            "parent_source_id",
-            "parent_time_units",
-            "parent_variant_label",
-            "realm",
-            "product",
-            "source_id",
-            "source_type",
-            "sub_experiment",
-            "sub_experiment_id",
-            "table_id",
-            "variable_id",
-            "variant_label",
-        }
-    )
-    try:
-        with xr.open_dataset(file, chunks={}, use_cftime=True) as ds:
-            info = {key: ds.attrs.get(key) for key in keys}
-            info["member_id"] = info["variant_label"]
-            variable_id = info["variable_id"]
-            if variable_id:  # pragma: no branch
-                attrs = ds[variable_id].attrs
-                for attr in ["standard_name", "long_name", "units"]:
-                    info[attr] = attrs.get(attr)
-            # Set the default of # of vertical levels to 1
-            vertical_levels = 1
-            start_time, end_time = None, None
-            init_year = None
-            try:
-                vertical_levels = ds[ds.cf["vertical"].name].size
-            except (KeyError, AttributeError, ValueError):
-                ...
-            try:
-                start_time, end_time = str(ds.cf["T"][0].data), str(ds.cf["T"][-1].data)
-            except (KeyError, AttributeError, ValueError):
-                ...
-            if info.get("sub_experiment_id"):  # pragma: no branch
-                init_year = extract_attr_with_regex(info["sub_experiment_id"], r"\d{4}")
-                if init_year:  # pragma: no cover
-                    init_year = int(init_year)
-            info["vertical_levels"] = vertical_levels
-            info["init_year"] = init_year
-            info["start_time"] = start_time
-            info["end_time"] = end_time
-            if not (start_time and end_time):
-                info["time_range"] = None
-            else:
-                info["time_range"] = f"{start_time}-{end_time}"
-        info["path"] = str(file)
-        info["version"] = extract_attr_with_regex(str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}") or "v0"
-        return info
-    except Exception:
-        logger.exception(f"Failed to parse {file}")
-        return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
 class CMIP6DatasetAdapter(DatasetAdapter):
     """
     Adapter for CMIP6 datasets
@@ -191,6 +112,7 @@ class CMIP6DatasetAdapter(DatasetAdapter):
         "standard_name",
         "long_name",
         "units",
+        "finalised",
         slug_column,
     )
@@ -208,8 +130,30 @@ class CMIP6DatasetAdapter(DatasetAdapter):
         "grid_label",
     )
-    def __init__(self, n_jobs: int = 1):
+    def __init__(self, n_jobs: int = 1, config: Config | None = None):
         self.n_jobs = n_jobs
+        self.config = config or Config.default()
+    def get_parsing_function(self) -> DatasetParsingFunction:
+        """
+        Get the parsing function for CMIP6 datasets based on configuration
+        The parsing function used is determined by the `cmip6_parser` configuration value:
+        - "drs": Use the DRS parser
+        - "complete": Use the complete parser that extracts all available metadata (default)
+        Returns
+        -------
+        :
+            The appropriate parsing function based on configuration
+        """
+        parser_type = self.config.cmip6_parser
+        if parser_type == "complete":
+            logger.info("Using complete CMIP6 parser")
+            return parse_cmip6_complete
+        else:
+            logger.info(f"Using DRS CMIP6 parser (config value: {parser_type})")
+            return parse_cmip6_drs
     def find_local_datasets(self, file_or_directory: Path) -> pd.DataFrame:
         """
@@ -228,6 +172,8 @@ class CMIP6DatasetAdapter(DatasetAdapter):
         :
             Data catalog containing the metadata for the dataset
         """
+        parsing_function = self.get_parsing_function()
         with warnings.catch_warnings():
             # Ignore the DeprecationWarning from xarray
             warnings.simplefilter("ignore", DeprecationWarning)
@@ -237,7 +183,7 @@ class CMIP6DatasetAdapter(DatasetAdapter):
                 depth=10,
                 include_patterns=["*.nc"],
                 joblib_parallel_kwargs={"n_jobs": self.n_jobs},
-            ).build(parsing_func=parse_cmip6)  # type: ignore
+            ).build(parsing_func=parsing_function)
         datasets: pd.DataFrame = builder.df.drop(["init_year"], axis=1)
@@ -254,6 +200,14 @@ class CMIP6DatasetAdapter(DatasetAdapter):
             lambda row: "CMIP6." + ".".join([row[item] for item in drs_items]), axis=1
         )
+        # Add in any missing metadata columns
+        missing_columns = set(self.dataset_specific_metadata + self.file_specific_metadata) - set(
+            datasets.columns
+        )
+        if missing_columns:
+            for column in missing_columns:
+                datasets[column] = pd.NA
         # Temporary fix for some datasets
         # TODO: Replace with a standalone package that contains metadata fixes for CMIP6 datasets
         datasets = _apply_fixes(datasets)

climate_ref-0.6.5/src/climate_ref/datasets/cmip6_parsers.py ADDED Viewed

@@ -0,0 +1,189 @@
+"""
+CMIP6 parser functions for extracting metadata from netCDF files
+Additional non-official DRS's may be added in the future.
+"""
+import traceback
+from typing import Any
+import xarray as xr
+from ecgtools.parsers.cmip import parse_cmip6_using_directories  # type: ignore
+from ecgtools.parsers.utilities import extract_attr_with_regex  # type: ignore
+from loguru import logger
+def _parse_daterange(date_range: str) -> tuple[str | None, str | None]:
+    """
+    Parse a date range string into start and end dates
+    The output from this is an estimated date range until the file is completely parsed.
+    Parameters
+    ----------
+    date_range
+        Date range string in the format "YYYYMM-YYYYMM"
+    Returns
+    -------
+    :
+        Tuple containing start and end dates as strings in the format "YYYY-MM-DD"
+    """
+    try:
+        start, end = date_range.split("-")
+        if len(start) != 6 or len(end) != 6:  # noqa: PLR2004
+            raise ValueError("Date range must be in the format 'YYYYMM-YYYYMM'")
+        start = f"{start[:4]}-{start[4:6]}-01"
+        # Up to the 30th of the month, assuming a 30-day month
+        # These values will be corrected later when the file is parsed
+        end = f"{end[:4]}-{end[4:6]}-30"
+        return start, end
+    except ValueError:
+        logger.error(f"Invalid date range format: {date_range}")
+        return None, None
+def parse_cmip6_complete(file: str, **kwargs: Any) -> dict[str, Any]:
+    """
+    Complete parser for CMIP6 files
+    This parser loads each file and extracts all available metadata.
+    For some filesystems this may be slow, as it involves a lot of I/O operations.
+    Parameters
+    ----------
+    file
+        File to parse
+    kwargs
+        Additional keyword arguments (not used, but required for compatibility)
+    Returns
+    -------
+    :
+        Dictionary with extracted metadata
+    """
+    keys = sorted(
+        {
+            "activity_id",
+            "branch_method",
+            "branch_time_in_child",
+            "branch_time_in_parent",
+            "experiment",
+            "experiment_id",
+            "frequency",
+            "grid",
+            "grid_label",
+            "institution_id",
+            "nominal_resolution",
+            "parent_activity_id",
+            "parent_experiment_id",
+            "parent_source_id",
+            "parent_time_units",
+            "parent_variant_label",
+            "realm",
+            "product",
+            "source_id",
+            "source_type",
+            "sub_experiment",
+            "sub_experiment_id",
+            "table_id",
+            "variable_id",
+            "variant_label",
+        }
+    )
+    try:
+        with xr.open_dataset(file, chunks={}, use_cftime=True) as ds:
+            info = {key: ds.attrs.get(key) for key in keys}
+            info["member_id"] = info["variant_label"]
+            variable_id = info["variable_id"]
+            if variable_id:  # pragma: no branch
+                attrs = ds[variable_id].attrs
+                for attr in ["standard_name", "long_name", "units"]:
+                    info[attr] = attrs.get(attr)
+            # Set the default of # of vertical levels to 1
+            vertical_levels = 1
+            start_time, end_time = None, None
+            init_year = None
+            try:
+                vertical_levels = ds[ds.cf["vertical"].name].size
+            except (KeyError, AttributeError, ValueError):
+                ...
+            try:
+                start_time, end_time = str(ds.cf["T"][0].data), str(ds.cf["T"][-1].data)
+            except (KeyError, AttributeError, ValueError):
+                ...
+            if info.get("sub_experiment_id"):  # pragma: no branch
+                init_year = extract_attr_with_regex(info["sub_experiment_id"], r"\d{4}")
+                if init_year:  # pragma: no cover
+                    init_year = int(init_year)
+            info["vertical_levels"] = vertical_levels
+            info["init_year"] = init_year
+            info["start_time"] = start_time
+            info["end_time"] = end_time
+            if not (start_time and end_time):
+                info["time_range"] = None
+            else:
+                info["time_range"] = f"{start_time}-{end_time}"
+        info["path"] = str(file)
+        info["version"] = extract_attr_with_regex(str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}") or "v0"
+        # Mark the dataset as finalised
+        # This is used to indicate that the dataset has been fully parsed and is ready for use
+        info["finalised"] = True
+        return info
+    except Exception:
+        logger.exception(f"Failed to parse {file}")
+        return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
+def parse_cmip6_drs(file: str, **kwargs: Any) -> dict[str, Any]:
+    """
+    DRS parser for CMIP6 files
+    This parser extracts metadata according to the CMIP6 Data Reference Syntax (DRS).
+    This includes the essential metadata required to identify the dataset and is included in the filename.
+    Parameters
+    ----------
+    file
+        File to parse
+    kwargs
+        Additional keyword arguments (not used, but required for compatibility)
+    Returns
+    -------
+    :
+        Dictionary with extracted metadata
+    """
+    info: dict[str, Any] = parse_cmip6_using_directories(file)
+    if "INVALID_ASSET" in info:
+        logger.warning(f"Failed to parse {file}: {info['INVALID_ASSET']}")
+        return info
+    # The member_id is technically incorrect
+    # but for simplicity we are going to ignore sub-experiments for the DRS parser
+    info["variant_label"] = info["member_id"]
+    # Rename the `dcpp_init_year` key to `init_year` if it exists
+    if "dcpp_init_year" in info:
+        info["init_year"] = info.pop("dcpp_init_year")
+    if info.get("time_range"):
+        # Parse the time range if it exists
+        start_time, end_time = _parse_daterange(info["time_range"])
+        info["start_time"] = start_time
+        info["end_time"] = end_time
+    info["finalised"] = False
+    return info

{climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/obs4mips.py RENAMED Viewed

@@ -15,8 +15,17 @@ from climate_ref.datasets.cmip6 import _parse_datetime
 from climate_ref.models.dataset import Dataset, Obs4MIPsDataset
-def parse_obs4mips(file: str) -> dict[str, Any | None]:
-    """Parser for obs4mips"""
+def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
+    """
+    Parser for obs4mips
+    Parameters
+    ----------
+    file
+        File to parse
+    kwargs
+        Additional keyword arguments (not used, but required for protocol compatibility)
+    """
     keys = sorted(
         list(
             {
@@ -106,6 +115,7 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
     dataset_specific_metadata = (
         "activity_id",
+        "finalised",
         "frequency",
         "grid",
         "grid_label",
@@ -159,7 +169,7 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
             depth=10,
             include_patterns=["*.nc"],
             joblib_parallel_kwargs={"n_jobs": self.n_jobs},
-        ).build(parsing_func=parse_obs4mips)  # type: ignore[arg-type]
+        ).build(parsing_func=parse_obs4mips)
         datasets = builder.df
         if datasets.empty:
@@ -178,4 +188,5 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
         datasets["instance_id"] = datasets.apply(
             lambda row: "obs4MIPs." + ".".join([row[item] for item in drs_items]), axis=1
         )
+        datasets["finalised"] = True
         return datasets

climate-ref 0.6.3__tar.gz → 0.6.5__tar.gz

climate-ref 0.6.3__tar.gz → 0.6.5__tar.gz