climate_ref-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- climate_ref/__init__.py +30 -0
- climate_ref/_config_helpers.py +214 -0
- climate_ref/alembic.ini +114 -0
- climate_ref/cli/__init__.py +138 -0
- climate_ref/cli/_utils.py +68 -0
- climate_ref/cli/config.py +28 -0
- climate_ref/cli/datasets.py +205 -0
- climate_ref/cli/executions.py +201 -0
- climate_ref/cli/providers.py +84 -0
- climate_ref/cli/solve.py +23 -0
- climate_ref/config.py +475 -0
- climate_ref/constants.py +8 -0
- climate_ref/database.py +223 -0
- climate_ref/dataset_registry/obs4ref_reference.txt +2 -0
- climate_ref/dataset_registry/sample_data.txt +60 -0
- climate_ref/datasets/__init__.py +40 -0
- climate_ref/datasets/base.py +214 -0
- climate_ref/datasets/cmip6.py +202 -0
- climate_ref/datasets/obs4mips.py +224 -0
- climate_ref/datasets/pmp_climatology.py +15 -0
- climate_ref/datasets/utils.py +16 -0
- climate_ref/executor/__init__.py +274 -0
- climate_ref/executor/local.py +89 -0
- climate_ref/migrations/README +22 -0
- climate_ref/migrations/env.py +139 -0
- climate_ref/migrations/script.py.mako +26 -0
- climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +292 -0
- climate_ref/models/__init__.py +33 -0
- climate_ref/models/base.py +42 -0
- climate_ref/models/dataset.py +206 -0
- climate_ref/models/diagnostic.py +61 -0
- climate_ref/models/execution.py +306 -0
- climate_ref/models/metric_value.py +195 -0
- climate_ref/models/provider.py +39 -0
- climate_ref/provider_registry.py +146 -0
- climate_ref/py.typed +0 -0
- climate_ref/solver.py +395 -0
- climate_ref/testing.py +109 -0
- climate_ref-0.5.0.dist-info/METADATA +97 -0
- climate_ref-0.5.0.dist-info/RECORD +44 -0
- climate_ref-0.5.0.dist-info/WHEEL +4 -0
- climate_ref-0.5.0.dist-info/entry_points.txt +2 -0
- climate_ref-0.5.0.dist-info/licenses/LICENCE +201 -0
- climate_ref-0.5.0.dist-info/licenses/NOTICE +3 -0
climate_ref/models/dataset.py
@@ -0,0 +1,206 @@
import datetime
from typing import Any, ClassVar

from sqlalchemy import ForeignKey, func
from sqlalchemy.orm import Mapped, mapped_column, relationship

from climate_ref.models.base import Base
from climate_ref_core.datasets import SourceDatasetType


class Dataset(Base):
    """
    Represents a dataset

    A dataset is a collection of data files that is used as an input to the benchmarking process.
    Adding, removing or updating a dataset will trigger a new diagnostic calculation.

    A polymorphic association is used to capture the different types of datasets, as each
    dataset type may have different metadata fields.
    This enables the use of a single table to store all datasets,
    while still allowing specific metadata fields to be queried for each dataset type.
    """

    __tablename__ = "dataset"

    id: Mapped[int] = mapped_column(primary_key=True)
    slug: Mapped[str] = mapped_column(unique=True)
    """
    Globally unique identifier for the dataset.

    In the case of CMIP6 datasets, this is the instance_id.
    """
    dataset_type: Mapped[SourceDatasetType] = mapped_column(nullable=False)
    """
    Type of dataset
    """
    created_at: Mapped[datetime.datetime] = mapped_column(server_default=func.now())
    """
    When the dataset was added to the database
    """
    updated_at: Mapped[datetime.datetime] = mapped_column(server_default=func.now(), onupdate=func.now())
    """
    When the dataset was updated.

    Updating a dataset will trigger a new diagnostic calculation.
    """

    def __repr__(self) -> str:
        return f"<Dataset slug={self.slug} dataset_type={self.dataset_type}>"

    __mapper_args__: ClassVar[Any] = {"polymorphic_on": dataset_type}  # type: ignore


class DatasetFile(Base):
    """
    Captures the metadata for a file in a dataset

    A dataset may have multiple files, but is represented as a single dataset in the database.
    Much of the metadata will be duplicated for each file in the dataset,
    but this is more efficient for querying, filtering and building a data catalog.
    """

    __tablename__ = "dataset_file"

    id: Mapped[int] = mapped_column(primary_key=True)
    dataset_id: Mapped[int] = mapped_column(ForeignKey("dataset.id", ondelete="CASCADE"), nullable=False)
    """
    Foreign key to the dataset table
    """

    start_time: Mapped[datetime.datetime] = mapped_column(nullable=True)
    """
    Start time of a given file
    """

    end_time: Mapped[datetime.datetime] = mapped_column(nullable=True)
    """
    End time of a given file
    """

    path: Mapped[str] = mapped_column()
    """
    Prefix that describes where the dataset is stored relative to the data directory
    """

    dataset = relationship("Dataset", backref="files")


class CMIP6Dataset(Dataset):
    """
    Represents a CMIP6 dataset

    Fields that are not marked as required in
    https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf
    are optional.
    """

    __tablename__ = "cmip6_dataset"
    id: Mapped[int] = mapped_column(ForeignKey("dataset.id"), primary_key=True)

    activity_id: Mapped[str] = mapped_column()
    branch_method: Mapped[str] = mapped_column(nullable=True)
    branch_time_in_child: Mapped[float] = mapped_column(nullable=True)
    branch_time_in_parent: Mapped[float] = mapped_column(nullable=True)
    experiment: Mapped[str] = mapped_column()
    experiment_id: Mapped[str] = mapped_column()
    frequency: Mapped[str] = mapped_column()
    grid: Mapped[str] = mapped_column()
    grid_label: Mapped[str] = mapped_column()
    institution_id: Mapped[str] = mapped_column()
    long_name: Mapped[str] = mapped_column(nullable=True)
    member_id: Mapped[str] = mapped_column()
    nominal_resolution: Mapped[str] = mapped_column()
    parent_activity_id: Mapped[str] = mapped_column(nullable=True)
    parent_experiment_id: Mapped[str] = mapped_column(nullable=True)
    parent_source_id: Mapped[str] = mapped_column(nullable=True)
    parent_time_units: Mapped[str] = mapped_column(nullable=True)
    parent_variant_label: Mapped[str] = mapped_column(nullable=True)
    realm: Mapped[str] = mapped_column()
    product: Mapped[str] = mapped_column()
    source_id: Mapped[str] = mapped_column()
    standard_name: Mapped[str] = mapped_column()
    source_type: Mapped[str] = mapped_column()
    sub_experiment: Mapped[str] = mapped_column()
    sub_experiment_id: Mapped[str] = mapped_column()
    table_id: Mapped[str] = mapped_column()
    units: Mapped[str] = mapped_column()
    variable_id: Mapped[str] = mapped_column()
    variant_label: Mapped[str] = mapped_column()
    vertical_levels: Mapped[int] = mapped_column(nullable=True)
    version: Mapped[str] = mapped_column()

    instance_id: Mapped[str] = mapped_column()
    """
    Unique identifier for the dataset.
    """

    __mapper_args__: ClassVar[Any] = {"polymorphic_identity": SourceDatasetType.CMIP6}  # type: ignore


class Obs4MIPsDataset(Dataset):
    """
    Represents an obs4MIPs dataset

    TODO: Should the metadata fields be part of the file or dataset?
    """

    __tablename__ = "obs4mips_dataset"
    id: Mapped[int] = mapped_column(ForeignKey("dataset.id"), primary_key=True)

    activity_id: Mapped[str] = mapped_column()
    frequency: Mapped[str] = mapped_column()
    grid: Mapped[str] = mapped_column()
    grid_label: Mapped[str] = mapped_column()
    institution_id: Mapped[str] = mapped_column()
    long_name: Mapped[str] = mapped_column()
    nominal_resolution: Mapped[str] = mapped_column()
    realm: Mapped[str] = mapped_column()
    product: Mapped[str] = mapped_column()
    source_id: Mapped[str] = mapped_column()
    source_type: Mapped[str] = mapped_column()
    units: Mapped[str] = mapped_column()
    variable_id: Mapped[str] = mapped_column()
    variant_label: Mapped[str] = mapped_column()
    vertical_levels: Mapped[int] = mapped_column()
    source_version_number: Mapped[str] = mapped_column()

    instance_id: Mapped[str] = mapped_column()
    """
    Unique identifier for the dataset.
    """
    __mapper_args__: ClassVar[Any] = {"polymorphic_identity": SourceDatasetType.obs4MIPs}  # type: ignore


class PMPClimatologyDataset(Dataset):
    """
    Represents a climatology dataset from PMP

    These data are similar to obs4MIPs datasets, but are post-processed
    """

    __tablename__ = "pmp_climatology_dataset"
    id: Mapped[int] = mapped_column(ForeignKey("dataset.id"), primary_key=True)

    activity_id: Mapped[str] = mapped_column()
    frequency: Mapped[str] = mapped_column()
    grid: Mapped[str] = mapped_column()
    grid_label: Mapped[str] = mapped_column()
    institution_id: Mapped[str] = mapped_column()
    long_name: Mapped[str] = mapped_column()
    nominal_resolution: Mapped[str] = mapped_column()
    realm: Mapped[str] = mapped_column()
    product: Mapped[str] = mapped_column()
    source_id: Mapped[str] = mapped_column()
    source_type: Mapped[str] = mapped_column()
    units: Mapped[str] = mapped_column()
    variable_id: Mapped[str] = mapped_column()
    variant_label: Mapped[str] = mapped_column()
    vertical_levels: Mapped[int] = mapped_column()
    source_version_number: Mapped[str] = mapped_column()

    instance_id: Mapped[str] = mapped_column()
    """
    Unique identifier for the dataset.
    """
    __mapper_args__: ClassVar[Any] = {"polymorphic_identity": SourceDatasetType.PMPClimatology}  # type: ignore
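The polymorphic mapping above means that a query against the base `Dataset` class yields typed subclass instances (`CMIP6Dataset`, `Obs4MIPsDataset`, `PMPClimatologyDataset`), with `dataset_type` as the discriminator column. A minimal sketch of that behaviour, assuming a throwaway in-memory SQLite engine rather than the package's own `Database` wrapper:

```python
# Illustrative sketch only; climate-ref normally manages engines and sessions
# through its Database class (climate_ref/database.py).
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session

from climate_ref.models.base import Base
from climate_ref.models.dataset import CMIP6Dataset, Dataset

engine = create_engine("sqlite://")  # in-memory database, an assumption for the demo
Base.metadata.create_all(engine)

with Session(engine) as session:
    # Selecting from the base class returns the concrete subclasses;
    # subclass-specific columns are loaded from the joined per-type tables.
    for ds in session.scalars(select(Dataset)):
        if isinstance(ds, CMIP6Dataset):
            print(ds.instance_id, ds.variable_id)
```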
climate_ref/models/diagnostic.py
@@ -0,0 +1,61 @@
from typing import TYPE_CHECKING

from sqlalchemy import ForeignKey, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship

from climate_ref.models.base import Base, CreatedUpdatedMixin

if TYPE_CHECKING:
    from climate_ref.models.execution import ExecutionGroup
    from climate_ref.models.provider import Provider


class Diagnostic(CreatedUpdatedMixin, Base):
    """
    Represents a diagnostic that can be calculated
    """

    __tablename__ = "diagnostic"
    __table_args__ = (UniqueConstraint("provider_id", "slug", name="diagnostic_ident"),)

    id: Mapped[int] = mapped_column(primary_key=True)
    slug: Mapped[str] = mapped_column(unique=True)
    """
    Unique identifier for the diagnostic

    This will be used to reference the diagnostic in the benchmarking process
    """

    name: Mapped[str] = mapped_column()
    """
    Long name of the diagnostic
    """

    provider_id: Mapped[int] = mapped_column(ForeignKey("provider.id"))
    """
    The provider that provides the diagnostic
    """

    enabled: Mapped[bool] = mapped_column(default=True)
    """
    Whether the diagnostic is enabled or not

    If a diagnostic is not enabled, it will not be used for any calculations.
    """

    provider: Mapped["Provider"] = relationship(back_populates="diagnostics")
    execution_groups: Mapped[list["ExecutionGroup"]] = relationship(back_populates="diagnostic")

    def __repr__(self) -> str:
        return f"<Diagnostic slug={self.slug}>"

    def full_slug(self) -> str:
        """
        Get the full slug of the diagnostic, including the provider slug

        Returns
        -------
        str
            Full slug of the diagnostic
        """
        return f"{self.provider.slug}/{self.slug}"
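`full_slug` namespaces a diagnostic by its provider, mirroring the `UniqueConstraint("provider_id", "slug")` above. A small sketch; `Provider`'s column set is not shown in this hunk, but `full_slug` only dereferences `provider.slug`, so everything else is left out:

```python
# Illustrative sketch; field values are invented, not shipped defaults.
from climate_ref.models.diagnostic import Diagnostic
from climate_ref.models.provider import Provider

diagnostic = Diagnostic(slug="annual-cycle", name="Annual cycle")
diagnostic.provider = Provider(slug="pmp")  # relationship set in memory, no session needed

# provider slug + diagnostic slug, matching the unique constraint's scope
assert diagnostic.full_slug() == "pmp/annual-cycle"
```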
climate_ref/models/execution.py
@@ -0,0 +1,306 @@
import enum
import pathlib
from typing import TYPE_CHECKING, Any

from loguru import logger
from sqlalchemy import Column, ForeignKey, Table, UniqueConstraint, func
from sqlalchemy.orm import Mapped, Session, mapped_column, relationship
from sqlalchemy.orm.query import RowReturningQuery

from climate_ref.models import Dataset
from climate_ref.models.base import Base, CreatedUpdatedMixin
from climate_ref_core.datasets import ExecutionDatasetCollection

if TYPE_CHECKING:
    from climate_ref.database import Database
    from climate_ref.models.diagnostic import Diagnostic
    from climate_ref.models.metric_value import MetricValue


class ExecutionGroup(CreatedUpdatedMixin, Base):
    """
    Represents a group of executions with a shared set of input datasets.

    When solving, the `ExecutionGroup`s are derived from the available datasets,
    the defined diagnostics and their data requirements. From the information in the
    group an execution can be triggered, which is an actual run of a diagnostic calculation
    with a specific set of input datasets.

    When the `ExecutionGroup` is created, it is marked dirty, meaning there are no
    current executions available. When an Execution has run successfully for an
    ExecutionGroup, the dirty mark is removed. After ingesting new data and
    solving again, if new versions of the input datasets are available, the
    ExecutionGroup will be marked dirty again.

    The diagnostic_id and key form a unique identifier for `ExecutionGroup`s.
    """

    __tablename__ = "execution_group"
    __table_args__ = (UniqueConstraint("diagnostic_id", "key", name="execution_ident"),)

    id: Mapped[int] = mapped_column(primary_key=True)

    diagnostic_id: Mapped[int] = mapped_column(ForeignKey("diagnostic.id"))
    """
    The diagnostic that this execution group belongs to
    """

    key: Mapped[str] = mapped_column(index=True)
    """
    Key for the datasets in this execution group.
    """

    dirty: Mapped[bool] = mapped_column(default=False)
    """
    Whether the execution group should be rerun

    An execution group is dirty if the diagnostic or any of the input datasets has been
    updated since the last execution.
    """

    selectors: Mapped[dict[str, Any]] = mapped_column(default=dict)
    """
    Collection of selectors that define the group

    These selectors are the unique key, value pairs that were selected during the initial
    groupby operation.
    These are also used to define the dataset key.
    """

    diagnostic: Mapped["Diagnostic"] = relationship(back_populates="execution_groups")
    executions: Mapped[list["Execution"]] = relationship(
        back_populates="execution_group", order_by="Execution.created_at"
    )

    def should_run(self, dataset_hash: str) -> bool:
        """
        Check if the diagnostic execution group needs to be executed.

        The diagnostic execution group should be run if:

        * the execution group is marked as dirty
        * no executions have ever been performed
        * the dataset hash is different from the last run
        """
        if not self.executions:
            logger.debug(f"Execution group {self.diagnostic.slug}/{self.key} was never executed")
            return True

        if self.executions[-1].dataset_hash != dataset_hash:
            logger.debug(
                f"Execution group {self.diagnostic.slug}/{self.key} hash mismatch:"
                f" {self.executions[-1].dataset_hash} != {dataset_hash}"
            )
            return True

        if self.dirty:
            logger.debug(f"Execution group {self.diagnostic.slug}/{self.key} is dirty")
            return True

        return False


execution_datasets = Table(
    "execution_dataset",
    Base.metadata,
    Column("execution_id", ForeignKey("execution.id")),
    Column("dataset_id", ForeignKey("dataset.id")),
)


class Execution(CreatedUpdatedMixin, Base):
    """
    Represents a single execution of a diagnostic

    Each result is part of a group of executions that share similar input datasets.

    An execution group might be run multiple times as new data becomes available;
    each run will create an `Execution`.
    """

    __tablename__ = "execution"

    id: Mapped[int] = mapped_column(primary_key=True)

    output_fragment: Mapped[str] = mapped_column()
    """
    Relative directory in which to store the output of the execution.

    During execution this directory is relative to the temporary directory.
    If the diagnostic execution is successful, the outputs will be moved to the final
    output directory and the temporary directory will be cleaned up.
    This directory may contain multiple input and output files.
    """

    execution_group_id: Mapped[int] = mapped_column(
        ForeignKey(
            "execution_group.id",
            name="fk_execution_id",
        )
    )
    """
    The execution group that this execution belongs to
    """

    dataset_hash: Mapped[str] = mapped_column(index=True)
    """
    Hash of the datasets used to calculate the diagnostic

    This is used to verify if an existing diagnostic execution has been run with the same datasets.
    """

    successful: Mapped[bool] = mapped_column(nullable=True)
    """
    Was the run successful
    """

    path: Mapped[str] = mapped_column(nullable=True)
    """
    Path to the output bundle

    Relative to the diagnostic execution result output directory
    """

    retracted: Mapped[bool] = mapped_column(default=False)
    """
    Whether the diagnostic execution result has been retracted or not

    This may happen if a dataset has been retracted, or if the diagnostic execution was incorrect.
    Rather than delete the values, they are marked as retracted.
    These data may still be visible in the UI, but should be marked as retracted.
    """

    execution_group: Mapped["ExecutionGroup"] = relationship(back_populates="executions")
    outputs: Mapped[list["ExecutionOutput"]] = relationship(back_populates="execution")
    values: Mapped[list["MetricValue"]] = relationship(back_populates="execution")

    datasets: Mapped[list[Dataset]] = relationship(secondary=execution_datasets)
    """
    The datasets used in this execution
    """

    def register_datasets(self, db: "Database", execution_dataset: ExecutionDatasetCollection) -> None:
        """
        Register the datasets used in the diagnostic calculation with the execution
        """
        for _, dataset in execution_dataset.items():
            db.session.execute(
                execution_datasets.insert(),
                [{"execution_id": self.id, "dataset_id": idx} for idx in dataset.index],
            )

    def mark_successful(self, path: pathlib.Path | str) -> None:
        """
        Mark the diagnostic execution as successful
        """
        # TODO: this needs to accept both a diagnostic and output bundle
        self.successful = True
        self.path = str(path)

    def mark_failed(self) -> None:
        """
        Mark the diagnostic execution as unsuccessful
        """
        self.successful = False


class ResultOutputType(enum.Enum):
    """
    Types of supported outputs

    These map to the categories of output in the CMEC output bundle
    """

    Plot = "plot"
    Data = "data"
    HTML = "html"


class ExecutionOutput(CreatedUpdatedMixin, Base):
    """
    An output generated as part of an execution.

    This output may be a plot, data file or HTML file.
    These outputs are defined in the CMEC output bundle
    """

    __tablename__ = "execution_output"

    id: Mapped[int] = mapped_column(primary_key=True)

    execution_id: Mapped[int] = mapped_column(ForeignKey("execution.id"), index=True)

    output_type: Mapped[ResultOutputType] = mapped_column(index=True)
    """
    Type of the output

    This will determine how the output is displayed
    """

    filename: Mapped[str] = mapped_column(nullable=True)
    """
    Path to the output

    Relative to the diagnostic execution result output directory
    """

    short_name: Mapped[str] = mapped_column(nullable=True)
    """
    Short key of the output

    This is unique for a given result and output type
    """

    long_name: Mapped[str] = mapped_column(nullable=True)
    """
    Human readable name describing the plot
    """

    description: Mapped[str] = mapped_column(nullable=True)
    """
    Long description describing the plot
    """

    execution: Mapped["Execution"] = relationship(back_populates="outputs")


def get_execution_group_and_latest(
    session: Session,
) -> RowReturningQuery[tuple[ExecutionGroup, Execution | None]]:
    """
    Query to get the most recent result for each execution group

    Parameters
    ----------
    session
        The database session to use for the query.

    Returns
    -------
        Query to get the most recent result for each execution group.
        The result is a tuple of the execution group and the most recent result,
        which can be None.
    """
    # Find the most recent result for each execution group
    # This uses an aggregate function because it is more efficient than order by
    subquery = (
        session.query(
            Execution.execution_group_id,
            func.max(Execution.created_at).label("latest_created_at"),
        )
        .group_by(Execution.execution_group_id)
        .subquery()
    )

    # Join the diagnostic execution with the latest result
    query = (
        session.query(ExecutionGroup, Execution)
        .outerjoin(subquery, ExecutionGroup.id == subquery.c.execution_group_id)
        .outerjoin(
            Execution,
            (Execution.execution_group_id == ExecutionGroup.id)
            & (Execution.created_at == subquery.c.latest_created_at),
        )
    )

    return query  # type: ignore
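`should_run` and `get_execution_group_and_latest` together drive the solver's re-run decisions: the query pairs each group with its latest execution, and the hash comparison decides whether a new run is needed. A hedged sketch of how the two could be combined; `find_stale_groups` and its `dataset_hashes` argument are hypothetical helpers for illustration, not part of the package:

```python
# Hypothetical helper, assuming an open SQLAlchemy session
# (climate-ref itself obtains sessions via its Database wrapper).
from sqlalchemy.orm import Session

from climate_ref.models.execution import get_execution_group_and_latest


def find_stale_groups(session: Session, dataset_hashes: dict[int, str]) -> list[int]:
    """Return ids of execution groups whose latest run is missing or outdated.

    dataset_hashes maps execution group id to the hash of the currently
    available input datasets (an assumed input, computed elsewhere).
    """
    stale = []
    for group, latest in get_execution_group_and_latest(session):
        current_hash = dataset_hashes.get(group.id, "")
        # latest is None when the group has never been executed;
        # should_run also returns True in that case via the empty executions list.
        if latest is None or group.should_run(current_hash):
            stale.append(group.id)
    return stale
```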