climate-ref 0.6.0__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {climate_ref-0.6.0 → climate_ref-0.6.2}/PKG-INFO +4 -2
  2. {climate_ref-0.6.0 → climate_ref-0.6.2}/pyproject.toml +7 -3
  3. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/__init__.py +3 -3
  4. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/config.py +6 -6
  5. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/datasets.py +9 -2
  6. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/executions.py +12 -2
  7. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/providers.py +4 -1
  8. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/solve.py +4 -0
  9. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/config.py +2 -0
  10. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/executor/__init__.py +2 -1
  11. climate_ref-0.6.2/src/climate_ref/executor/hpc.py +320 -0
  12. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/executor/local.py +7 -1
  13. climate_ref-0.6.2/src/climate_ref/slurm.py +196 -0
  14. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/cli/test_config.py +6 -6
  15. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/cli/test_datasets.py +1 -1
  16. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/cli/test_executions.py +1 -1
  17. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/cli/test_root.py +2 -2
  18. climate_ref-0.6.2/tests/unit/executor/test_hpc_executor.py +110 -0
  19. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/test_config.py +8 -0
  20. climate_ref-0.6.2/tests/unit/test_slurm.py +363 -0
  21. {climate_ref-0.6.0 → climate_ref-0.6.2}/.gitignore +0 -0
  22. {climate_ref-0.6.0 → climate_ref-0.6.2}/Dockerfile +0 -0
  23. {climate_ref-0.6.0 → climate_ref-0.6.2}/LICENCE +0 -0
  24. {climate_ref-0.6.0 → climate_ref-0.6.2}/NOTICE +0 -0
  25. {climate_ref-0.6.0 → climate_ref-0.6.2}/README.md +0 -0
  26. {climate_ref-0.6.0 → climate_ref-0.6.2}/conftest.py +0 -0
  27. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/__init__.py +0 -0
  28. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/_config_helpers.py +0 -0
  29. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/alembic.ini +0 -0
  30. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/_utils.py +0 -0
  31. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/constants.py +0 -0
  32. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/database.py +0 -0
  33. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/dataset_registry/obs4ref_reference.txt +0 -0
  34. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/dataset_registry/sample_data.txt +0 -0
  35. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/datasets/__init__.py +0 -0
  36. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/datasets/base.py +0 -0
  37. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/datasets/cmip6.py +0 -0
  38. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/datasets/obs4mips.py +0 -0
  39. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/datasets/pmp_climatology.py +0 -0
  40. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/datasets/utils.py +0 -0
  41. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/executor/result_handling.py +0 -0
  42. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/executor/synchronous.py +0 -0
  43. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/migrations/README +0 -0
  44. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/migrations/env.py +0 -0
  45. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/migrations/script.py.mako +0 -0
  46. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +0 -0
  47. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/migrations/versions/2025-05-09T2032_03dbb4998e49_series_metric_value.py +0 -0
  48. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/models/__init__.py +0 -0
  49. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/models/base.py +0 -0
  50. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/models/dataset.py +0 -0
  51. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/models/diagnostic.py +0 -0
  52. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/models/execution.py +0 -0
  53. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/models/metric_value.py +0 -0
  54. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/models/provider.py +0 -0
  55. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/provider_registry.py +0 -0
  56. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/py.typed +0 -0
  57. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/solver.py +0 -0
  58. {climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/testing.py +0 -0
  59. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/cli/test_executions/test_inspect.txt +0 -0
  60. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/cli/test_providers.py +0 -0
  61. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/cli/test_solve.py +0 -0
  62. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/conftest.py +0 -0
  63. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_cmip6/cmip6_catalog_db.yml +0 -0
  64. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_cmip6/cmip6_catalog_local.yml +0 -0
  65. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_cmip6.py +0 -0
  66. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_datasets.py +0 -0
  67. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_db.yml +0 -0
  68. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_local.yml +0 -0
  69. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_obs4mips.py +0 -0
  70. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_pmp_climatology/pmp_catalog_local.yml +0 -0
  71. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_pmp_climatology.py +0 -0
  72. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/datasets/test_utils.py +0 -0
  73. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/executor/test_local_executor.py +0 -0
  74. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/executor/test_result_handling.py +0 -0
  75. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/executor/test_synchronous_executor.py +0 -0
  76. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/models/test_metric_execution.py +0 -0
  77. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/models/test_metric_value.py +0 -0
  78. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/test_database.py +0 -0
  79. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/test_provider_registry.py +0 -0
  80. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/test_solver/test_solve_metrics.yml +0 -0
  81. {climate_ref-0.6.0 → climate_ref-0.6.2}/tests/unit/test_solver.py +0 -0
{climate_ref-0.6.0 → climate_ref-0.6.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climate-ref
-Version: 0.6.0
+Version: 0.6.2
 Summary: Application which runs the CMIP Rapid Evaluation Framework
 Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
 License-Expression: Apache-2.0
@@ -10,7 +10,8 @@ Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
@@ -25,6 +26,7 @@ Requires-Dist: climate-ref-core
 Requires-Dist: ecgtools>=2024.7.31
 Requires-Dist: environs>=11.0.0
 Requires-Dist: loguru>=0.7.2
+Requires-Dist: parsl>=2025.5.19; sys_platform != 'win32'
 Requires-Dist: platformdirs>=4.3.6
 Requires-Dist: sqlalchemy>=2.0.36
 Requires-Dist: tomlkit>=0.13.2
{climate_ref-0.6.0 → climate_ref-0.6.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "climate-ref"
-version = "0.6.0"
+version = "0.6.2"
 description = "Application which runs the CMIP Rapid Evaluation Framework"
 readme = "README.md"
 authors = [
@@ -16,7 +16,6 @@ license = "Apache-2.0"
 requires-python = ">=3.11"
 classifiers = [
     "Development Status :: 3 - Alpha",
-    "Operating System :: OS Independent",
     "Intended Audience :: Developers",
     "Intended Audience :: Science/Research",
     "Programming Language :: Python",
@@ -26,6 +25,8 @@ classifiers = [
     "Programming Language :: Python :: 3.13",
     "Topic :: Scientific/Engineering",
     "License :: OSI Approved :: Apache Software License",
+    "Operating System :: MacOS :: MacOS X",
+    "Operating System :: POSIX :: Linux",
 ]
 dependencies = [
     "climate-ref-core",
@@ -39,7 +40,10 @@ dependencies = [
     "loguru>=0.7.2",
     "ecgtools>=2024.7.31",
     "platformdirs>=4.3.6",
-    "tqdm>=4.67.1"
+    "tqdm>=4.67.1",
+    # parsl doesn't support Windows yet
+    # We don't target Windows either, but this __might__ allow Windows users to install the package
+    'parsl>=2025.5.19; sys_platform != "win32"'
 ]

 [project.optional-dependencies]
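
The `sys_platform != "win32"` marker keeps parsl out of Windows installs while leaving the rest of the package installable there. A minimal sketch of how installers evaluate such a marker, assuming the third-party `packaging` library is available:

    from packaging.markers import Marker

    # The same marker string used in the dependency specification above
    marker = Marker('sys_platform != "win32"')

    # True on Linux/macOS, False on Windows, so installers only pull in
    # parsl on non-Windows platforms
    print(marker.evaluate())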
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/__init__.py
@@ -88,7 +88,7 @@ def build_app() -> typer.Typer:
     :
         The CLI app
     """
-    app = typer.Typer(name="climate_ref", no_args_is_help=True)
+    app = typer.Typer(name="ref", no_args_is_help=True)

     app.command(name="solve")(solve.solve)
     app.add_typer(config.app, name="config")
@@ -136,10 +136,10 @@ def main(  # noqa: PLR0913
     ] = None,
 ) -> None:
     """
-    climate_ref: A CLI for the Assessment Fast Track Rapid Evaluation Framework
+    A CLI for the Assessment Fast Track Rapid Evaluation Framework

     This CLI provides a number of commands for managing and executing diagnostics.
-    """
+    """  # noqa: D401
     if quiet:
         log_level = LogLevel.Warning
     if verbose:
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/config.py
@@ -20,9 +20,9 @@ def list_(ctx: typer.Context) -> None:
     print(config.dumps(defaults=True))


-@app.command()
-def update() -> None:
-    """
-    Update a configuration value
-    """
-    print("config")
+# @app.command()
+# def update() -> None:
+#     """
+#     Update a configuration value
+#     """
+#     print("config")
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/datasets.py
@@ -1,5 +1,9 @@
 """
 View and ingest input datasets
+
+The metadata from these datasets are stored in the database so that they can be used to determine
+which executions are required for a given diagnostic without having to re-parse the datasets.
+
 """

 import errno
@@ -105,9 +109,12 @@ def ingest(  # noqa: PLR0913
     ] = False,
 ) -> None:
     """
-    Ingest a dataset
+    Ingest a directory of datasets into the database
+
+    Each dataset will be loaded and validated using the specified dataset adapter.
+    This will extract metadata from the datasets and store it in the database.

-    This will register a dataset in the database to be used for diagnostics calculations.
+    A table of the datasets will be printed to the console at the end of the operation.
     """
     config = ctx.obj.config
     db = ctx.obj.database
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/executions.py
@@ -1,5 +1,5 @@
 """
-View diagnostic executions
+View execution groups and their results
 """

 import pathlib
@@ -29,11 +29,19 @@ console = Console()
 @app.command()
 def list_groups(
     ctx: typer.Context,
-    column: Annotated[list[str] | None, typer.Option()] = None,
+    column: Annotated[
+        list[str] | None,
+        typer.Option(help="Only include specified columns in the output"),
+    ] = None,
     limit: int = typer.Option(100, help="Limit the number of rows to display"),
 ) -> None:
     """
     List the diagnostic execution groups that have been identified
+
+    The data catalog is sorted by the date that the execution group was created (first = newest).
+    If the `--column` option is provided, only the specified columns will be displayed.
+
+    The output will be in a tabular format.
     """
     session = ctx.obj.database.session

@@ -178,6 +186,8 @@ def _log_panel(result_directory: pathlib.Path) -> Panel | None:
 def inspect(ctx: typer.Context, execution_id: int) -> None:
     """
     Inspect a specific execution group by its ID
+
+    This will display the execution details, datasets, results directory, and logs if available.
     """
     config: Config = ctx.obj.config
     session = ctx.obj.database.session
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/providers.py
@@ -56,7 +56,10 @@ def create_env(
     ] = None,
 ) -> None:
     """
-    Create a virtual environment containing the provider software.
+    Create a conda environment containing the provider software.
+
+    If no provider is specified, all providers will be installed.
+    If the provider is up to date or does not use a virtual environment, it will be skipped.
     """
     config = ctx.obj.config
     db = ctx.obj.database
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/cli/solve.py
@@ -49,6 +49,10 @@ def solve(  # noqa: PLR0913

     This may trigger a number of additional calculations depending on what data has been ingested
     since the last solve.
+    This command will block until all executions have been solved or the timeout is reached.
+
+    Filters can be applied to limit the diagnostics and providers that are considered, see the options
+    `--diagnostic` and `--provider` for more information.
     """
     config = ctx.obj.config
     db = ctx.obj.database
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/config.py
@@ -15,6 +15,7 @@ which always take precedence over any other configuration values.
 # https://github.com/ESGF/esgf-download/blob/main/esgpull/config.py

 import importlib.resources
+import os
 from pathlib import Path
 from typing import TYPE_CHECKING, Any

@@ -64,6 +65,7 @@ def ensure_absolute_path(path: str | Path) -> Path:
     """
     if isinstance(path, str):
         path = Path(path)
+    path = Path(*[os.path.expandvars(p) for p in path.parts])
     return path.resolve()

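With the change above, `ensure_absolute_path` expands environment variables in each path component before resolving. A small self-contained illustration of the updated behaviour (the `REF_DATA` variable is invented for the example):

    import os
    from pathlib import Path

    def ensure_absolute_path(path: str | Path) -> Path:
        # Mirrors the updated function: expand $VARS per component, then resolve
        if isinstance(path, str):
            path = Path(path)
        path = Path(*[os.path.expandvars(p) for p in path.parts])
        return path.resolve()

    os.environ["REF_DATA"] = "/data/ref"
    print(ensure_absolute_path("$REF_DATA/obs4mips"))  # -> /data/ref/obs4mips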
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/executor/__init__.py
@@ -9,8 +9,9 @@ The simplest executor is the `LocalExecutor`, which runs the diagnostic in the s
 This is useful for local testing and debugging.
 """

+from .hpc import HPCExecutor
 from .local import LocalExecutor
 from .result_handling import handle_execution_result
 from .synchronous import SynchronousExecutor

-__all__ = ["LocalExecutor", "SynchronousExecutor", "handle_execution_result"]
+__all__ = ["HPCExecutor", "LocalExecutor", "SynchronousExecutor", "handle_execution_result"]
climate_ref-0.6.2/src/climate_ref/executor/hpc.py (new file)
@@ -0,0 +1,320 @@
+"""
+HPC-based Executor to use job schedulers.
+
+If you want to
+- run REF under the HPC workflows
+- run REF in multiple nodes
+
+"""
+
+try:
+    import parsl
+except ImportError:  # pragma: no cover
+    raise ImportError("The HPCExecutor requires the `parsl` package")
+
+import os
+import time
+from typing import Any
+
+import parsl
+from loguru import logger
+from parsl import python_app
+from parsl.config import Config as ParslConfig
+from parsl.executors import HighThroughputExecutor
+from parsl.launchers import SrunLauncher
+from parsl.providers import SlurmProvider
+from tqdm import tqdm
+
+from climate_ref.config import Config
+from climate_ref.database import Database
+from climate_ref.models import Execution
+from climate_ref.slurm import HAS_REAL_SLURM, SlurmChecker
+from climate_ref_core.diagnostics import ExecutionDefinition, ExecutionResult
+from climate_ref_core.exceptions import DiagnosticError, ExecutionError
+from climate_ref_core.executor import execute_locally
+
+from .local import ExecutionFuture, process_result
+
+
+@python_app
+def _process_run(definition: ExecutionDefinition, log_level: str) -> ExecutionResult:
+    """Run the function on computer nodes"""
+    # This is a catch-all for any exceptions that occur in the process and need to raise for
+    # parsl retries to work
+    try:
+        return execute_locally(definition=definition, log_level=log_level, raise_error=True)
+    except DiagnosticError as e:  # pragma: no cover
+        # any diagnostic error will be caught here
+        logger.exception("Error running diagnostic")
+        raise e
+
+
+def _to_float(x: Any) -> float | None:
+    if x is None:
+        return None
+    if isinstance(x, int | float):
+        return float(x)
+    try:
+        return float(x)
+    except (ValueError, TypeError):
+        return None
+
+
+def _to_int(x: Any) -> int | None:
+    if x is None:
+        return None
+    if isinstance(x, int):
+        return x
+    try:
+        return int(float(x))  # Handles both "123" and "123.0"
+    except (ValueError, TypeError):
+        return None
+
+
+class HPCExecutor:
+    """
+    Run diagnostics by submitting a job script
+
+    """
+
+    name = "hpc"
+
+    def __init__(
+        self,
+        *,
+        database: Database | None = None,
+        config: Config | None = None,
+        **executor_config: str | float | int,
+    ) -> None:
+        config = config or Config.default()
+        database = database or Database.from_config(config, run_migrations=False)
+
+        self.config = config
+        self.database = database
+
+        self.scheduler = executor_config.get("scheduler", "slurm")
+        self.account = str(executor_config.get("account", os.environ.get("USER")))
+        self.username = executor_config.get("username", os.environ.get("USER"))
+        self.partition = str(executor_config.get("partition")) if executor_config.get("partition") else None
+        self.qos = str(executor_config.get("qos")) if executor_config.get("qos") else None
+        self.req_nodes = int(executor_config.get("req_nodes", 1))
+        self.walltime = str(executor_config.get("walltime", "00:10:00"))
+        self.log_dir = str(executor_config.get("log_dir", "runinfo"))
+
+        self.cores_per_worker = _to_int(executor_config.get("cores_per_worker"))
+        self.mem_per_worker = _to_float(executor_config.get("mem_per_worker"))
+
+        hours, minutes, seconds = map(int, self.walltime.split(":"))
+        total_minutes = hours * 60 + minutes + seconds / 60
+        self.total_minutes = total_minutes
+
+        if executor_config.get("validation") and HAS_REAL_SLURM:
+            self._validate_slurm_params()
+
+        self._initialize_parsl()
+
+        self.parsl_results: list[ExecutionFuture] = []
+
+    def _validate_slurm_params(self) -> None:
+        """Validate the Slurm configuration using SlurmChecker.
+
+        Raises
+        ------
+        ValueError: If account, partition or QOS are invalid or inaccessible.
+        """
+        slurm_checker = SlurmChecker()
+        if self.account and not slurm_checker.get_account_info(self.account):
+            raise ValueError(f"Account: {self.account} not valid")
+
+        partition_limits = None
+        node_info = None
+
+        if self.partition:
+            if not slurm_checker.get_partition_info(self.partition):
+                raise ValueError(f"Partition: {self.partition} not valid")
+
+            if not slurm_checker.can_account_use_partition(self.account, self.partition):
+                raise ValueError(f"Account: {self.account} cannot access partiton: {self.partition}")
+
+            partition_limits = slurm_checker.get_partition_limits(self.partition)
+            node_info = slurm_checker.get_node_from_partition(self.partition)
+
+        qos_limits = None
+        if self.qos:
+            if not slurm_checker.get_qos_info(self.qos):
+                raise ValueError(f"QOS: {self.qos} not valid")
+
+            if not slurm_checker.can_account_use_qos(self.account, self.qos):
+                raise ValueError(f"Account: {self.account} cannot access qos: {self.qos}")
+
+            qos_limits = slurm_checker.get_qos_limits(self.qos)
+
+        max_cores_per_node = int(node_info["cpus"]) if node_info else None
+        if max_cores_per_node and self.cores_per_worker:
+            if self.cores_per_worker > max_cores_per_node:
+                raise ValueError(
+                    f"cores_per_work:{self.cores_per_worker}"
+                    f"larger than the maximum in a node {max_cores_per_node}"
+                )
+
+        max_mem_per_node = float(node_info["real_memory"]) if node_info else None
+        if max_mem_per_node and self.mem_per_worker:
+            if self.mem_per_worker > max_mem_per_node:
+                raise ValueError(
+                    f"mem_per_work:{self.mem_per_worker}"
+                    f"larger than the maximum mem in a node {max_mem_per_node}"
+                )
+
+        max_walltime_partition = (
+            partition_limits["max_time_minutes"] if partition_limits else self.total_minutes
+        )
+        max_walltime_qos = qos_limits["max_time_minutes"] if qos_limits else self.total_minutes
+
+        max_walltime_minutes = min(float(max_walltime_partition), float(max_walltime_qos))
+
+        if self.total_minutes > float(max_walltime_minutes):
+            raise ValueError(
+                f"Walltime: {self.walltime} exceed the maximum time "
+                f"{max_walltime_minutes} allowed by {self.partition} and {self.qos}"
+            )
+
+    def _initialize_parsl(self) -> None:
+        executor_config = self.config.executor.config
+
+        provider = SlurmProvider(
+            account=self.account,
+            partition=self.partition,
+            qos=self.qos,
+            nodes_per_block=self.req_nodes,
+            max_blocks=int(executor_config.get("max_blocks", 1)),
+            scheduler_options=executor_config.get("scheduler_options", "#SBATCH -C cpu"),
+            worker_init=executor_config.get("worker_init", "source .venv/bin/activate"),
+            launcher=SrunLauncher(
+                debug=True,
+                overrides=executor_config.get("overrides", ""),
+            ),
+            walltime=self.walltime,
+            cmd_timeout=int(executor_config.get("cmd_timeout", 120)),
+        )
+        executor = HighThroughputExecutor(
+            label="ref_hpc_executor",
+            cores_per_worker=self.cores_per_worker if self.cores_per_worker else 1,
+            mem_per_worker=self.mem_per_worker,
+            max_workers_per_node=_to_int(executor_config.get("max_workers_per_node", 16)),
+            cpu_affinity=str(executor_config.get("cpu_affinity")),
+            provider=provider,
+        )
+
+        hpc_config = ParslConfig(
+            run_dir=self.log_dir, executors=[executor], retries=int(executor_config.get("retries", 2))
+        )
+        parsl.load(hpc_config)
+
+    def run(
+        self,
+        definition: ExecutionDefinition,
+        execution: Execution | None = None,
+    ) -> None:
+        """
+        Run a diagnostic in process
+
+        Parameters
+        ----------
+        definition
+            A description of the information needed for this execution of the diagnostic
+        execution
+            A database model representing the execution of the diagnostic.
+            If provided, the result will be updated in the database when completed.
+        """
+        # Submit the execution to the process pool
+        # and track the future so we can wait for it to complete
+        future = _process_run(
+            definition=definition,
+            log_level=self.config.log_level,
+        )
+
+        self.parsl_results.append(
+            ExecutionFuture(
+                future=future,
+                definition=definition,
+                execution_id=execution.id if execution else None,
+            )
+        )
+
+    def join(self, timeout: float) -> None:
+        """
+        Wait for all diagnostics to finish
+
+        This will block until all diagnostics have completed or the timeout is reached.
+        If the timeout is reached, the method will return and raise an exception.
+
+        Parameters
+        ----------
+        timeout
+            Timeout in seconds (won't used in HPCExecutor)
+
+        Raises
+        ------
+        TimeoutError
+            If the timeout is reached
+        """
+        start_time = time.time()
+        refresh_time = 0.5
+
+        results = self.parsl_results
+        t = tqdm(total=len(results), desc="Waiting for executions to complete", unit="execution")
+
+        try:
+            while results:
+                # Iterate over a copy of the list and remove finished tasks
+                for result in results[:]:
+                    if result.future.done():
+                        # Cannot catch the execption raised by result.future.result
+                        if result.future.exception() is None:
+                            try:
+                                execution_result = result.future.result(timeout=0)
+                            except Exception as e:
+                                # Something went wrong when attempting to run the execution
+                                # This is likely a failure in the execution itself not the diagnostic
+                                raise ExecutionError(
+                                    f"Failed to execute {result.definition.execution_slug()!r}"
+                                ) from e
+                        else:
+                            err = result.future.exception()
+                            if isinstance(err, DiagnosticError):
+                                execution_result = err.result
+                            else:
+                                execution_result = None
+
+                        assert execution_result is not None, "Execution result should not be None"
+                        assert isinstance(execution_result, ExecutionResult), (
+                            "Execution result should be of type ExecutionResult"
+                        )
+                        # Process the result in the main process
+                        # The results should be committed after each execution
+                        with self.database.session.begin():
+                            execution = (
+                                self.database.session.get(Execution, result.execution_id)
+                                if result.execution_id
+                                else None
+                            )
+                            process_result(self.config, self.database, execution_result, execution)
+                        logger.debug(f"Execution completed: {result}")
+                        t.update(n=1)
+                        results.remove(result)
+
+                # Break early to avoid waiting for one more sleep cycle
+                if len(results) == 0:
+                    break
+
+                elapsed_time = time.time() - start_time
+
+                if elapsed_time > self.total_minutes * 60:
+                    logger.debug(f"Time elasped {elapsed_time} for joining the results")
+
+                # Wait for a short time before checking for completed executions
+                time.sleep(refresh_time)
+        finally:
+            t.close()
+            if parsl.dfk():
+                parsl.dfk().cleanup()
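A hypothetical usage sketch for the new executor: the keyword arguments mirror the `executor_config` keys read in `__init__` above, and the account/partition/QOS values are placeholders for site-specific settings, not real defaults.

    from climate_ref.executor import HPCExecutor

    # Instantiating the executor loads a parsl config backed by a SlurmProvider
    executor = HPCExecutor(
        account="m1234",        # placeholder Slurm account
        partition="cpu",        # placeholder partition name
        qos="regular",          # placeholder QOS
        req_nodes=2,
        walltime="01:30:00",
        cores_per_worker=4,
        validation=True,        # check account/partition/qos when Slurm is present
    )

    # Executions are submitted with run(...) and awaited with join(timeout),
    # matching the LocalExecutor interface.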
{climate_ref-0.6.0 → climate_ref-0.6.2}/src/climate_ref/executor/local.py
@@ -1,4 +1,5 @@
 import concurrent.futures
+import multiprocessing
 import time
 from concurrent.futures import Future, ProcessPoolExecutor
 from typing import Any
@@ -124,7 +125,12 @@ class LocalExecutor:
         if pool is not None:
             self.pool = pool
         else:
-            self.pool = ProcessPoolExecutor(max_workers=n, initializer=_process_initialiser)
+            self.pool = ProcessPoolExecutor(
+                max_workers=n,
+                initializer=_process_initialiser,
+                # Explicitly set the context to "spawn" to avoid issues with hanging on MacOS
+                mp_context=multiprocessing.get_context("spawn"),
+            )
         self._results: list[ExecutionFuture] = []

     def run(
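
A minimal, self-contained illustration of the fix above: forcing the "spawn" start method when building a ProcessPoolExecutor, which avoids the fork-related hangs seen on macOS.

    import multiprocessing
    from concurrent.futures import ProcessPoolExecutor

    def square(x: int) -> int:
        return x * x

    if __name__ == "__main__":
        # "spawn" starts fresh interpreter processes instead of forking
        with ProcessPoolExecutor(
            max_workers=2,
            mp_context=multiprocessing.get_context("spawn"),
        ) as pool:
            print(list(pool.map(square, range(4))))  # [0, 1, 4, 9]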