climate-ref 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {climate_ref-0.6.0 → climate_ref-0.6.1}/PKG-INFO +2 -1
- {climate_ref-0.6.0 → climate_ref-0.6.1}/pyproject.toml +3 -2
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/executor/__init__.py +2 -1
- climate_ref-0.6.1/src/climate_ref/executor/hpc.py +308 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/executor/local.py +7 -1
- climate_ref-0.6.1/src/climate_ref/slurm.py +192 -0
- climate_ref-0.6.1/tests/unit/executor/test_hpc_executor.py +85 -0
- climate_ref-0.6.1/tests/unit/test_slurm.py +359 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/.gitignore +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/Dockerfile +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/LICENCE +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/NOTICE +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/README.md +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/conftest.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/__init__.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/_config_helpers.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/alembic.ini +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/cli/__init__.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/cli/_utils.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/cli/config.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/cli/datasets.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/cli/executions.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/cli/providers.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/cli/solve.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/config.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/constants.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/database.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/dataset_registry/obs4ref_reference.txt +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/dataset_registry/sample_data.txt +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/datasets/__init__.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/datasets/base.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/datasets/cmip6.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/datasets/obs4mips.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/datasets/pmp_climatology.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/datasets/utils.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/executor/result_handling.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/executor/synchronous.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/migrations/README +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/migrations/env.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/migrations/script.py.mako +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/migrations/versions/2025-05-09T2032_03dbb4998e49_series_metric_value.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/models/__init__.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/models/base.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/models/dataset.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/models/diagnostic.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/models/execution.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/models/metric_value.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/models/provider.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/provider_registry.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/py.typed +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/solver.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/testing.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/cli/test_config.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/cli/test_datasets.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/cli/test_executions/test_inspect.txt +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/cli/test_executions.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/cli/test_providers.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/cli/test_root.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/cli/test_solve.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/conftest.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_db.yml +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_local.yml +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_cmip6.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_datasets.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_db.yml +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_local.yml +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_obs4mips.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_pmp_climatology/pmp_catalog_local.yml +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_pmp_climatology.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_utils.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/executor/test_local_executor.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/executor/test_result_handling.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/executor/test_synchronous_executor.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/models/test_metric_execution.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/models/test_metric_value.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/test_config.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/test_database.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/test_provider_registry.py +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/test_solver/test_solve_metrics.yml +0 -0
- {climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/test_solver.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: climate-ref
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Application which runs the CMIP Rapid Evaluation Framework
|
|
5
5
|
Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -25,6 +25,7 @@ Requires-Dist: climate-ref-core
|
|
|
25
25
|
Requires-Dist: ecgtools>=2024.7.31
|
|
26
26
|
Requires-Dist: environs>=11.0.0
|
|
27
27
|
Requires-Dist: loguru>=0.7.2
|
|
28
|
+
Requires-Dist: parsl>=2025.5.19
|
|
28
29
|
Requires-Dist: platformdirs>=4.3.6
|
|
29
30
|
Requires-Dist: sqlalchemy>=2.0.36
|
|
30
31
|
Requires-Dist: tomlkit>=0.13.2
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "climate-ref"
|
|
3
|
-
version = "0.6.0"
|
|
3
|
+
version = "0.6.1"
|
|
4
4
|
description = "Application which runs the CMIP Rapid Evaluation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -39,7 +39,8 @@ dependencies = [
|
|
|
39
39
|
"loguru>=0.7.2",
|
|
40
40
|
"ecgtools>=2024.7.31",
|
|
41
41
|
"platformdirs>=4.3.6",
|
|
42
|
-
"tqdm>=4.67.1"
|
|
42
|
+
"tqdm>=4.67.1",
|
|
43
|
+
"parsl>=2025.5.19"
|
|
43
44
|
]
|
|
44
45
|
|
|
45
46
|
[project.optional-dependencies]
|
|
@@ -9,8 +9,9 @@ The simplest executor is the `LocalExecutor`, which runs the diagnostic in the s
|
|
|
9
9
|
This is useful for local testing and debugging.
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
+
from .hpc import HPCExecutor
|
|
12
13
|
from .local import LocalExecutor
|
|
13
14
|
from .result_handling import handle_execution_result
|
|
14
15
|
from .synchronous import SynchronousExecutor
|
|
15
16
|
|
|
16
|
-
__all__ = ["LocalExecutor", "SynchronousExecutor", "handle_execution_result"]
|
|
17
|
+
__all__ = ["HPCExecutor", "LocalExecutor", "SynchronousExecutor", "handle_execution_result"]
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HPC-based Executor to use job schedulers.
|
|
3
|
+
|
|
4
|
+
If you want to
|
|
5
|
+
- run REF under the HPC workflows
|
|
6
|
+
- run REF in multiple nodes
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
from typing import Any, cast
|
|
13
|
+
|
|
14
|
+
import parsl
|
|
15
|
+
from loguru import logger
|
|
16
|
+
from parsl import python_app
|
|
17
|
+
from parsl.config import Config as ParslConfig
|
|
18
|
+
from parsl.executors import HighThroughputExecutor
|
|
19
|
+
from parsl.launchers import SrunLauncher
|
|
20
|
+
from parsl.providers import SlurmProvider
|
|
21
|
+
from tqdm import tqdm
|
|
22
|
+
|
|
23
|
+
from climate_ref.config import Config
|
|
24
|
+
from climate_ref.database import Database
|
|
25
|
+
from climate_ref.models import Execution
|
|
26
|
+
from climate_ref.slurm import HAS_REAL_SLURM, SlurmChecker
|
|
27
|
+
from climate_ref_core.diagnostics import ExecutionDefinition, ExecutionResult
|
|
28
|
+
from climate_ref_core.exceptions import DiagnosticError, ExecutionError
|
|
29
|
+
from climate_ref_core.executor import execute_locally
|
|
30
|
+
|
|
31
|
+
from .local import ExecutionFuture, process_result
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@python_app
def _process_run(definition: ExecutionDefinition, log_level: str) -> ExecutionResult:
    """
    Execute a single diagnostic as a parsl python app.

    Parameters
    ----------
    definition
        Description of the execution to run
    log_level
        Log level forwarded to ``execute_locally``

    Returns
    -------
    :
        The result produced by ``execute_locally``, or the result attached to the
        ``DiagnosticError`` if the diagnostic itself failed.
    """
    # Only DiagnosticError is handled here; any other exception is left to
    # propagate so that parsl's retry mechanism can act on it.
    try:
        outcome = execute_locally(definition=definition, log_level=log_level, raise_error=True)
    except DiagnosticError as err:  # pragma: no cover
        # The diagnostic failed: log it and hand back the result carried by the error
        logger.exception("Error running diagnostic")
        outcome = cast(ExecutionResult, err.result)
    return outcome
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _to_float(x: Any) -> float | None:
|
|
48
|
+
if x is None:
|
|
49
|
+
return None
|
|
50
|
+
if isinstance(x, int | float):
|
|
51
|
+
return float(x)
|
|
52
|
+
try:
|
|
53
|
+
return float(x)
|
|
54
|
+
except (ValueError, TypeError):
|
|
55
|
+
return None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _to_int(x: Any) -> int | None:
|
|
59
|
+
if x is None:
|
|
60
|
+
return None
|
|
61
|
+
if isinstance(x, int):
|
|
62
|
+
return x
|
|
63
|
+
try:
|
|
64
|
+
return int(float(x)) # Handles both "123" and "123.0"
|
|
65
|
+
except (ValueError, TypeError):
|
|
66
|
+
return None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class HPCExecutor:
    """
    Run diagnostics by submitting them to a Slurm-managed cluster through parsl.

    Executions are submitted as parsl python apps via :meth:`run` and their
    results are collected and written to the database in :meth:`join`.
    """

    name = "hpc"

    def __init__(
        self,
        *,
        database: Database | None = None,
        config: Config | None = None,
        **executor_config: str | float | int,
    ) -> None:
        """
        Configure the executor and load the parsl configuration.

        Parameters
        ----------
        database
            Database used to store results. Created from ``config`` if not provided.
        config
            Application configuration. ``Config.default()`` is used if not provided.
        executor_config
            Scheduler options read here: ``scheduler``, ``account``, ``username``,
            ``partition``, ``qos``, ``req_nodes``, ``walltime`` (``HH:MM:SS``),
            ``log_dir``, ``cores_per_worker``, ``mem_per_worker`` and ``validation``.

        Raises
        ------
        ValueError
            If ``walltime`` is not formatted as ``HH:MM:SS`` or, when validation is
            requested and pyslurm is available, the Slurm settings are invalid.
        """
        config = config or Config.default()
        database = database or Database.from_config(config, run_migrations=False)

        self.config = config
        self.database = database

        self.scheduler = executor_config.get("scheduler", "slurm")
        self.account = str(executor_config.get("account", os.environ.get("USER")))
        self.username = executor_config.get("username", os.environ.get("USER"))
        self.partition = str(executor_config.get("partition")) if executor_config.get("partition") else None
        self.qos = str(executor_config.get("qos")) if executor_config.get("qos") else None
        self.req_nodes = int(executor_config.get("req_nodes", 1))
        self.walltime = str(executor_config.get("walltime", "00:10:00"))
        self.log_dir = str(executor_config.get("log_dir", "runinfo"))

        self.cores_per_worker = _to_int(executor_config.get("cores_per_worker"))
        self.mem_per_worker = _to_float(executor_config.get("mem_per_worker"))

        try:
            hours, minutes, seconds = map(int, self.walltime.split(":"))
        except ValueError as err:
            # Covers both a wrong number of fields and non-integer fields
            raise ValueError(f"walltime must be formatted as HH:MM:SS, got {self.walltime!r}") from err
        self.total_minutes = hours * 60 + minutes + seconds / 60

        # Validation needs pyslurm; it is skipped when pyslurm is not installed
        if executor_config.get("validation") and HAS_REAL_SLURM:
            self._validate_slurm_params()

        self._initialize_parsl()

        # Futures of the submitted executions that have not been processed yet
        self.parsl_results: list[ExecutionFuture] = []

    def _validate_slurm_params(self) -> None:
        """Validate the Slurm configuration using SlurmChecker.

        Checks that the account, partition and QOS exist and are accessible,
        and that the requested cores, memory and walltime fit within the limits
        reported for the partition and QOS.

        Raises
        ------
        ValueError: If account, partition or QOS are invalid or inaccessible,
            or a requested resource exceeds the reported limits.
        """
        slurm_checker = SlurmChecker()
        if self.account and not slurm_checker.get_account_info(self.account):
            raise ValueError(f"Account: {self.account} not valid")

        partition_limits = None
        node_info = None

        if self.partition:
            if not slurm_checker.get_partition_info(self.partition):
                raise ValueError(f"Partition: {self.partition} not valid")

            if not slurm_checker.can_account_use_partition(self.account, self.partition):
                raise ValueError(f"Account: {self.account} cannot access partition: {self.partition}")

            partition_limits = slurm_checker.get_partition_limits(self.partition)
            node_info = slurm_checker.get_node_from_partition(self.partition)

        qos_limits = None
        if self.qos:
            if not slurm_checker.get_qos_info(self.qos):
                raise ValueError(f"QOS: {self.qos} not valid")

            if not slurm_checker.can_account_use_qos(self.account, self.qos):
                raise ValueError(f"Account: {self.account} cannot access qos: {self.qos}")

            qos_limits = slurm_checker.get_qos_limits(self.qos)

        # Node limits are only known when a partition was given
        max_cores_per_node = int(node_info["cpus"]) if node_info else None
        if max_cores_per_node and self.cores_per_worker and self.cores_per_worker > max_cores_per_node:
            raise ValueError(
                f"cores_per_work:{self.cores_per_worker} "
                f"larger than the maximum in a node {max_cores_per_node}"
            )

        max_mem_per_node = float(node_info["real_memory"]) if node_info else None
        if max_mem_per_node and self.mem_per_worker and self.mem_per_worker > max_mem_per_node:
            raise ValueError(
                f"mem_per_work:{self.mem_per_worker} "
                f"larger than the maximum mem in a node {max_mem_per_node}"
            )

        # When a limit is unknown, fall back to the requested walltime so that
        # the comparison below cannot fail because of it
        max_walltime_partition = (
            partition_limits["max_time_minutes"] if partition_limits else self.total_minutes
        )
        max_walltime_qos = qos_limits["max_time_minutes"] if qos_limits else self.total_minutes

        max_walltime_minutes = min(float(max_walltime_partition), float(max_walltime_qos))

        if self.total_minutes > max_walltime_minutes:
            raise ValueError(
                f"Walltime: {self.walltime} exceed the maximum time "
                f"{max_walltime_minutes} allowed by {self.partition} and {self.qos}"
            )

    def _initialize_parsl(self) -> None:
        """Build the Slurm provider and high-throughput executor, then load them into parsl."""
        # NOTE(review): these options are read from the application config, whereas
        # __init__ reads the keyword arguments it was given -- confirm both carry the same values
        executor_config = self.config.executor.config

        provider = SlurmProvider(
            account=self.account,
            partition=self.partition,
            qos=self.qos,
            nodes_per_block=self.req_nodes,
            max_blocks=int(executor_config.get("max_blocks", 1)),
            scheduler_options=executor_config.get("scheduler_options", "#SBATCH -C cpu"),
            worker_init=executor_config.get("worker_init", "source .venv/bin/activate"),
            launcher=SrunLauncher(
                debug=True,
                overrides=executor_config.get("overrides", ""),
            ),
            walltime=self.walltime,
            cmd_timeout=int(executor_config.get("cmd_timeout", 120)),
        )
        executor = HighThroughputExecutor(
            label="ref_hpc_executor",
            cores_per_worker=self.cores_per_worker if self.cores_per_worker else 1,
            mem_per_worker=self.mem_per_worker,
            max_workers_per_node=_to_int(executor_config.get("max_workers_per_node", 16)),
            # Fall back to "none" explicitly; str(None) would pass the literal "None"
            cpu_affinity=str(executor_config.get("cpu_affinity") or "none"),
            provider=provider,
        )

        hpc_config = ParslConfig(
            run_dir=self.log_dir, executors=[executor], retries=int(executor_config.get("retries", 2))
        )
        parsl.load(hpc_config)

    def run(
        self,
        definition: ExecutionDefinition,
        execution: Execution | None = None,
    ) -> None:
        """
        Submit a diagnostic execution to parsl

        The execution is not awaited here; call :meth:`join` to collect the results.

        Parameters
        ----------
        definition
            A description of the information needed for this execution of the diagnostic
        execution
            A database model representing the execution of the diagnostic.
            If provided, the result will be updated in the database when completed.
        """
        # Submit the execution and track the future so we can wait for it to complete
        future = _process_run(
            definition=definition,
            log_level=self.config.log_level,
        )

        self.parsl_results.append(
            ExecutionFuture(
                future=future,
                definition=definition,
                execution_id=execution.id if execution else None,
            )
        )

    def join(self, timeout: float) -> None:
        """
        Wait for all diagnostics to finish

        This will block until every submitted execution has completed.
        Each result is processed and committed to the database as soon as it is available.
        Exceeding the configured walltime only produces a debug log message;
        waiting continues afterwards.
        The parsl DataFlowKernel is cleaned up when this method exits.

        Parameters
        ----------
        timeout
            Timeout in seconds (not used by HPCExecutor)

        Raises
        ------
        ExecutionError
            If retrieving the result of an execution fails
        """
        start_time = time.time()
        refresh_time = 0.5

        results = self.parsl_results
        t = tqdm(total=len(results), desc="Waiting for executions to complete", unit="execution")

        try:
            while results:
                # Iterate over a copy of the list and remove finished tasks
                for result in results[:]:
                    if result.future.done():
                        try:
                            execution_result = result.future.result(timeout=0)
                        except Exception as e:
                            # Something went wrong when attempting to run the execution
                            # This is likely a failure in the execution itself not the diagnostic
                            raise ExecutionError(
                                f"Failed to execute {result.definition.execution_slug()!r}"
                            ) from e

                        assert execution_result is not None, "Execution result should not be None"
                        assert isinstance(execution_result, ExecutionResult), (
                            "Execution result should be of type ExecutionResult"
                        )

                        # Process the result in the main process
                        # The results should be committed after each execution
                        with self.database.session.begin():
                            execution = (
                                self.database.session.get(Execution, result.execution_id)
                                if result.execution_id
                                else None
                            )
                            # Reuse the result that was validated above
                            process_result(self.config, self.database, execution_result, execution)
                        logger.debug(f"Execution completed: {result}")
                        t.update(n=1)
                        results.remove(result)

                # Break early to avoid waiting for one more sleep cycle
                if len(results) == 0:
                    break

                elapsed_time = time.time() - start_time

                if elapsed_time > self.total_minutes * 60:
                    logger.debug(f"Time elasped {elapsed_time} for joining the results")

                # Wait for a short time before checking for completed executions
                time.sleep(refresh_time)
        finally:
            t.close()
            if parsl.dfk():
                parsl.dfk().cleanup()
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import concurrent.futures
|
|
2
|
+
import multiprocessing
|
|
2
3
|
import time
|
|
3
4
|
from concurrent.futures import Future, ProcessPoolExecutor
|
|
4
5
|
from typing import Any
|
|
@@ -124,7 +125,12 @@ class LocalExecutor:
|
|
|
124
125
|
if pool is not None:
|
|
125
126
|
self.pool = pool
|
|
126
127
|
else:
|
|
127
|
-
self.pool = ProcessPoolExecutor(
|
|
128
|
+
self.pool = ProcessPoolExecutor(
|
|
129
|
+
max_workers=n,
|
|
130
|
+
initializer=_process_initialiser,
|
|
131
|
+
# Explicitly set the context to "spawn" to avoid issues with hanging on MacOS
|
|
132
|
+
mp_context=multiprocessing.get_context("spawn"),
|
|
133
|
+
)
|
|
128
134
|
self._results: list[ExecutionFuture] = []
|
|
129
135
|
|
|
130
136
|
def run(
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
HAS_REAL_SLURM = importlib.util.find_spec("pyslurm") is not None
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SlurmChecker:
|
|
8
|
+
"""Check and get slurm settings."""
|
|
9
|
+
|
|
10
|
+
def __init__(self, intest: bool = False) -> None:
|
|
11
|
+
if HAS_REAL_SLURM:
|
|
12
|
+
import pyslurm # type: ignore
|
|
13
|
+
|
|
14
|
+
self.slurm_association: dict[int, Any] | None = pyslurm.db.Associations.load()
|
|
15
|
+
self.slurm_partition: dict[str, Any] | None = pyslurm.Partitions.load()
|
|
16
|
+
self.slurm_qos: dict[str, Any] | None = pyslurm.qos().get()
|
|
17
|
+
self.slurm_node: dict[str, Any] | None = pyslurm.Nodes.load()
|
|
18
|
+
elif intest:
|
|
19
|
+
import pyslurm
|
|
20
|
+
|
|
21
|
+
self.slurm_association = pyslurm.db.Associations.load() # dict [num -> Association]
|
|
22
|
+
self.slurm_partition = pyslurm.Partitions.load() # collection
|
|
23
|
+
self.slurm_qos = pyslurm.qos().get() # dict
|
|
24
|
+
self.slurm_node = pyslurm.Nodes.load() # dict
|
|
25
|
+
else:
|
|
26
|
+
print("Warning: pyslurm not found. Skipping HPCExecutor config validations")
|
|
27
|
+
self.slurm_association = None
|
|
28
|
+
self.slurm_partition = None
|
|
29
|
+
self.slurm_qos = None
|
|
30
|
+
self.slurm_node = None
|
|
31
|
+
|
|
32
|
+
def get_partition_info(self, partition_name: str) -> Any:
|
|
33
|
+
"""Check if a partition exists in the Slurm configuration."""
|
|
34
|
+
return self.slurm_partition.get(partition_name) if self.slurm_partition else None
|
|
35
|
+
|
|
36
|
+
def get_qos_info(self, qos_name: str) -> Any:
|
|
37
|
+
"""Check if a qos exists in the Slurm configuration."""
|
|
38
|
+
return self.slurm_qos.get(qos_name) if self.slurm_qos else None
|
|
39
|
+
|
|
40
|
+
def get_account_info(self, account_name: str) -> list[Any]:
|
|
41
|
+
"""Get all associations for an account"""
|
|
42
|
+
if self.slurm_association:
|
|
43
|
+
return [a for a in self.slurm_association.values() if a.account == account_name]
|
|
44
|
+
else:
|
|
45
|
+
return [None]
|
|
46
|
+
|
|
47
|
+
def can_account_use_partition(self, account_name: str, partition_name: str) -> bool:
|
|
48
|
+
"""
|
|
49
|
+
Check if an account has access to a specific partition.
|
|
50
|
+
|
|
51
|
+
Returns
|
|
52
|
+
-------
|
|
53
|
+
bool: True if accessible, False if not accessible or error occurred
|
|
54
|
+
"""
|
|
55
|
+
account_info = self.get_account_info(account_name)
|
|
56
|
+
if not account_info:
|
|
57
|
+
return False
|
|
58
|
+
|
|
59
|
+
partition_info = self.get_partition_info(partition_name)
|
|
60
|
+
|
|
61
|
+
if not partition_info:
|
|
62
|
+
return False
|
|
63
|
+
|
|
64
|
+
allowed_partitions = account_info[0].partition
|
|
65
|
+
if allowed_partitions is None:
|
|
66
|
+
return True
|
|
67
|
+
else:
|
|
68
|
+
return partition_name in allowed_partitions
|
|
69
|
+
|
|
70
|
+
def can_account_use_qos(self, account_name: str, qos_name: str) -> bool:
|
|
71
|
+
"""
|
|
72
|
+
Check if an account has access to a specific qos.
|
|
73
|
+
|
|
74
|
+
Returns
|
|
75
|
+
-------
|
|
76
|
+
bool: True if accessible, False if not accessible or error occurred
|
|
77
|
+
"""
|
|
78
|
+
account_info = self.get_account_info(account_name)
|
|
79
|
+
|
|
80
|
+
if not account_info:
|
|
81
|
+
return False
|
|
82
|
+
|
|
83
|
+
qos_info = self.get_qos_info(qos_name)
|
|
84
|
+
if not qos_info:
|
|
85
|
+
return False
|
|
86
|
+
|
|
87
|
+
sample_acc = account_info[0]
|
|
88
|
+
for acc in account_info:
|
|
89
|
+
if acc.user == "minxu":
|
|
90
|
+
sample_acc = acc
|
|
91
|
+
break
|
|
92
|
+
|
|
93
|
+
allowed_qoss = sample_acc.qos
|
|
94
|
+
if allowed_qoss is None:
|
|
95
|
+
return True
|
|
96
|
+
else:
|
|
97
|
+
return qos_name in allowed_qoss
|
|
98
|
+
|
|
99
|
+
def get_partition_limits(self, partition_name: str) -> dict[str, str | int] | None:
|
|
100
|
+
"""
|
|
101
|
+
Get time limits for a specific partition.
|
|
102
|
+
|
|
103
|
+
Returns
|
|
104
|
+
-------
|
|
105
|
+
Dict with 'max_time' and 'default_time' (strings or UNLIMITED)
|
|
106
|
+
or None if partition doesn't exist or error occurred
|
|
107
|
+
"""
|
|
108
|
+
partition_info = self.get_partition_info(partition_name)
|
|
109
|
+
if not partition_info:
|
|
110
|
+
return None
|
|
111
|
+
|
|
112
|
+
return {
|
|
113
|
+
"max_time_minutes": partition_info.to_dict().get("max_time", 0), # in minutes
|
|
114
|
+
"default_time_minutes": partition_info.to_dict().get("default_time", 30), # in minutes
|
|
115
|
+
"max_nodes": partition_info.to_dict().get("max_node", 1),
|
|
116
|
+
"total_nodes": partition_info.to_dict().get("total_nodes", 0),
|
|
117
|
+
"total_cpus": partition_info.to_dict().get("total_cpus", 0),
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
def get_node_from_partition(self, partition_name: str) -> dict[str, str | int] | None:
|
|
121
|
+
"""
|
|
122
|
+
Get the node information for a specific partition.
|
|
123
|
+
|
|
124
|
+
Returns
|
|
125
|
+
-------
|
|
126
|
+
Dicts
|
|
127
|
+
"""
|
|
128
|
+
partition_info = self.get_partition_info(partition_name)
|
|
129
|
+
if not partition_info:
|
|
130
|
+
return None
|
|
131
|
+
|
|
132
|
+
sample_node = None
|
|
133
|
+
|
|
134
|
+
if self.slurm_node:
|
|
135
|
+
for node in self.slurm_node.values():
|
|
136
|
+
if partition_name in node.partitions and "cpu" in node.available_features:
|
|
137
|
+
sample_node = node
|
|
138
|
+
break
|
|
139
|
+
|
|
140
|
+
return {
|
|
141
|
+
"cpus": int(sample_node.total_cpus) if sample_node is not None else 1,
|
|
142
|
+
"cores_per_socket": int(sample_node.cores_per_socket) if sample_node is not None else 1,
|
|
143
|
+
"sockets": int(sample_node.sockets) if sample_node is not None else 1,
|
|
144
|
+
"threads_per_core": int(sample_node.threads_per_core) if sample_node is not None else 1,
|
|
145
|
+
"real_memory": int(sample_node.real_memory) if sample_node is not None else 215,
|
|
146
|
+
"node_names": sample_node.name if sample_node is not None else "unknown",
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
def get_qos_limits(self, qos_name: str) -> dict[str, str | int]:
|
|
150
|
+
"""
|
|
151
|
+
Get time limits for a specific qos.
|
|
152
|
+
|
|
153
|
+
Returns
|
|
154
|
+
-------
|
|
155
|
+
Dict with 'max_time' and 'default_time' (strings or UNLIMITED)
|
|
156
|
+
or None if partition doesn't exist or error occurred
|
|
157
|
+
"""
|
|
158
|
+
qos_info = self.get_qos_info(qos_name)
|
|
159
|
+
|
|
160
|
+
return {
|
|
161
|
+
"max_time_minutes": qos_info.get("max_wall_pj", 1.0e6),
|
|
162
|
+
"max_jobs_pu": qos_info.get("max_jobs_pu", 1.0e6),
|
|
163
|
+
"max_submit_jobs_pu": qos_info.get("max_submit_jobs_pu", 1.0e6),
|
|
164
|
+
"max_tres_pj": qos_info.get("max_tres_pj").split("=")[0],
|
|
165
|
+
"default_time_minutes": 120,
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
def check_account_partition_access_with_limits(
|
|
169
|
+
self, account_name: str, partition_name: str
|
|
170
|
+
) -> dict[str, Any]:
|
|
171
|
+
"""
|
|
172
|
+
Comprehensive check of account access and partition limits.
|
|
173
|
+
|
|
174
|
+
Returns dictionary with all relevant information.
|
|
175
|
+
"""
|
|
176
|
+
result = {
|
|
177
|
+
"account_exists": True if self.get_account_info(account_name) else False,
|
|
178
|
+
"partition_exists": True if self.get_partition_info(partition_name) else False,
|
|
179
|
+
"has_access": False,
|
|
180
|
+
"time_limits": None,
|
|
181
|
+
"error": "none",
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
try:
|
|
185
|
+
if result["account_exists"] and result["partition_exists"]:
|
|
186
|
+
result["has_access"] = self.can_account_use_partition(account_name, partition_name)
|
|
187
|
+
if result["has_access"]:
|
|
188
|
+
result["time_limits"] = self.get_partition_info(partition_name).to_dict().get("max_time")
|
|
189
|
+
except Exception as e:
|
|
190
|
+
result["error"] = str(e)
|
|
191
|
+
|
|
192
|
+
return result
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from unittest.mock import MagicMock, patch
|
|
3
|
+
|
|
4
|
+
import parsl
|
|
5
|
+
import pytest
|
|
6
|
+
from parsl.dataflow import futures
|
|
7
|
+
|
|
8
|
+
from climate_ref.executor.hpc import HPCExecutor, execute_locally
|
|
9
|
+
from climate_ref.executor.local import ExecutionFuture
|
|
10
|
+
from climate_ref_core.diagnostics import ExecutionResult
|
|
11
|
+
from climate_ref_core.exceptions import DiagnosticError, ExecutionError
|
|
12
|
+
from climate_ref_core.executor import Executor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_execute_locally_failed(definition_factory, mock_diagnostic):
    """A failing diagnostic raises DiagnosticError when ``raise_error=True``."""
    mock_diagnostic.run = lambda definition: 1 / 0

    # execution raises an exception, so there is no return value to inspect
    with pytest.raises(DiagnosticError):
        execute_locally(
            definition_factory(diagnostic=mock_diagnostic),
            log_level="DEBUG",
            raise_error=True,
        )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TestHPCExecutor:
|
|
30
|
+
def test_is_executor(self, tmp_path):
    """HPCExecutor reports the name "hpc" and is recognised as an Executor."""
    executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")

    try:
        assert executor.name == "hpc"
        assert isinstance(executor, Executor)
    finally:
        # Clean up the parsl DataFlowKernel loaded by the constructor,
        # even when an assertion fails
        parsl.dfk().cleanup()
|
|
36
|
+
|
|
37
|
+
def test_run_metric(self, metric_definition, provider, mock_diagnostic, mocker, caplog, tmp_path):
|
|
38
|
+
with patch.object(HPCExecutor, "run", autospec=True) as mock_run:
|
|
39
|
+
# Configure the mock to behave similarly to the original
|
|
40
|
+
mock_run.side_effect = lambda self, definition, execution=None: (
|
|
41
|
+
self.parsl_results.append(
|
|
42
|
+
ExecutionFuture(
|
|
43
|
+
future=MagicMock(), # Mock the future object
|
|
44
|
+
definition=definition,
|
|
45
|
+
execution_id=execution.id if execution else None,
|
|
46
|
+
)
|
|
47
|
+
)
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
|
|
51
|
+
|
|
52
|
+
# shall have the SerializationError, but not raised
|
|
53
|
+
executor.run(metric_definition, None)
|
|
54
|
+
assert len(executor.parsl_results) == 1
|
|
55
|
+
assert executor.parsl_results[0].definition == metric_definition
|
|
56
|
+
assert executor.parsl_results[0].execution_id is None
|
|
57
|
+
|
|
58
|
+
parsl.dfk().cleanup()
|
|
59
|
+
|
|
60
|
+
def test_join(self, metric_definition, tmp_path):
|
|
61
|
+
executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
|
|
62
|
+
future = futures.AppFuture(1)
|
|
63
|
+
executor.parsl_results = [ExecutionFuture(future, definition=metric_definition, execution_id=None)]
|
|
64
|
+
|
|
65
|
+
future.set_result(
|
|
66
|
+
ExecutionResult(
|
|
67
|
+
definition=metric_definition,
|
|
68
|
+
successful=False,
|
|
69
|
+
output_bundle_filename=None,
|
|
70
|
+
metric_bundle_filename=None,
|
|
71
|
+
)
|
|
72
|
+
)
|
|
73
|
+
executor.join(0.1)
|
|
74
|
+
|
|
75
|
+
assert len(executor.parsl_results) == 0
|
|
76
|
+
|
|
77
|
+
def test_join_exception(self, metric_definition, tmp_path):
|
|
78
|
+
executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
|
|
79
|
+
future = futures.AppFuture(1)
|
|
80
|
+
executor.parsl_results = [ExecutionFuture(future, definition=metric_definition, execution_id=None)]
|
|
81
|
+
|
|
82
|
+
future.set_exception(ValueError("Some thing bad went wrong"))
|
|
83
|
+
|
|
84
|
+
with pytest.raises(ExecutionError, match=re.escape("Failed to execute 'mock_provider/mock/key'")):
|
|
85
|
+
executor.join(0.1)
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
from unittest.mock import MagicMock
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from climate_ref.slurm import SlurmChecker
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.fixture
def create_mock_association():
    """Provide a factory that builds mock PySlurm association objects."""

    def _create_mock_association(**kwargs):
        """
        Build a ``MagicMock`` standing in for a PySlurm association.

        Args:
            **kwargs: Attribute overrides; these take precedence over the baseline values

        Returns:
            MagicMock: Mock carrying every baseline/override value as an attribute
        """
        # Baseline attribute values; ``**kwargs`` at the end overrides or extends them
        attributes = {
            "account": "test_account",
            "cluster": "test_cluster",
            "comment": None,
            "group_jobs": "UNLIMITED",
            "group_jobs_accrue": "UNLIMITED",
            "group_submit_jobs": "UNLIMITED",
            "group_tres": None,
            "group_tres_mins": None,
            "group_tres_run_mins": None,
            "group_wall_time": "UNLIMITED",
            "is_default": False,
            "max_jobs_accrue": 4,
            "max_submit_jobs": "UNLIMITED",
            "max_tres_mins_per_job": None,
            "max_tres_per_job": None,
            "max_tres_per_node": None,
            "max_tres_run_mins_per_user": None,
            "max_wall_time_per_job": "UNLIMITED",
            "max_jobs": 10,
            "max_wall_pj": "1-00:00:00",
            "min_priority_threshold": "UNLIMITED",
            "partition": "test_partition",
            "shares": 100,
            "qos": ["normal"],
            "priority": 1000,
            "id": 1,
            "lft": 0,
            "parent_acct": None,
            "parent_account_id": 4,
            "rgt": 0,
            "user": "test_user",
            **kwargs,
        }

        association = MagicMock()

        # Copy each value onto the mock as a plain attribute
        for attr_name in attributes:
            setattr(association, attr_name, attributes[attr_name])

        return association

    return _create_mock_association
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@pytest.fixture
def create_mock_partition():
    """Provide a factory that builds mock PySlurm partition objects."""

    def _create_mock_partition(**kwargs):
        """
        Build a ``MagicMock`` standing in for a PySlurm partition.

        Args:
            **kwargs: Attribute overrides; these take precedence over the baseline values

        Returns:
            MagicMock: Mock carrying every value as an attribute, whose
                ``to_dict()`` returns the same values as a dictionary
        """
        # Baseline attribute values; ``**kwargs`` at the end overrides or extends them
        attributes = {
            "allow_root_jobs": False,
            "allowed_accounts": ["ALL"],
            "allowed_groups": ["ALL"],
            "allowed_qos": ["ALL"],
            "allowed_submit_nodes": ["ALL"],
            "alternate": None,
            "cluster": "test_cluster",
            "cpu_binding": None,
            "default_cpus_per_gpu": None,
            "default_memory_per_cpu": None,
            "default_memory_per_gpu": None,
            "default_memory_per_node": "UNLIMITED",
            "default_time": 720,
            "denied_accounts": ["ALL"],
            "denied_qos": ["ALL"],
            "is_default": False,
            "is_hidden": False,
            "is_root_only": False,
            "is_user_exclusive": False,
            "least_loaded_nodes_scheduling": False,
            "max_cpus_per_node": "UNLIMITED",
            "max_cpus_per_socket": "UNLIMITED",
            "max_memory_per_cpu": None,
            "max_memory_per_node": "UNLIMITED",
            "max_nodes": "UNLIMITED",
            "max_time": 7200,
            "min_nodes": 0,
            "name": "batch",
            "nodes": "baseline[3-140]",
            "nodesets": [],
            "over_time_limit": None,
            "oversubscribe": "EXCLUSIVE",
            "preempt_mode": "OFF",
            "preemption_grace_time": None,
            "priority_job_factor": 1,
            "priority_tier": 1,
            "qos": None,
            "requires_reservation": False,
            "select_type_parameters": [],
            "state": "UP",
            "total_cpus": 17664,
            "total_nodes": 138,
            **kwargs,
        }

        partition = MagicMock()

        # Copy each value onto the mock as a plain attribute
        for attr_name in attributes:
            setattr(partition, attr_name, attributes[attr_name])

        # ``to_dict()`` hands back the very dictionary the attributes came from
        partition.to_dict.side_effect = lambda: attributes

        return partition

    return _create_mock_partition
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@pytest.fixture
def create_mock_qos():
    """Provide a factory that builds mock PySlurm QoS objects."""

    def _create_mock_qos(**kwargs):
        """
        Build a ``MagicMock`` standing in for a PySlurm QoS.

        Args:
            **kwargs: Attribute overrides; these take precedence over the baseline values

        Returns:
            MagicMock: Mock carrying every baseline/override value as an attribute
        """
        # Baseline attribute values; ``**kwargs`` at the end overrides or extends them
        attributes = {
            "name": "normal",
            "description": "Default Quality of Service",
            "flags": 0,
            "grace_time": 0,
            "grp_jobs": 4294967295,
            "grp_submit_jobs": 4294967295,
            "grp_tres": None,
            "grp_tres_mins": None,
            "grp_tres_run_mins": None,
            "grp_wall": 4294967295,
            "max_jobs_pu": 4294967295,
            "max_submit_jobs_pu": 4294967295,
            "max_tres_mins_pj": None,
            "max_tres_pj": None,
            "max_tres_pn": None,
            "max_tres_pu": None,
            "max_tres_run_mins_pu": None,
            "max_wall_pj": 4294967295,
            "min_tres_pj": None,
            "preempt_mode": "OFF",
            "priority": 0,
            "usage_factor": 1.0,
            "usage_thres": 4294967295.0,
            **kwargs,
        }

        qos = MagicMock()

        # Copy each value onto the mock as a plain attribute
        for attr_name in attributes:
            setattr(qos, attr_name, attributes[attr_name])

        return qos

    return _create_mock_qos
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
@pytest.fixture
def create_mock_node():
    """Provide a factory that builds mock PySlurm node objects."""

    def _create_mock_node(**kwargs):
        """
        Build a ``MagicMock`` standing in for a PySlurm node.

        Args:
            **kwargs: Attribute overrides; these take precedence over the baseline values

        Returns:
            MagicMock: Mock carrying every baseline/override value as an attribute
        """
        # Baseline attribute values; ``**kwargs`` at the end overrides or extends them
        attributes = {
            "active_features": [],
            "address": "test_address",
            "allocated_cpus": 0,
            "allocated_gres": {},
            "allocated_memory": 0,
            "allocated_tres": {},
            "architecture": "x86_64",
            "available_features": [],
            "avg_watts": 0,
            "bcast_address": None,
            "boards": 1,
            "boot_time": 1747765065,
            "cap_watts": 0,
            "cluster": "baseline",
            "comment": None,
            "configured_gres": {},
            "cores_per_socket": 64,
            "cores_reserved_for_system": None,
            "cpu_binding": None,
            "cpu_load": 0.18,
            "current_watts": 0,
            "effective_cpus": 128,
            "external_sensors": {"joules_total": None, "current_watts": None, "temperature": None},
            "extra": None,
            "free_memory": 231147,
            "hostname": "baseline101",
            "idle_cpus": 128,
            "idle_memory": 256000,
            "last_busy_time": 1747789216,
            "mcs_label": None,
            "memory_reserved_for_system": None,
            "name": "baseline101",
            "next_state": None,
            "operating_system": "Linux 4.18.0-553.46.1.el8_10.x86_64 #1 SMP Sat Mar 15 01:37:33 EDT 2025",
            "owner": None,
            "partitions": ["batch", "batch_low_memory"],
            "real_memory": 256000,
            "reason": None,
            "reason_time": None,
            "reason_user": None,
            "slurm_version": "23.02.7",
            "slurmd_port": 6818,
            "slurmd_start_time": 1747765241,
            "sockets": 2,
            "state": "IDLE",
            "temporary_disk": None,
            "threads_per_core": 1,
            "total_cpus": 128,
            "weight": 1,
            **kwargs,
        }

        node = MagicMock()

        # Copy each value onto the mock as a plain attribute
        for attr_name in attributes:
            setattr(node, attr_name, attributes[attr_name])

        return node

    return _create_mock_node
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def test_slurm_checker(
    create_mock_association, create_mock_partition, create_mock_qos, create_mock_node, mocker
):
    """Exercise ``SlurmChecker`` against a fully mocked ``pyslurm`` module."""
    # Setup mock data
    associations = {
        1: create_mock_association(account="climate_ref1", partition="cpu"),
        2: create_mock_association(account="climate_ref2", partition="gpu"),
    }

    # Each partition's ``name`` matches the key it is registered under
    partitions = {
        "normal": create_mock_partition(name="normal"),
        "cpu": create_mock_partition(name="cpu"),
        "batch": create_mock_partition(name="batch"),
        "gpu": create_mock_partition(name="gpu"),
    }

    qoss = {
        "normal": create_mock_qos(name="normal"),
    }
    nodes = {
        "node0001": create_mock_node(name="node0001"),
    }

    mock_pyslurm = MagicMock()

    # Set up the nested structure: each ``load`` call returns the canned data above
    mock_pyslurm.db = MagicMock()
    mock_pyslurm.db.Associations = MagicMock()
    mock_pyslurm.db.Associations.load.return_value = associations

    mock_pyslurm.Partitions = MagicMock()
    mock_pyslurm.Partitions.load.return_value = partitions

    mock_pyslurm.Nodes = MagicMock()
    mock_pyslurm.Nodes.load.return_value = nodes

    # Mock QoS: ``pyslurm.qos()`` returns an object whose ``get()`` yields the QoS table
    mock_qos_instance = MagicMock()
    mock_qos_instance.get.return_value = qoss
    mock_pyslurm.qos.return_value = mock_qos_instance

    # Register the fake module so that ``import pyslurm`` resolves to the mock.
    # NOTE(review): presumably SlurmChecker imports pyslurm lazily - confirm.
    mocker.patch.dict("sys.modules", {"pyslurm": mock_pyslurm})

    # Create and test the checker
    checker = SlurmChecker(intest=True)

    assert checker.can_account_use_partition("climate_ref1", "cpu") is True
    assert checker.can_account_use_partition("climate_ref2", "gpu") is True
    assert checker.can_account_use_partition("climate", "nonexistent") is False

    assert checker.can_account_use_qos("climate_ref1", "normal") is True
    assert checker.can_account_use_qos("climate_ref3", "normal") is False

    # NOTE(review): the mocked partition has max_nodes="UNLIMITED" while 1 is
    # expected here; the mapping lives in SlurmChecker - verify against it.
    assert checker.get_partition_limits("cpu") == {
        "max_time_minutes": 7200,
        "default_time_minutes": 720,
        "max_nodes": 1,
        "total_nodes": 138,
        "total_cpus": 17664,
    }
    assert checker.check_account_partition_access_with_limits("climate_ref2", "gpu") == {
        "account_exists": True,
        "partition_exists": True,
        "has_access": True,
        "time_limits": 7200,
        "error": "none",
    }
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{climate_ref-0.6.0 → climate_ref-0.6.1}/src/climate_ref/dataset_registry/obs4ref_reference.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_local.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_db.yml
RENAMED
|
File without changes
|
{climate_ref-0.6.0 → climate_ref-0.6.1}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_local.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|