runops 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runops/__init__.py +5 -0
- runops/_data/README.md +476 -0
- runops/adapters/__init__.py +29 -0
- runops/adapters/_utils/__init__.py +36 -0
- runops/adapters/_utils/toml_utils.py +81 -0
- runops/adapters/base.py +335 -0
- runops/adapters/contrib/__init__.py +5 -0
- runops/adapters/contrib/beach.py +837 -0
- runops/adapters/contrib/emses.py +1010 -0
- runops/adapters/generic.py +439 -0
- runops/adapters/registry.py +244 -0
- runops/cli/__init__.py +3 -0
- runops/cli/analyze.py +222 -0
- runops/cli/clone.py +104 -0
- runops/cli/config.py +217 -0
- runops/cli/context.py +56 -0
- runops/cli/create.py +263 -0
- runops/cli/dashboard.py +179 -0
- runops/cli/extend.py +204 -0
- runops/cli/history.py +105 -0
- runops/cli/init.py +1432 -0
- runops/cli/jobs.py +145 -0
- runops/cli/knowledge.py +1017 -0
- runops/cli/list.py +102 -0
- runops/cli/log.py +163 -0
- runops/cli/main.py +96 -0
- runops/cli/manage.py +231 -0
- runops/cli/new.py +343 -0
- runops/cli/notes.py +257 -0
- runops/cli/run_lookup.py +148 -0
- runops/cli/setup.py +174 -0
- runops/cli/status.py +187 -0
- runops/cli/submit.py +297 -0
- runops/cli/update.py +113 -0
- runops/cli/update_harness.py +245 -0
- runops/cli/update_refs.py +370 -0
- runops/core/__init__.py +3 -0
- runops/core/actions.py +1186 -0
- runops/core/analysis.py +1090 -0
- runops/core/campaign.py +156 -0
- runops/core/case.py +307 -0
- runops/core/context.py +426 -0
- runops/core/discovery.py +192 -0
- runops/core/environment.py +266 -0
- runops/core/exceptions.py +93 -0
- runops/core/knowledge.py +595 -0
- runops/core/knowledge_source.py +1204 -0
- runops/core/manifest.py +219 -0
- runops/core/project.py +171 -0
- runops/core/provenance.py +147 -0
- runops/core/retry.py +193 -0
- runops/core/run.py +170 -0
- runops/core/run_creation.py +456 -0
- runops/core/site.py +337 -0
- runops/core/state.py +197 -0
- runops/core/survey.py +380 -0
- runops/core/validation.py +40 -0
- runops/harness/__init__.py +27 -0
- runops/harness/builder.py +327 -0
- runops/harness/claude.py +189 -0
- runops/jobgen/__init__.py +3 -0
- runops/jobgen/generator.py +295 -0
- runops/launchers/__init__.py +17 -0
- runops/launchers/base.py +313 -0
- runops/launchers/mpiexec.py +131 -0
- runops/launchers/mpirun.py +132 -0
- runops/launchers/srun.py +126 -0
- runops/sites/__init__.py +0 -0
- runops/sites/camphor.md +98 -0
- runops/sites/camphor.toml +27 -0
- runops/slurm/__init__.py +3 -0
- runops/slurm/query.py +384 -0
- runops/slurm/submit.py +203 -0
- runops/templates/__init__.py +29 -0
- runops/templates/adapters/beach/agent_guide.md +50 -0
- runops/templates/adapters/beach/beach.toml +19 -0
- runops/templates/adapters/beach/case.toml +16 -0
- runops/templates/adapters/beach/summarize.py +272 -0
- runops/templates/adapters/emses/agent_guide.md +39 -0
- runops/templates/adapters/emses/case.toml +18 -0
- runops/templates/adapters/emses/plasma.toml +118 -0
- runops/templates/adapters/emses/summarize.py +413 -0
- runops/templates/adapters/generic/case.toml.j2 +13 -0
- runops/templates/adapters/generic/summarize.py +21 -0
- runops/templates/agent.md +156 -0
- runops/templates/rules/cookbook.md +22 -0
- runops/templates/scaffold/campaign.toml.j2 +10 -0
- runops/templates/scaffold/cases_claude.md +22 -0
- runops/templates/scaffold/facts.toml +2 -0
- runops/templates/scaffold/gitignore.txt +30 -0
- runops/templates/scaffold/notes/README.md +69 -0
- runops/templates/scaffold/rules/plan-before-act.md +17 -0
- runops/templates/scaffold/rules/runops-workflow.md +84 -0
- runops/templates/scaffold/rules/upstream-feedback.md +85 -0
- runops/templates/scaffold/runs_claude.md +24 -0
- runops/templates/scaffold/vscode_settings.json +9 -0
- runops/templates/skills/analyze/SKILL.md +40 -0
- runops/templates/skills/check-status/SKILL.md +29 -0
- runops/templates/skills/cleanup/SKILL.md +43 -0
- runops/templates/skills/create-run/SKILL.md +135 -0
- runops/templates/skills/debug-failed/SKILL.md +38 -0
- runops/templates/skills/learn/SKILL.md +54 -0
- runops/templates/skills/new-case/SKILL.md +108 -0
- runops/templates/skills/note/SKILL.md +107 -0
- runops/templates/skills/run-all/SKILL.md +47 -0
- runops/templates/skills/runops-reference/SKILL.md +203 -0
- runops/templates/skills/setup-campaign/SKILL.md +111 -0
- runops/templates/skills/setup-env/SKILL.md +32 -0
- runops/templates/skills/survey-design/SKILL.md +73 -0
- runops/templates/survey.toml.j2 +22 -0
- runops-0.2.0.dist-info/METADATA +491 -0
- runops-0.2.0.dist-info/RECORD +115 -0
- runops-0.2.0.dist-info/WHEEL +4 -0
- runops-0.2.0.dist-info/entry_points.txt +2 -0
- runops-0.2.0.dist-info/licenses/LICENSE +201 -0
runops/slurm/query.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
"""Slurm job state queries via squeue and sacct.
|
|
2
|
+
|
|
3
|
+
Provides functions to query active and historical job states and map them to
|
|
4
|
+
runops ``RunState`` values. All subprocess calls go through an injectable
|
|
5
|
+
``CommandRunner`` callable so that tests never invoke real Slurm commands.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from collections import OrderedDict
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
from runops.core.state import RunState
|
|
15
|
+
from runops.slurm.submit import (
|
|
16
|
+
CommandResult,
|
|
17
|
+
CommandRunner,
|
|
18
|
+
SlurmNotFoundError,
|
|
19
|
+
_default_runner,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Slurm state -> runops RunState mapping
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
# Lookup table from a bare Slurm state word to the runops ``RunState``.
# ``map_slurm_state`` indexes this dict with the first whitespace-separated
# word of the raw state, trailing "+" removed, so keys must be written in
# that normalised form.  Any state missing here makes ``map_slurm_state``
# raise ``SlurmQueryError``.
# NOTE(review): the squeue manual lists further state codes (e.g. RESIZING,
# SIGNALING, STAGE_OUT, STOPPED, SPECIAL_EXIT, REVOKED) that are not mapped
# here -- confirm that raising for those is intended.
_SLURM_STATE_MAP: dict[str, RunState] = {
    # Active / queued
    "PENDING": RunState.SUBMITTED,
    "CONFIGURING": RunState.RUNNING,
    "RUNNING": RunState.RUNNING,
    "COMPLETING": RunState.RUNNING,
    "SUSPENDED": RunState.RUNNING,
    "REQUEUED": RunState.SUBMITTED,
    # Successful termination
    "COMPLETED": RunState.COMPLETED,
    # Failure modes
    "FAILED": RunState.FAILED,
    "NODE_FAIL": RunState.FAILED,
    "OUT_OF_MEMORY": RunState.FAILED,
    "TIMEOUT": RunState.FAILED,
    "PREEMPTED": RunState.FAILED,
    "BOOT_FAIL": RunState.FAILED,
    "DEADLINE": RunState.FAILED,
    # Cancellation
    "CANCELLED": RunState.CANCELLED,
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
#: Maps Slurm failure states to human-readable failure reasons.
#: Keys are normalised Slurm state words; ``query_job_status`` looks them up
#: with ``.get(raw, "")``, so a state that is absent here (for example
#: ``CANCELLED`` or ``COMPLETED``) yields an empty failure reason.
_FAILURE_REASON_MAP: dict[str, str] = {
    "TIMEOUT": "timeout",
    "OUT_OF_MEMORY": "oom",
    "NODE_FAIL": "node_fail",
    "PREEMPTED": "preempted",
    "BOOT_FAIL": "boot_fail",
    "DEADLINE": "deadline",
    "FAILED": "exit_error",
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class SlurmQueryError(RuntimeError):
    """Raised when a Slurm query command fails unexpectedly.

    In this module it is raised for a non-zero exit from ``sinfo``,
    ``squeue`` or ``sacct``, for a Slurm state string that is not in the
    state map, and when a job is found by neither ``squeue`` nor ``sacct``.
    """
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(frozen=True)
class PartitionInfo:
    """Information about a Slurm partition from ``sinfo``.

    Immutable record that ``sinfo_partitions`` builds from one
    ``%P|%a|%l|%D`` output row.

    Attributes:
        name: Partition name, without the trailing ``*`` that sinfo appends
            to the default partition.
        avail: Availability status (e.g. ``"up"``).
        timelimit: Raw time limit string from sinfo (e.g. ``"5-00:00:00"``).
        timelimit_hours: Time limit in hours (for easy comparison);
            ``float("inf")`` when the limit is unlimited or cannot be parsed.
        nodes_total: Total node count in the partition; ``0`` when the
            sinfo node field is not an integer.
    """

    name: str
    avail: str
    timelimit: str
    timelimit_hours: float
    nodes_total: int = 0
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass(frozen=True)
class JobStatus:
    """Result of a Slurm job status query.

    Immutable record returned by ``query_job_status``.

    Attributes:
        run_state: Mapped runops RunState.
        slurm_state: Raw Slurm state string.  ``query_job_status`` stores
            only the first word of the reported state, with any trailing
            ``+`` removed.
        failure_reason: Reason for failure (empty if not failed).  Only
            filled in when the status was obtained from ``sacct``.
        exit_code: Slurm exit code string (if available).  Taken from the
            sacct ``ExitCode`` field; left empty for ``squeue`` results.
    """

    run_state: RunState
    slurm_state: str
    failure_reason: str = ""
    exit_code: str = ""
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# Helpers
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def map_slurm_state(slurm_state: str) -> RunState:
    """Map a raw Slurm job state string to a runops ``RunState``.

    Slurm may append qualifiers like ``CANCELLED by 1000`` -- only the first
    word is used for the lookup, and a trailing ``+`` on that word is
    dropped.

    Args:
        slurm_state: Raw state string from squeue or sacct (e.g.
            ``"RUNNING"``, ``"CANCELLED by 1000"``).

    Returns:
        The corresponding ``RunState``.

    Raises:
        SlurmQueryError: If the state is not recognised, including when the
            string is empty or contains only whitespace.
    """
    words = slurm_state.split()
    # An empty state has no first word.  Use a key that cannot be in the map
    # so the caller gets the documented SlurmQueryError, not an IndexError.
    key = words[0].rstrip("+") if words else ""
    try:
        return _SLURM_STATE_MAP[key]
    except KeyError:
        raise SlurmQueryError(f"Unknown Slurm job state: {slurm_state!r}") from None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
# sinfo (partition queries)
|
|
132
|
+
# ---------------------------------------------------------------------------
|
|
133
|
+
|
|
134
|
+
_TIMELIMIT_RE = re.compile(r"(?:(\d+)-)?(\d+):(\d+):(\d+)")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _parse_timelimit(timelimit: str) -> float:
|
|
138
|
+
"""Parse a Slurm time limit string to hours.
|
|
139
|
+
|
|
140
|
+
Supports formats like ``5-00:00:00``, ``120:00:00``, ``infinite``.
|
|
141
|
+
|
|
142
|
+
Args:
|
|
143
|
+
timelimit: Raw time limit string from sinfo.
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
Time limit in hours, or ``float('inf')`` for unlimited.
|
|
147
|
+
"""
|
|
148
|
+
if timelimit.lower() in ("infinite", "n/a"):
|
|
149
|
+
return float("inf")
|
|
150
|
+
m = _TIMELIMIT_RE.match(timelimit.strip())
|
|
151
|
+
if not m:
|
|
152
|
+
return float("inf")
|
|
153
|
+
day_str, hour_str, min_str, sec_str = m.groups()
|
|
154
|
+
day = int(day_str) if day_str else 0
|
|
155
|
+
hour = int(hour_str)
|
|
156
|
+
minutes = int(min_str)
|
|
157
|
+
sec = int(sec_str)
|
|
158
|
+
return 24.0 * day + hour + minutes / 60.0 + sec / 3600.0
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def sinfo_partitions(
    *,
    runner: CommandRunner | None = None,
) -> OrderedDict[str, PartitionInfo]:
    """List the partitions reported by ``sinfo`` together with their limits.

    Runs ``sinfo --noheader --format=%P|%a|%l|%D`` and turns every output
    row with at least four ``|``-separated fields into a
    :class:`PartitionInfo`.  Rows with fewer fields are ignored.

    Args:
        runner: Optional command runner for testing.

    Returns:
        Ordered dict mapping partition name to :class:`PartitionInfo`, in
        the order sinfo printed the rows.

    Raises:
        SlurmNotFoundError: If ``sinfo`` is not on PATH.
        SlurmQueryError: If sinfo returns an error.
    """
    execute = runner or _default_runner
    outcome = execute(["sinfo", "--noheader", "--format=%P|%a|%l|%D"])

    if outcome.returncode != 0:
        raise SlurmQueryError(
            f"sinfo failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
        )

    found: OrderedDict[str, PartitionInfo] = OrderedDict()
    for row in outcome.stdout.strip().splitlines():
        fields = row.strip().split("|")
        if len(fields) < 4:
            continue
        marked_name, avail, limit, node_field = fields[:4]
        # sinfo flags the default partition with a trailing '*'.
        name = marked_name.rstrip("*")
        try:
            node_count = int(node_field)
        except ValueError:
            node_count = 0
        found[name] = PartitionInfo(
            name=name,
            avail=avail,
            timelimit=limit,
            timelimit_hours=_parse_timelimit(limit),
            nodes_total=node_count,
        )

    return found
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ---------------------------------------------------------------------------
|
|
219
|
+
# squeue
|
|
220
|
+
# ---------------------------------------------------------------------------
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def squeue_status(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> str | None:
    """Ask ``squeue`` for the state of a job that may still be queued.

    Args:
        job_id: Slurm job ID.
        runner: Optional command runner for testing.

    Returns:
        The stripped squeue output (e.g. ``"RUNNING"``) while the job is
        still in the queue, or ``None`` once it has left the queue.

    Raises:
        SlurmNotFoundError: If ``squeue`` is not on PATH.
        SlurmQueryError: If squeue returns a non-zero exit code for any
            reason other than an unknown job ID.
    """
    execute = runner or _default_runner
    outcome = execute(["squeue", "--job", job_id, "--noheader", "--format=%T"])

    if outcome.returncode == 0:
        # Empty output means the job is no longer listed.
        return outcome.stdout.strip() or None

    # squeue returns non-zero when the job is not found on some clusters
    if "Invalid job id" in outcome.stderr:
        return None

    raise SlurmQueryError(
        f"squeue failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
    )
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# ---------------------------------------------------------------------------
|
|
271
|
+
# sacct
|
|
272
|
+
# ---------------------------------------------------------------------------
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def sacct_status(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> dict[str, str] | None:
    """Look up a job's recorded state and exit code with ``sacct``.

    The command is run with ``--parsable2 --noheader`` and an explicit
    ``JobID,State,ExitCode`` format so each record is a ``|``-separated row.

    Args:
        job_id: Slurm job ID.
        runner: Optional command runner for testing.

    Returns:
        A dictionary with keys ``"state"`` and ``"exit_code"`` taken from
        the row whose JobID equals *job_id*, or ``None`` if sacct printed
        no such row.

    Raises:
        SlurmNotFoundError: If ``sacct`` is not on PATH.
        SlurmQueryError: If sacct returns a non-zero exit code.
    """
    execute = runner or _default_runner
    outcome = execute(
        [
            "sacct",
            "--jobs",
            job_id,
            "--noheader",
            "--parsable2",
            "--format=JobID,State,ExitCode",
        ]
    )

    if outcome.returncode != 0:
        raise SlurmQueryError(
            f"sacct failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
        )

    # sacct prints one row per job step; only the row for the job itself
    # (not sub-steps such as "12345.batch") carries the state we report.
    for record in outcome.stdout.strip().splitlines():
        fields = record.split("|")
        if len(fields) >= 3 and fields[0].strip() == job_id:
            return {"state": fields[1].strip(), "exit_code": fields[2].strip()}

    return None
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# ---------------------------------------------------------------------------
|
|
332
|
+
# Combined query
|
|
333
|
+
# ---------------------------------------------------------------------------
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def query_job_status(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> JobStatus:
    """Work out the current runops state of a Slurm job.

    ``squeue`` is consulted first because it is cheap and covers jobs that
    are still active.  When the job is no longer queued, ``sacct`` is used
    instead, which covers completed / historical jobs.

    Args:
        job_id: Slurm job ID.
        runner: Optional command runner for testing.

    Returns:
        A :class:`JobStatus` with the mapped state and the normalised Slurm
        state.  Failure reason and exit code are only set for results that
        came from ``sacct``.

    Raises:
        SlurmNotFoundError: If Slurm commands are not on PATH.
        SlurmQueryError: If neither squeue nor sacct can find the job, or
            if the returned state is unrecognised.
    """
    # Active jobs: squeue knows them.
    queued_state = squeue_status(job_id, runner=runner)
    if queued_state is not None:
        base_state = queued_state.strip().split()[0].rstrip("+")
        return JobStatus(
            run_state=map_slurm_state(queued_state),
            slurm_state=base_state,
        )

    # Historical jobs: fall back to the accounting record.
    record = sacct_status(job_id, runner=runner)
    if record is None:
        raise SlurmQueryError(
            f"Job {job_id} not found in squeue or sacct. "
            "It may have been purged from the Slurm database."
        )

    base_state = record["state"].strip().split()[0].rstrip("+")
    mapped = map_slurm_state(record["state"])
    return JobStatus(
        run_state=mapped,
        slurm_state=base_state,
        failure_reason=_FAILURE_REASON_MAP.get(base_state, ""),
        exit_code=record.get("exit_code", ""),
    )
|
runops/slurm/submit.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""Slurm sbatch submission.
|
|
2
|
+
|
|
3
|
+
Provides functions to submit job scripts via sbatch and parse the resulting
|
|
4
|
+
job ID. All subprocess calls go through a single ``run_command`` callable
|
|
5
|
+
so that tests can inject a mock without touching the real Slurm installation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
import subprocess
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import NamedTuple
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Thin subprocess abstraction
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
_SBATCH_JOB_RE = re.compile(r"Submitted batch job (\d+)")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CommandResult(NamedTuple):
    """Result of a shell command execution.

    This is the value every ``CommandRunner`` returns; the default runner
    fills it from the completed ``subprocess.run`` call.
    """

    # Process exit status.
    returncode: int
    # Captured standard output (text mode in the default runner).
    stdout: str
    # Captured standard error (text mode in the default runner).
    stderr: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
#: Type alias for the injectable command runner.
|
|
32
|
+
CommandRunner = Callable[[list[str]], CommandResult]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _default_runner(cmd: list[str]) -> CommandResult:
    """Run a command via ``subprocess.run``.

    This is the production implementation used when no mock is injected.
    Output is captured as text and a non-zero exit status is returned to
    the caller rather than raised.

    Args:
        cmd: Command and arguments to execute.

    Returns:
        A ``CommandResult`` with return code, stdout, and stderr.

    Raises:
        SlurmNotFoundError: If the command executable is not found on PATH.
        SlurmSubmitError: If the command does not finish within 60 seconds.
    """
    timeout_seconds = 60
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=False,
            timeout=timeout_seconds,
        )
    except FileNotFoundError as exc:
        raise SlurmNotFoundError(
            f"Command not found: {cmd[0]!r}. Is Slurm installed and on PATH?"
        ) from exc
    except subprocess.TimeoutExpired as exc:
        # This runner executes every Slurm command, not only sbatch, so the
        # message names the command that actually timed out.  The exception
        # type is unchanged for callers that already catch it.
        raise SlurmSubmitError(
            f"{cmd[0]} timed out after {timeout_seconds} seconds: {exc}"
        ) from exc
    return CommandResult(
        returncode=proc.returncode,
        stdout=proc.stdout,
        stderr=proc.stderr,
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# Exceptions
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class SlurmNotFoundError(RuntimeError):
    """Raised when the Slurm command is not found on PATH.

    The default command runner raises it when ``subprocess.run`` reports
    ``FileNotFoundError`` for the executable.
    """
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class SlurmSubmitError(RuntimeError):
    """Raised when sbatch fails or returns unexpected output.

    Raised for a non-zero sbatch exit status and for sbatch output from
    which no job ID can be parsed.  The default command runner also raises
    it when a command exceeds its timeout.
    """
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class SlurmCancelError(RuntimeError):
    """Raised when scancel fails.

    ``scancel_job`` raises it when ``scancel`` exits with a non-zero status.
    """
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
# Public API
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def parse_job_id(sbatch_stdout: str) -> str:
    """Pull the job ID out of the text sbatch printed on stdout.

    The text is searched for ``Submitted batch job <digits>``; the phrase
    may appear anywhere in the output.

    Args:
        sbatch_stdout: The captured stdout from an sbatch invocation.

    Returns:
        The numeric job ID as a string.

    Raises:
        SlurmSubmitError: If the output does not match the expected pattern.
    """
    found = _SBATCH_JOB_RE.search(sbatch_stdout)
    if found is not None:
        return found.group(1)
    raise SlurmSubmitError(
        f"Could not parse job ID from sbatch output: {sbatch_stdout!r}"
    )
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def sbatch_submit(
    job_script: Path,
    working_dir: Path,
    *,
    extra_args: list[str] | None = None,
    afterok: str | None = None,
    runner: CommandRunner | None = None,
) -> str:
    """Hand a job script to ``sbatch`` and return the new job's ID.

    The command line is ``sbatch --chdir=<working_dir>``, then the optional
    dependency flag, then *extra_args*, then the script path.

    Args:
        job_script: Path to the job script file (e.g. ``submit/job.sh``).
        working_dir: Directory passed to sbatch as ``--chdir`` (typically
            the run directory's ``work/`` subdirectory).
        extra_args: Additional sbatch arguments (e.g.
            ``["--partition=gr10451a"]``). These override script directives.
        afterok: If set, add ``--dependency=afterok:<job_id>`` so this job
            starts only after the specified job completes successfully.
        runner: Optional callable that executes a command list and returns
            a ``CommandResult``. Defaults to the real subprocess runner.
            Inject a mock here for testing.

    Returns:
        The Slurm job ID as a string.

    Raises:
        FileNotFoundError: If *job_script* does not exist.
        SlurmNotFoundError: If ``sbatch`` is not on PATH.
        SlurmSubmitError: If sbatch returns a non-zero exit code or its
            output cannot be parsed.
    """
    if not job_script.exists():
        raise FileNotFoundError(f"Job script not found: {job_script}")

    execute = runner or _default_runner
    dependency_flag = [f"--dependency=afterok:{afterok}"] if afterok else []
    cmd = [
        "sbatch",
        f"--chdir={working_dir}",
        *dependency_flag,
        *(extra_args or []),
        str(job_script),
    ]
    outcome = execute(cmd)

    if outcome.returncode != 0:
        raise SlurmSubmitError(
            f"sbatch failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
        )

    return parse_job_id(outcome.stdout)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def scancel_job(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> None:
    """Ask Slurm to cancel a job by running ``scancel <job_id>``.

    Args:
        job_id: The Slurm job ID to cancel.
        runner: Optional callable that executes a command list and returns
            a ``CommandResult``. Defaults to the real subprocess runner.

    Raises:
        SlurmNotFoundError: If ``scancel`` is not on PATH.
        SlurmCancelError: If scancel returns a non-zero exit code.
    """
    execute = runner or _default_runner
    outcome = execute(["scancel", job_id])
    if outcome.returncode == 0:
        return
    raise SlurmCancelError(
        f"scancel failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# Keep the old name as an alias so the existing test import still works,
|
|
190
|
+
# but point it at the new implementation.
|
|
191
|
+
def sbatch(job_script: Path) -> str:
    """Submit a job script via sbatch (legacy wrapper).

    Deprecated: prefer ``sbatch_submit`` which accepts a working directory
    and an injectable runner.  This wrapper uses the directory containing
    the script as the working directory and the default runner.

    Args:
        job_script: Path to the job.sh file.

    Returns:
        The Slurm job_id as a string.
    """
    script_dir = job_script.parent
    return sbatch_submit(job_script, script_dir)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Template loading utilities for runops."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import jinja2
|
|
8
|
+
|
|
9
|
+
_TEMPLATES_DIR = Path(__file__).resolve().parent
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_jinja_env() -> jinja2.Environment:
    """Build a Jinja2 environment rooted at this package's directory.

    A new environment is created on every call.  It keeps a template's
    trailing newline and uses ``jinja2.StrictUndefined`` for undefined
    variables.
    """
    template_loader = jinja2.FileSystemLoader(str(_TEMPLATES_DIR))
    return jinja2.Environment(
        loader=template_loader,
        keep_trailing_newline=True,
        undefined=jinja2.StrictUndefined,
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_static(relative_path: str) -> str:
    """Read a static (non-Jinja2) template file and return its text.

    Args:
        relative_path: Path of the file relative to the templates directory.

    Returns:
        The file content decoded as UTF-8.
    """
    target = _TEMPLATES_DIR / relative_path
    return target.read_text(encoding="utf-8")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def render(template_path: str, **kwargs: object) -> str:
    """Render a Jinja2 template with the given variables.

    Args:
        template_path: Template name relative to the templates directory.
        **kwargs: Variables made available to the template.

    Returns:
        The rendered text.
    """
    template = get_jinja_env().get_template(template_path)
    return template.render(**kwargs)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
### BEACH (BEM + Accumulated CHarge Simulator)
|
|
2
|
+
|
|
3
|
+
#### 概要
|
|
4
|
+
BEACH は境界要素法 (BEM) ベースの表面帯電シミュレーション。
|
|
5
|
+
MPI + OpenMP ハイブリッド並列で実行し、宇宙機表面の帯電現象を計算する。
|
|
6
|
+
|
|
7
|
+
#### 入力ファイル
|
|
8
|
+
- **`input/beach.toml`** — メイン設定ファイル (TOML 形式)
|
|
9
|
+
- `[sim]`: `dt`, `max_step`, `batch_count`, `field_solver`
|
|
10
|
+
- `[mesh]`: `obj_path` (OBJ メッシュファイルパス)
|
|
11
|
+
- `[environment]`: プラズマ環境パラメータ (密度, 温度, etc.)
|
|
12
|
+
- `[output]`: `dir` (出力ディレクトリ)
|
|
13
|
+
|
|
14
|
+
#### 出力ファイル (`work/latest/` 以下)
|
|
15
|
+
- `summary.txt` — 計算結果サマリー (完了時に生成)
|
|
16
|
+
- `charges.csv` — 表面電荷分布
|
|
17
|
+
- `charge_history.csv` — 電荷時間履歴
|
|
18
|
+
- `potential_history.csv` — 電位時間履歴
|
|
19
|
+
- `mesh_triangles.csv`, `mesh_sources.csv` — メッシュ情報
|
|
20
|
+
- `performance_profile.csv` — 性能プロファイル
|
|
21
|
+
|
|
22
|
+
#### 完了判定
|
|
23
|
+
- `work/latest/summary.txt` が存在 → completed
|
|
24
|
+
- stderr に "error", "fatal", "killed" → failed
|
|
25
|
+
- `charges.csv` のみ存在 → running (途中)
|
|
26
|
+
|
|
27
|
+
#### パラメータサーベイでよく変えるパラメータ
|
|
28
|
+
- `sim.dt`, `sim.max_step`, `sim.batch_count`
|
|
29
|
+
- `environment.electron_density`, `environment.electron_temperature`
|
|
30
|
+
- `environment.ion_density`, `environment.ion_temperature`
|
|
31
|
+
- `mesh.obj_path` (異なるジオメトリの比較)
|
|
32
|
+
|
|
33
|
+
#### ドキュメント・参考
|
|
34
|
+
- BEACH ソースリポジトリの README / docs/
|
|
35
|
+
- OBJ メッシュファイルは Blender 等で作成
|
|
36
|
+
- パラメータの dot 記法例: `sim.dt=1.0e-6`, `environment.electron_density=1.0e12`
|
|
37
|
+
|
|
38
|
+
#### 実行コマンド
|
|
39
|
+
```
|
|
40
|
+
srun beach input/beach.toml
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
`beach.toml` の `output.dir` は `work/latest` に自動設定される。
|
|
44
|
+
|
|
45
|
+
#### 環境変数 (OpenMP)
|
|
46
|
+
```
|
|
47
|
+
OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
|
|
48
|
+
OMP_PROC_BIND=spread
|
|
49
|
+
OMP_PLACES=cores
|
|
50
|
+
```
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# BEACH configuration
|
|
2
|
+
# See BEACH documentation for full parameter reference
|
|
3
|
+
|
|
4
|
+
[sim]
|
|
5
|
+
dt = 1.0e-6
|
|
6
|
+
max_step = 1000
|
|
7
|
+
batch_count = 100
|
|
8
|
+
|
|
9
|
+
[mesh]
|
|
10
|
+
# obj_path = "mesh.obj"
|
|
11
|
+
|
|
12
|
+
[environment]
|
|
13
|
+
electron_density = 1.0e12
|
|
14
|
+
electron_temperature = 1.0
|
|
15
|
+
ion_density = 1.0e12
|
|
16
|
+
ion_temperature = 1.0
|
|
17
|
+
|
|
18
|
+
[output]
|
|
19
|
+
# dir is set automatically by runops
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[case]
|
|
2
|
+
name = ""
|
|
3
|
+
simulator = "beach"
|
|
4
|
+
launcher = "default"
|
|
5
|
+
description = ""
|
|
6
|
+
|
|
7
|
+
[params]
|
|
8
|
+
# "sim.dt" = 1.0e-6
|
|
9
|
+
# "sim.max_step" = 1000
|
|
10
|
+
# "environment.electron_density" = 1.0e12
|
|
11
|
+
|
|
12
|
+
[job]
|
|
13
|
+
partition = ""
|
|
14
|
+
nodes = 1
|
|
15
|
+
ntasks = 1
|
|
16
|
+
walltime = "01:00:00"
|