runops 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. runops/__init__.py +5 -0
  2. runops/_data/README.md +476 -0
  3. runops/adapters/__init__.py +29 -0
  4. runops/adapters/_utils/__init__.py +36 -0
  5. runops/adapters/_utils/toml_utils.py +81 -0
  6. runops/adapters/base.py +335 -0
  7. runops/adapters/contrib/__init__.py +5 -0
  8. runops/adapters/contrib/beach.py +837 -0
  9. runops/adapters/contrib/emses.py +1010 -0
  10. runops/adapters/generic.py +439 -0
  11. runops/adapters/registry.py +244 -0
  12. runops/cli/__init__.py +3 -0
  13. runops/cli/analyze.py +222 -0
  14. runops/cli/clone.py +104 -0
  15. runops/cli/config.py +217 -0
  16. runops/cli/context.py +56 -0
  17. runops/cli/create.py +263 -0
  18. runops/cli/dashboard.py +179 -0
  19. runops/cli/extend.py +204 -0
  20. runops/cli/history.py +105 -0
  21. runops/cli/init.py +1432 -0
  22. runops/cli/jobs.py +145 -0
  23. runops/cli/knowledge.py +1017 -0
  24. runops/cli/list.py +102 -0
  25. runops/cli/log.py +163 -0
  26. runops/cli/main.py +96 -0
  27. runops/cli/manage.py +231 -0
  28. runops/cli/new.py +343 -0
  29. runops/cli/notes.py +257 -0
  30. runops/cli/run_lookup.py +148 -0
  31. runops/cli/setup.py +174 -0
  32. runops/cli/status.py +187 -0
  33. runops/cli/submit.py +297 -0
  34. runops/cli/update.py +113 -0
  35. runops/cli/update_harness.py +245 -0
  36. runops/cli/update_refs.py +370 -0
  37. runops/core/__init__.py +3 -0
  38. runops/core/actions.py +1186 -0
  39. runops/core/analysis.py +1090 -0
  40. runops/core/campaign.py +156 -0
  41. runops/core/case.py +307 -0
  42. runops/core/context.py +426 -0
  43. runops/core/discovery.py +192 -0
  44. runops/core/environment.py +266 -0
  45. runops/core/exceptions.py +93 -0
  46. runops/core/knowledge.py +595 -0
  47. runops/core/knowledge_source.py +1204 -0
  48. runops/core/manifest.py +219 -0
  49. runops/core/project.py +171 -0
  50. runops/core/provenance.py +147 -0
  51. runops/core/retry.py +193 -0
  52. runops/core/run.py +170 -0
  53. runops/core/run_creation.py +456 -0
  54. runops/core/site.py +337 -0
  55. runops/core/state.py +197 -0
  56. runops/core/survey.py +380 -0
  57. runops/core/validation.py +40 -0
  58. runops/harness/__init__.py +27 -0
  59. runops/harness/builder.py +327 -0
  60. runops/harness/claude.py +189 -0
  61. runops/jobgen/__init__.py +3 -0
  62. runops/jobgen/generator.py +295 -0
  63. runops/launchers/__init__.py +17 -0
  64. runops/launchers/base.py +313 -0
  65. runops/launchers/mpiexec.py +131 -0
  66. runops/launchers/mpirun.py +132 -0
  67. runops/launchers/srun.py +126 -0
  68. runops/sites/__init__.py +0 -0
  69. runops/sites/camphor.md +98 -0
  70. runops/sites/camphor.toml +27 -0
  71. runops/slurm/__init__.py +3 -0
  72. runops/slurm/query.py +384 -0
  73. runops/slurm/submit.py +203 -0
  74. runops/templates/__init__.py +29 -0
  75. runops/templates/adapters/beach/agent_guide.md +50 -0
  76. runops/templates/adapters/beach/beach.toml +19 -0
  77. runops/templates/adapters/beach/case.toml +16 -0
  78. runops/templates/adapters/beach/summarize.py +272 -0
  79. runops/templates/adapters/emses/agent_guide.md +39 -0
  80. runops/templates/adapters/emses/case.toml +18 -0
  81. runops/templates/adapters/emses/plasma.toml +118 -0
  82. runops/templates/adapters/emses/summarize.py +413 -0
  83. runops/templates/adapters/generic/case.toml.j2 +13 -0
  84. runops/templates/adapters/generic/summarize.py +21 -0
  85. runops/templates/agent.md +156 -0
  86. runops/templates/rules/cookbook.md +22 -0
  87. runops/templates/scaffold/campaign.toml.j2 +10 -0
  88. runops/templates/scaffold/cases_claude.md +22 -0
  89. runops/templates/scaffold/facts.toml +2 -0
  90. runops/templates/scaffold/gitignore.txt +30 -0
  91. runops/templates/scaffold/notes/README.md +69 -0
  92. runops/templates/scaffold/rules/plan-before-act.md +17 -0
  93. runops/templates/scaffold/rules/runops-workflow.md +84 -0
  94. runops/templates/scaffold/rules/upstream-feedback.md +85 -0
  95. runops/templates/scaffold/runs_claude.md +24 -0
  96. runops/templates/scaffold/vscode_settings.json +9 -0
  97. runops/templates/skills/analyze/SKILL.md +40 -0
  98. runops/templates/skills/check-status/SKILL.md +29 -0
  99. runops/templates/skills/cleanup/SKILL.md +43 -0
  100. runops/templates/skills/create-run/SKILL.md +135 -0
  101. runops/templates/skills/debug-failed/SKILL.md +38 -0
  102. runops/templates/skills/learn/SKILL.md +54 -0
  103. runops/templates/skills/new-case/SKILL.md +108 -0
  104. runops/templates/skills/note/SKILL.md +107 -0
  105. runops/templates/skills/run-all/SKILL.md +47 -0
  106. runops/templates/skills/runops-reference/SKILL.md +203 -0
  107. runops/templates/skills/setup-campaign/SKILL.md +111 -0
  108. runops/templates/skills/setup-env/SKILL.md +32 -0
  109. runops/templates/skills/survey-design/SKILL.md +73 -0
  110. runops/templates/survey.toml.j2 +22 -0
  111. runops-0.2.0.dist-info/METADATA +491 -0
  112. runops-0.2.0.dist-info/RECORD +115 -0
  113. runops-0.2.0.dist-info/WHEEL +4 -0
  114. runops-0.2.0.dist-info/entry_points.txt +2 -0
  115. runops-0.2.0.dist-info/licenses/LICENSE +201 -0
runops/slurm/query.py ADDED
@@ -0,0 +1,384 @@
1
+ """Slurm job state queries via squeue and sacct.
2
+
3
+ Provides functions to query active and historical job states and map them to
4
+ runops ``RunState`` values. All subprocess calls go through an injectable
5
+ ``CommandRunner`` callable so that tests never invoke real Slurm commands.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from collections import OrderedDict
12
+ from dataclasses import dataclass
13
+
14
+ from runops.core.state import RunState
15
+ from runops.slurm.submit import (
16
+ CommandResult,
17
+ CommandRunner,
18
+ SlurmNotFoundError,
19
+ _default_runner,
20
+ )
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Slurm state -> runops RunState mapping
24
+ # ---------------------------------------------------------------------------
25
+
26
_SLURM_STATE_MAP: dict[str, RunState] = {
    # Queued: waiting for resources, held, or being put back into the queue.
    "PENDING": RunState.SUBMITTED,
    "REQUEUED": RunState.SUBMITTED,
    "REQUEUE_FED": RunState.SUBMITTED,
    "REQUEUE_HOLD": RunState.SUBMITTED,
    "RESV_DEL_HOLD": RunState.SUBMITTED,
    # Active: the job holds (or is acquiring / releasing) an allocation.
    "CONFIGURING": RunState.RUNNING,
    "RUNNING": RunState.RUNNING,
    "COMPLETING": RunState.RUNNING,
    "SUSPENDED": RunState.RUNNING,
    # STOPPED keeps its allocation (SIGSTOP), so treat it like SUSPENDED.
    "STOPPED": RunState.RUNNING,
    "RESIZING": RunState.RUNNING,
    "SIGNALING": RunState.RUNNING,
    "STAGE_OUT": RunState.RUNNING,
    # Successful termination
    "COMPLETED": RunState.COMPLETED,
    # Failure modes (see _FAILURE_REASON_MAP for the matching reasons)
    "FAILED": RunState.FAILED,
    "NODE_FAIL": RunState.FAILED,
    "OUT_OF_MEMORY": RunState.FAILED,
    "TIMEOUT": RunState.FAILED,
    "PREEMPTED": RunState.FAILED,
    "BOOT_FAIL": RunState.FAILED,
    "DEADLINE": RunState.FAILED,
    # Cancellation
    "CANCELLED": RunState.CANCELLED,
}
47
+
48
+
49
#: Short failure-reason tag for each Slurm state that maps to a failed run.
#: States missing from this table (e.g. ``CANCELLED``) have no reason tag.
_FAILURE_REASON_MAP: dict[str, str] = {
    "TIMEOUT": "timeout",
    "OUT_OF_MEMORY": "oom",
    "NODE_FAIL": "node_fail",
    "PREEMPTED": "preempted",
    "BOOT_FAIL": "boot_fail",
    "DEADLINE": "deadline",
    "FAILED": "exit_error",
}
59
+
60
+
61
class SlurmQueryError(RuntimeError):
    """Signal that a Slurm query failed or produced an unusable answer."""
63
+
64
+
65
@dataclass(frozen=True)
class PartitionInfo:
    """Immutable record describing one partition as reported by ``sinfo``.

    Attributes:
        name: Name of the partition.
        avail: Availability column as printed by sinfo (e.g. ``"up"``).
        timelimit: Unparsed time limit text (e.g. ``"5-00:00:00"``).
        timelimit_hours: The same limit expressed in hours.
        nodes_total: Number of nodes in the partition; defaults to ``0``.
    """

    name: str
    avail: str
    timelimit: str
    timelimit_hours: float
    nodes_total: int = 0
82
+
83
+
84
@dataclass(frozen=True)
class JobStatus:
    """Immutable outcome of looking up a Slurm job.

    Attributes:
        run_state: The runops ``RunState`` the Slurm state was mapped to.
        slurm_state: The Slurm state string the mapping was based on.
        failure_reason: Failure reason tag; empty string when there is none.
        exit_code: Exit code string reported by Slurm; empty when unknown.
    """

    run_state: RunState
    slurm_state: str
    failure_reason: str = ""
    exit_code: str = ""
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Helpers
103
+ # ---------------------------------------------------------------------------
104
+
105
+
106
def map_slurm_state(slurm_state: str) -> RunState:
    """Map a raw Slurm job state string to a runops ``RunState``.

    Slurm may append qualifiers like ``CANCELLED by 1000`` or a trailing
    ``+`` -- only the first whitespace-separated word, with any trailing
    ``+`` removed, is used for the lookup.

    Args:
        slurm_state: Raw state string from squeue or sacct (e.g.
            ``"RUNNING"``, ``"CANCELLED by 1000"``).

    Returns:
        The corresponding ``RunState``.

    Raises:
        SlurmQueryError: If the state is empty, blank, or not recognised.
    """
    words = slurm_state.split()
    if not words:
        # An empty/blank state used to escape as a bare IndexError; report it
        # through the same exception as any other unusable state.
        raise SlurmQueryError(f"Unknown Slurm job state: {slurm_state!r}")

    # Take first token to handle "CANCELLED by UID" variants
    key = words[0].rstrip("+")
    try:
        return _SLURM_STATE_MAP[key]
    except KeyError:
        raise SlurmQueryError(f"Unknown Slurm job state: {slurm_state!r}") from None
128
+
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # sinfo (partition queries)
132
+ # ---------------------------------------------------------------------------
133
+
134
# ``[days-]hours:minutes:seconds``
_TIMELIMIT_RE = re.compile(r"(?:(\d+)-)?(\d+):(\d+):(\d+)")
# ``minutes:seconds`` -- the form sinfo uses for limits shorter than one hour.
_TIMELIMIT_SHORT_RE = re.compile(r"(\d+):(\d+)")


def _parse_timelimit(timelimit: str) -> float:
    """Parse a Slurm time limit string to hours.

    Supports ``days-hours:minutes:seconds`` (``5-00:00:00``),
    ``hours:minutes:seconds`` (``120:00:00``), ``minutes:seconds``
    (``30:00``) and the special values ``infinite`` / ``n/a``.

    Args:
        timelimit: Raw time limit string from sinfo.

    Returns:
        Time limit in hours, or ``float('inf')`` for unlimited or
        unparseable values.
    """
    text = timelimit.strip()
    if text.lower() in ("infinite", "n/a"):
        return float("inf")

    long_form = _TIMELIMIT_RE.match(text)
    if long_form:
        day_str, hour_str, min_str, sec_str = long_form.groups()
        day = int(day_str) if day_str else 0
        return 24.0 * day + int(hour_str) + int(min_str) / 60.0 + int(sec_str) / 3600.0

    # Without this branch a "30:00" partition limit was reported as unlimited.
    short_form = _TIMELIMIT_SHORT_RE.match(text)
    if short_form:
        min_str, sec_str = short_form.groups()
        return int(min_str) / 60.0 + int(sec_str) / 3600.0

    # Unknown format: keep the permissive fallback of "no limit".
    return float("inf")
159
+
160
+
161
def sinfo_partitions(
    *,
    runner: CommandRunner | None = None,
) -> OrderedDict[str, PartitionInfo]:
    """List the partitions reported by ``sinfo`` together with their limits.

    Runs ``sinfo --noheader --format=%P|%a|%l|%D`` and turns every output
    line with at least four ``|``-separated fields into a
    :class:`PartitionInfo`. Shorter lines are ignored.

    Args:
        runner: Optional command runner for testing.

    Returns:
        Ordered dict mapping partition name to :class:`PartitionInfo`, in
        the order the lines were printed.

    Raises:
        SlurmNotFoundError: Propagated from the runner when ``sinfo`` is
            not on PATH.
        SlurmQueryError: If sinfo exits with a non-zero status.
    """
    execute = runner if runner else _default_runner
    outcome = execute(["sinfo", "--noheader", "--format=%P|%a|%l|%D"])

    if outcome.returncode != 0:
        raise SlurmQueryError(
            f"sinfo failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
        )

    table: OrderedDict[str, PartitionInfo] = OrderedDict()
    for row in outcome.stdout.strip().splitlines():
        columns = row.strip().split("|")
        if len(columns) < 4:
            continue
        raw_name, avail, timelimit, node_field = columns[:4]
        # sinfo marks the default partition with a trailing '*'.
        name = raw_name.rstrip("*")
        try:
            nodes_total = int(node_field)
        except ValueError:
            # A non-numeric node count is recorded as 0 rather than rejected.
            nodes_total = 0
        table[name] = PartitionInfo(
            name=name,
            avail=avail,
            timelimit=timelimit,
            timelimit_hours=_parse_timelimit(timelimit),
            nodes_total=nodes_total,
        )
    return table
216
+
217
+
218
+ # ---------------------------------------------------------------------------
219
+ # squeue
220
+ # ---------------------------------------------------------------------------
221
+
222
+
223
def squeue_status(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> str | None:
    """Ask ``squeue`` for the state of a job that may still be queued.

    Args:
        job_id: Slurm job ID.
        runner: Optional command runner for testing.

    Returns:
        The stripped squeue output (e.g. ``"RUNNING"``) while the job is in
        the queue, or ``None`` when squeue prints nothing or reports an
        invalid job id.

    Raises:
        SlurmNotFoundError: Propagated from the runner when ``squeue`` is
            not on PATH.
        SlurmQueryError: If squeue exits non-zero for any other reason.
    """
    execute = runner if runner else _default_runner
    outcome = execute(["squeue", "--job", job_id, "--noheader", "--format=%T"])

    if outcome.returncode == 0:
        # Empty output means the job is no longer listed.
        return outcome.stdout.strip() or None

    # squeue returns non-zero when the job is not found on some clusters
    if "Invalid job id" in outcome.stderr:
        return None

    raise SlurmQueryError(
        f"squeue failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
    )
268
+
269
+
270
+ # ---------------------------------------------------------------------------
271
+ # sacct
272
+ # ---------------------------------------------------------------------------
273
+
274
+
275
def sacct_status(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> dict[str, str] | None:
    """Look up a job's recorded state and exit code through ``sacct``.

    The command is run with ``--parsable2 --noheader`` and an explicit
    ``JobID,State,ExitCode`` format so each line splits cleanly on ``|``.

    Args:
        job_id: Slurm job ID.
        runner: Optional command runner for testing.

    Returns:
        ``{"state": ..., "exit_code": ...}`` taken from the first line whose
        JobID equals *job_id* exactly, or ``None`` if no such line exists.

    Raises:
        SlurmNotFoundError: Propagated from the runner when ``sacct`` is
            not on PATH.
        SlurmQueryError: If sacct exits with a non-zero status.
    """
    execute = runner if runner else _default_runner
    outcome = execute(
        [
            "sacct",
            "--jobs",
            job_id,
            "--noheader",
            "--parsable2",
            "--format=JobID,State,ExitCode",
        ]
    )

    if outcome.returncode != 0:
        raise SlurmQueryError(
            f"sacct failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
        )

    # sacct prints one line per step; only the main job entry is wanted, so
    # step rows such as "12345.batch" are skipped by the exact-ID comparison.
    for record in outcome.stdout.strip().splitlines():
        fields = record.split("|")
        if len(fields) >= 3 and fields[0].strip() == job_id:
            return {"state": fields[1].strip(), "exit_code": fields[2].strip()}

    return None
329
+
330
+
331
+ # ---------------------------------------------------------------------------
332
+ # Combined query
333
+ # ---------------------------------------------------------------------------
334
+
335
+
336
def query_job_status(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> JobStatus:
    """Resolve the current runops state of a Slurm job.

    ``squeue`` is consulted first because it covers jobs that are still
    queued or running. Only when it has no entry is ``sacct`` asked for the
    historical record.

    Args:
        job_id: Slurm job ID.
        runner: Optional command runner for testing.

    Returns:
        A :class:`JobStatus`. Results that come from squeue carry only the
        mapped and raw state; results from sacct also carry the failure
        reason and exit code.

    Raises:
        SlurmNotFoundError: If Slurm commands are not on PATH.
        SlurmQueryError: If neither squeue nor sacct can find the job, or
            if the returned state is unrecognised.
    """
    # 1. Active jobs: squeue
    active_state = squeue_status(job_id, runner=runner)
    if active_state is not None:
        token = active_state.strip().split()[0].rstrip("+")
        return JobStatus(run_state=map_slurm_state(active_state), slurm_state=token)

    # 2. Historical jobs: sacct
    record = sacct_status(job_id, runner=runner)
    if record is None:
        raise SlurmQueryError(
            f"Job {job_id} not found in squeue or sacct. "
            "It may have been purged from the Slurm database."
        )

    # Reduce e.g. "CANCELLED by 1000" / "CANCELLED+" to the bare state name.
    token = record["state"].strip().split()[0].rstrip("+")
    mapped = map_slurm_state(record["state"])
    return JobStatus(
        run_state=mapped,
        slurm_state=token,
        failure_reason=_FAILURE_REASON_MAP.get(token, ""),
        exit_code=record.get("exit_code", ""),
    )
runops/slurm/submit.py ADDED
@@ -0,0 +1,203 @@
1
+ """Slurm sbatch submission.
2
+
3
+ Provides functions to submit job scripts via sbatch and parse the resulting
4
+ job ID. All subprocess calls go through a single ``run_command`` callable
5
+ so that tests can inject a mock without touching the real Slurm installation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ import subprocess
12
+ from collections.abc import Callable
13
+ from pathlib import Path
14
+ from typing import NamedTuple
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Thin subprocess abstraction
18
+ # ---------------------------------------------------------------------------
19
+
20
# Captures the numeric job ID from sbatch's confirmation line.
_SBATCH_JOB_RE = re.compile(r"Submitted batch job (\d+)")


class CommandResult(NamedTuple):
    """Exit status and captured text streams of one executed command."""

    returncode: int
    stdout: str
    stderr: str


#: Signature of an injectable command runner: it receives the argument list
#: and returns a :class:`CommandResult`.
CommandRunner = Callable[[list[str]], CommandResult]
33
+
34
+
35
def _default_runner(cmd: list[str], *, timeout: float = 60) -> CommandResult:
    """Run a command via ``subprocess.run``.

    This is the production implementation used when no mock is injected.
    It is shared by every Slurm command in this package, so error messages
    name the executable that was actually run.

    Args:
        cmd: Command and arguments to execute.
        timeout: Maximum number of seconds to wait for the command.

    Returns:
        A ``CommandResult`` with return code, stdout, and stderr.

    Raises:
        SlurmNotFoundError: If the command executable is not found on PATH.
        SlurmSubmitError: If the command does not finish within *timeout*
            seconds.
    """
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=False,
            timeout=timeout,
        )
    except FileNotFoundError as exc:
        raise SlurmNotFoundError(
            f"Command not found: {cmd[0]!r}. Is Slurm installed and on PATH?"
        ) from exc
    except subprocess.TimeoutExpired as exc:
        # The exception type is kept for callers that already catch it; only
        # the message changes, which used to say "sbatch" for every command.
        raise SlurmSubmitError(
            f"{cmd[0]} timed out after {timeout} seconds: {exc}"
        ) from exc
    return CommandResult(
        returncode=proc.returncode,
        stdout=proc.stdout,
        stderr=proc.stderr,
    )
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # Exceptions
72
+ # ---------------------------------------------------------------------------
73
+
74
+
75
class SlurmNotFoundError(RuntimeError):
    """Signal that a Slurm executable could not be found on PATH."""
77
+
78
+
79
class SlurmSubmitError(RuntimeError):
    """Signal that sbatch failed or printed output that cannot be parsed."""
81
+
82
+
83
class SlurmCancelError(RuntimeError):
    """Signal that scancel exited with an error."""
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Public API
89
+ # ---------------------------------------------------------------------------
90
+
91
+
92
def parse_job_id(sbatch_stdout: str) -> str:
    """Pull the job ID out of the text printed by sbatch.

    The text is searched for ``Submitted batch job <digits>``; surrounding
    output is ignored.

    Args:
        sbatch_stdout: The captured stdout from an sbatch invocation.

    Returns:
        The numeric job ID as a string.

    Raises:
        SlurmSubmitError: If the confirmation line is not present.
    """
    found = _SBATCH_JOB_RE.search(sbatch_stdout)
    if found:
        return found.group(1)
    raise SlurmSubmitError(
        f"Could not parse job ID from sbatch output: {sbatch_stdout!r}"
    )
112
+
113
+
114
def sbatch_submit(
    job_script: Path,
    working_dir: Path,
    *,
    extra_args: list[str] | None = None,
    afterok: str | None = None,
    runner: CommandRunner | None = None,
) -> str:
    """Hand a job script to ``sbatch`` and return the new job's ID.

    The command line is assembled as
    ``sbatch --chdir=<working_dir> [--dependency=afterok:<id>] [extra_args...] <job_script>``.

    Args:
        job_script: Path to the job script file (e.g. ``submit/job.sh``).
        working_dir: Directory passed to sbatch via ``--chdir``.
        extra_args: Additional sbatch arguments (e.g.
            ``["--partition=gr10451a"]``), placed before the script path.
        afterok: If set, add ``--dependency=afterok:<job_id>`` so this job
            starts only after the specified job completes successfully.
        runner: Optional callable that executes a command list and returns
            a ``CommandResult``. Defaults to the real subprocess runner.
            Inject a mock here for testing.

    Returns:
        The Slurm job ID as a string.

    Raises:
        FileNotFoundError: If *job_script* does not exist.
        SlurmNotFoundError: If ``sbatch`` is not on PATH.
        SlurmSubmitError: If sbatch returns a non-zero exit code or its
            output cannot be parsed.
    """
    if not job_script.exists():
        raise FileNotFoundError(f"Job script not found: {job_script}")

    execute = runner if runner else _default_runner
    dependency = [f"--dependency=afterok:{afterok}"] if afterok else []
    cmd = [
        "sbatch",
        f"--chdir={working_dir}",
        *dependency,
        *(extra_args or []),
        str(job_script),
    ]
    outcome = execute(cmd)

    if outcome.returncode != 0:
        raise SlurmSubmitError(
            f"sbatch failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
        )

    return parse_job_id(outcome.stdout)
163
+
164
+
165
def scancel_job(
    job_id: str,
    *,
    runner: CommandRunner | None = None,
) -> None:
    """Ask Slurm to cancel a job by running ``scancel <job_id>``.

    Args:
        job_id: The Slurm job ID to cancel.
        runner: Optional callable that executes a command list and returns
            a ``CommandResult``. Defaults to the real subprocess runner.

    Raises:
        SlurmNotFoundError: If ``scancel`` is not on PATH.
        SlurmCancelError: If scancel returns a non-zero exit code.
    """
    execute = runner if runner else _default_runner
    outcome = execute(["scancel", job_id])
    if outcome.returncode == 0:
        return
    raise SlurmCancelError(
        f"scancel failed (exit {outcome.returncode}):\n{outcome.stderr.strip()}"
    )
187
+
188
+
189
+ # Keep the old name as an alias so the existing test import still works,
190
+ # but point it at the new implementation.
191
def sbatch(job_script: Path) -> str:
    """Submit a job script via sbatch (legacy wrapper).

    Deprecated: prefer ``sbatch_submit``, which takes an explicit working
    directory and an injectable runner. This wrapper uses the script's own
    parent directory as the working directory.

    Args:
        job_script: Path to the job.sh file.

    Returns:
        The Slurm job_id as a string.
    """
    working_dir = job_script.parent
    return sbatch_submit(job_script, working_dir)
@@ -0,0 +1,29 @@
1
+ """Template loading utilities for runops."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import jinja2
8
+
9
# Absolute path of the directory that contains this module; every template
# path handled here is resolved relative to it.
_TEMPLATES_DIR = Path(__file__).resolve().parent


def get_jinja_env() -> jinja2.Environment:
    """Build a Jinja2 environment rooted at this module's directory.

    Trailing newlines are kept, and referencing an undefined variable in a
    template raises instead of rendering as empty text.
    """
    loader = jinja2.FileSystemLoader(str(_TEMPLATES_DIR))
    return jinja2.Environment(
        loader=loader,
        keep_trailing_newline=True,
        undefined=jinja2.StrictUndefined,
    )
19
+
20
+
21
def load_static(relative_path: str) -> str:
    """Return the UTF-8 text of a static (non-Jinja2) template file.

    *relative_path* is resolved against the templates directory.
    """
    target = _TEMPLATES_DIR / relative_path
    return target.read_text(encoding="utf-8")
24
+
25
+
26
def render(template_path: str, **kwargs: object) -> str:
    """Render the Jinja2 template at *template_path* with *kwargs* as variables."""
    template = get_jinja_env().get_template(template_path)
    return template.render(**kwargs)
@@ -0,0 +1,50 @@
1
+ ### BEACH (BEM + Accumulated CHarge Simulator)
2
+
3
+ #### 概要
4
+ BEACH は境界要素法 (BEM) ベースの表面帯電シミュレーション。
5
+ MPI + OpenMP ハイブリッド並列で実行し、宇宙機表面の帯電現象を計算する。
6
+
7
+ #### 入力ファイル
8
+ - **`input/beach.toml`** — メイン設定ファイル (TOML 形式)
9
+ - `[sim]`: `dt`, `max_step`, `batch_count`, `field_solver`
10
+ - `[mesh]`: `obj_path` (OBJ メッシュファイルパス)
11
+ - `[environment]`: プラズマ環境パラメータ (密度, 温度, etc.)
12
+ - `[output]`: `dir` (出力ディレクトリ)
13
+
14
+ #### 出力ファイル (`work/latest/` 以下)
15
+ - `summary.txt` — 計算結果サマリー (完了時に生成)
16
+ - `charges.csv` — 表面電荷分布
17
+ - `charge_history.csv` — 電荷時間履歴
18
+ - `potential_history.csv` — 電位時間履歴
19
+ - `mesh_triangles.csv`, `mesh_sources.csv` — メッシュ情報
20
+ - `performance_profile.csv` — 性能プロファイル
21
+
22
+ #### 完了判定
23
+ - `work/latest/summary.txt` が存在 → completed
24
+ - stderr に "error", "fatal", "killed" → failed
25
+ - `charges.csv` のみ存在 → running (途中)
26
+
27
+ #### パラメータサーベイでよく変えるパラメータ
28
+ - `sim.dt`, `sim.max_step`, `sim.batch_count`
29
+ - `environment.electron_density`, `environment.electron_temperature`
30
+ - `environment.ion_density`, `environment.ion_temperature`
31
+ - `mesh.obj_path` (異なるジオメトリの比較)
32
+
33
+ #### ドキュメント・参考
34
+ - BEACH ソースリポジトリの README / docs/
35
+ - OBJ メッシュファイルは Blender 等で作成
36
+ - パラメータの dot 記法例: `sim.dt=1.0e-6`, `environment.electron_density=1.0e12`
37
+
38
+ #### 実行コマンド
39
+ ```
40
+ srun beach input/beach.toml
41
+ ```
42
+
43
+ `beach.toml` の `output.dir` は `work/latest` に自動設定される。
44
+
45
+ #### 環境変数 (OpenMP)
46
+ ```
47
+ OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
48
+ OMP_PROC_BIND=spread
49
+ OMP_PLACES=cores
50
+ ```
@@ -0,0 +1,19 @@
1
+ # BEACH configuration
2
+ # See BEACH documentation for full parameter reference
3
+
4
+ [sim]
5
+ dt = 1.0e-6
6
+ max_step = 1000
7
+ batch_count = 100
8
+
9
+ [mesh]
10
+ # obj_path = "mesh.obj"
11
+
12
+ [environment]
13
+ electron_density = 1.0e12
14
+ electron_temperature = 1.0
15
+ ion_density = 1.0e12
16
+ ion_temperature = 1.0
17
+
18
+ [output]
19
+ # dir is set automatically by runops
@@ -0,0 +1,16 @@
1
+ [case]
2
+ name = ""
3
+ simulator = "beach"
4
+ launcher = "default"
5
+ description = ""
6
+
7
+ [params]
8
+ # "sim.dt" = 1.0e-6
9
+ # "sim.max_step" = 1000
10
+ # "environment.electron_density" = 1.0e12
11
+
12
+ [job]
13
+ partition = ""
14
+ nodes = 1
15
+ ntasks = 1
16
+ walltime = "01:00:00"