runops 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runops/__init__.py +5 -0
- runops/_data/README.md +476 -0
- runops/adapters/__init__.py +29 -0
- runops/adapters/_utils/__init__.py +36 -0
- runops/adapters/_utils/toml_utils.py +81 -0
- runops/adapters/base.py +335 -0
- runops/adapters/contrib/__init__.py +5 -0
- runops/adapters/contrib/beach.py +837 -0
- runops/adapters/contrib/emses.py +1010 -0
- runops/adapters/generic.py +439 -0
- runops/adapters/registry.py +244 -0
- runops/cli/__init__.py +3 -0
- runops/cli/analyze.py +222 -0
- runops/cli/clone.py +104 -0
- runops/cli/config.py +217 -0
- runops/cli/context.py +56 -0
- runops/cli/create.py +263 -0
- runops/cli/dashboard.py +179 -0
- runops/cli/extend.py +204 -0
- runops/cli/history.py +105 -0
- runops/cli/init.py +1432 -0
- runops/cli/jobs.py +145 -0
- runops/cli/knowledge.py +1017 -0
- runops/cli/list.py +102 -0
- runops/cli/log.py +163 -0
- runops/cli/main.py +96 -0
- runops/cli/manage.py +231 -0
- runops/cli/new.py +343 -0
- runops/cli/notes.py +257 -0
- runops/cli/run_lookup.py +148 -0
- runops/cli/setup.py +174 -0
- runops/cli/status.py +187 -0
- runops/cli/submit.py +297 -0
- runops/cli/update.py +113 -0
- runops/cli/update_harness.py +245 -0
- runops/cli/update_refs.py +370 -0
- runops/core/__init__.py +3 -0
- runops/core/actions.py +1186 -0
- runops/core/analysis.py +1090 -0
- runops/core/campaign.py +156 -0
- runops/core/case.py +307 -0
- runops/core/context.py +426 -0
- runops/core/discovery.py +192 -0
- runops/core/environment.py +266 -0
- runops/core/exceptions.py +93 -0
- runops/core/knowledge.py +595 -0
- runops/core/knowledge_source.py +1204 -0
- runops/core/manifest.py +219 -0
- runops/core/project.py +171 -0
- runops/core/provenance.py +147 -0
- runops/core/retry.py +193 -0
- runops/core/run.py +170 -0
- runops/core/run_creation.py +456 -0
- runops/core/site.py +337 -0
- runops/core/state.py +197 -0
- runops/core/survey.py +380 -0
- runops/core/validation.py +40 -0
- runops/harness/__init__.py +27 -0
- runops/harness/builder.py +327 -0
- runops/harness/claude.py +189 -0
- runops/jobgen/__init__.py +3 -0
- runops/jobgen/generator.py +295 -0
- runops/launchers/__init__.py +17 -0
- runops/launchers/base.py +313 -0
- runops/launchers/mpiexec.py +131 -0
- runops/launchers/mpirun.py +132 -0
- runops/launchers/srun.py +126 -0
- runops/sites/__init__.py +0 -0
- runops/sites/camphor.md +98 -0
- runops/sites/camphor.toml +27 -0
- runops/slurm/__init__.py +3 -0
- runops/slurm/query.py +384 -0
- runops/slurm/submit.py +203 -0
- runops/templates/__init__.py +29 -0
- runops/templates/adapters/beach/agent_guide.md +50 -0
- runops/templates/adapters/beach/beach.toml +19 -0
- runops/templates/adapters/beach/case.toml +16 -0
- runops/templates/adapters/beach/summarize.py +272 -0
- runops/templates/adapters/emses/agent_guide.md +39 -0
- runops/templates/adapters/emses/case.toml +18 -0
- runops/templates/adapters/emses/plasma.toml +118 -0
- runops/templates/adapters/emses/summarize.py +413 -0
- runops/templates/adapters/generic/case.toml.j2 +13 -0
- runops/templates/adapters/generic/summarize.py +21 -0
- runops/templates/agent.md +156 -0
- runops/templates/rules/cookbook.md +22 -0
- runops/templates/scaffold/campaign.toml.j2 +10 -0
- runops/templates/scaffold/cases_claude.md +22 -0
- runops/templates/scaffold/facts.toml +2 -0
- runops/templates/scaffold/gitignore.txt +30 -0
- runops/templates/scaffold/notes/README.md +69 -0
- runops/templates/scaffold/rules/plan-before-act.md +17 -0
- runops/templates/scaffold/rules/runops-workflow.md +84 -0
- runops/templates/scaffold/rules/upstream-feedback.md +85 -0
- runops/templates/scaffold/runs_claude.md +24 -0
- runops/templates/scaffold/vscode_settings.json +9 -0
- runops/templates/skills/analyze/SKILL.md +40 -0
- runops/templates/skills/check-status/SKILL.md +29 -0
- runops/templates/skills/cleanup/SKILL.md +43 -0
- runops/templates/skills/create-run/SKILL.md +135 -0
- runops/templates/skills/debug-failed/SKILL.md +38 -0
- runops/templates/skills/learn/SKILL.md +54 -0
- runops/templates/skills/new-case/SKILL.md +108 -0
- runops/templates/skills/note/SKILL.md +107 -0
- runops/templates/skills/run-all/SKILL.md +47 -0
- runops/templates/skills/runops-reference/SKILL.md +203 -0
- runops/templates/skills/setup-campaign/SKILL.md +111 -0
- runops/templates/skills/setup-env/SKILL.md +32 -0
- runops/templates/skills/survey-design/SKILL.md +73 -0
- runops/templates/survey.toml.j2 +22 -0
- runops-0.2.0.dist-info/METADATA +491 -0
- runops-0.2.0.dist-info/RECORD +115 -0
- runops-0.2.0.dist-info/WHEEL +4 -0
- runops-0.2.0.dist-info/entry_points.txt +2 -0
- runops-0.2.0.dist-info/licenses/LICENSE +201 -0
runops/cli/create.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""CLI commands for run and survey creation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Annotated, Any, Optional
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from runops.core.actions import ActionStatus, execute_action
|
|
11
|
+
from runops.core.case import JobData, load_case, resolve_case
|
|
12
|
+
from runops.core.exceptions import SimctlError
|
|
13
|
+
from runops.core.project import find_project_root, load_project
|
|
14
|
+
from runops.core.survey import expand_survey, generate_display_name, load_survey
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _echo_warnings(warnings: list[str], *, context: str = "") -> None:
    """Write each run-creation warning to stderr, one line per warning.

    When ``context`` is non-empty it is placed, followed by ``": "``,
    between the ``Warning:`` label and the warning text.
    """
    label = ""
    if context:
        label = f"{context}: "
    for message in warnings:
        typer.echo(f" Warning: {label}{message}", err=True)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def create(
    case_name: Annotated[
        str,
        typer.Argument(
            help="Case name to create a run from.",
        ),
    ],
    dest: Annotated[
        Optional[Path],
        typer.Option("--dest", "-d", help="Destination directory (defaults to cwd)."),
    ] = None,
) -> None:
    """Create a run in the current directory.

    Examples:
        cd runs/experiment && runops runs create flat_surface
    """
    # Fall back to the working directory when --dest was not given.
    destination = dest if dest else Path.cwd()
    destination = destination.resolve()

    # The removed ``create survey`` form is recognised by the literal case
    # name plus a survey.toml in the destination; it is rejected with a
    # pointer to the replacement command.
    removed_survey_form = (
        case_name == "survey" and (destination / "survey.toml").is_file()
    )
    if not removed_survey_form:
        _create_single(case_name, destination)
        return

    typer.echo(
        "Error: 'runops runs create survey' has been removed. "
        "Use 'runops runs sweep [DIR]' instead.",
        err=True,
    )
    raise typer.Exit(code=1)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _create_single(case_name: str, target_dir: Path) -> None:
    """Run the ``create_run`` action for one case and report the outcome.

    A ``SimctlError`` or a non-success action status is reported on stderr
    and ends the command with exit code 1.  On success the warnings, the
    run id and the run path are echoed.
    """
    try:
        outcome = execute_action(
            "create_run",
            project_root=find_project_root(target_dir),
            case_name=case_name,
            dest_dir=target_dir,
        )
    except SimctlError as exc:
        typer.echo(f"Error creating run: {exc}", err=True)
        raise typer.Exit(code=1) from exc

    if outcome.status is not ActionStatus.SUCCESS:
        typer.echo(f"Error creating run: {outcome.message}", err=True)
        raise typer.Exit(code=1)

    payload = outcome.data
    _echo_warnings(list(payload.get("warnings", [])))
    run_id = payload.get("run_id", "???")
    run_dir = payload.get("run_dir", target_dir)
    typer.echo(f"Created run: {run_id}")
    typer.echo(f" Path: {run_dir}")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _create_survey(survey_dir: Path) -> None:
    """Run the ``create_survey`` action and list the runs it produced.

    Failures (``SimctlError`` or a non-success status) are reported on
    stderr with exit code 1.  An empty result exits with code 0 after a
    short notice.
    """
    try:
        outcome = execute_action(
            "create_survey",
            project_root=find_project_root(survey_dir),
            survey_dir=survey_dir,
        )
    except SimctlError as exc:
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(code=1) from exc

    if outcome.status is not ActionStatus.SUCCESS:
        typer.echo(f"Error: {outcome.message}", err=True)
        raise typer.Exit(code=1)

    entries = list(outcome.data.get("runs", []))
    if not entries:
        typer.echo("No parameter combinations to expand.")
        raise typer.Exit(code=0)

    # All warnings are emitted first, labelled with the run's display name,
    # so they are not interleaved with the summary listing below.
    for entry in entries:
        label = str(entry.get("display_name", ""))
        _echo_warnings(list(entry.get("warnings", [])), context=label)

    typer.echo(f"Created {len(entries)} runs in {survey_dir}")
    for entry in entries:
        label = str(entry.get("display_name", ""))
        suffix = f" ({label})" if label else ""
        typer.echo(f" {entry.get('run_id', '???')}{suffix}")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def sweep(
    survey_dir: Annotated[
        Optional[Path],
        typer.Argument(help="Directory containing survey.toml (defaults to cwd)."),
    ] = None,
    dry_run: Annotated[
        bool,
        typer.Option(
            "--dry-run",
            "-n",
            help=(
                "Print the runs that would be generated (count, parameter "
                "combinations, estimated total resource cost) without "
                "writing any files."
            ),
        ),
    ] = False,
) -> None:
    """Generate all runs from a survey.toml parameter sweep.

    With ``--dry-run`` the survey is parsed and expanded but no run
    directories are created — useful to verify the parameter combinations
    and total resource cost before committing files / queue time.
    """
    # Resolve the survey directory once, then hand it to whichever mode
    # was requested.
    target = (survey_dir if survey_dir else Path.cwd()).resolve()
    handler = _sweep_dry_run if dry_run else _create_survey
    handler(target)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _sweep_dry_run(survey_dir: Path) -> None:
    """Report what ``sweep`` would generate, without creating anything.

    Loads the survey, locates the project and the base case, then echoes
    the run count, the survey settings, the job line, the resource
    estimate and one line per parameter combination.  Any ``SimctlError``
    is reported on stderr and ends the command with exit code 1.
    """
    # Loading the survey and locating the project report errors the same way.
    try:
        survey_data = load_survey(survey_dir)
        project_root = find_project_root(survey_dir)
    except SimctlError as exc:
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(code=1) from exc

    # The base case supplies the fallback job settings.
    try:
        project = load_project(project_root)
        case_data = load_case(resolve_case(survey_data.base_case, project.root_dir))
    except SimctlError as exc:
        typer.echo(f"Error resolving base case: {exc}", err=True)
        raise typer.Exit(code=1) from exc

    # The survey's job is used only when it names a partition; otherwise
    # the case's job is shown.
    # NOTE(review): a survey [job] block without a partition is therefore
    # ignored here — confirm that matches how real run creation resolves it.
    if survey_data.job.partition:
        effective_job = survey_data.job
    else:
        effective_job = case_data.job

    combinations = expand_survey(survey_data.axes, survey_data.linked)
    if not combinations:
        typer.echo("No parameter combinations to expand.")
        raise typer.Exit(code=0)

    total = len(combinations)
    template = survey_data.naming_template

    typer.echo(f"[dry-run] {total} runs would be created in {survey_dir}")
    typer.echo(f" base case : {survey_data.base_case}")
    typer.echo(f" simulator : {survey_data.simulator}")
    typer.echo(f" launcher : {survey_data.launcher}")
    if template:
        typer.echo(f" display : {template}")
    typer.echo(_format_job_summary(effective_job))
    typer.echo(_format_resource_estimate(effective_job, total))

    # One line per combination; the display name column appears only when
    # a name was generated.
    typer.echo("")
    typer.echo("Planned runs:")
    for combo in combinations:
        label = generate_display_name(survey_data.naming_template, combo)
        rendered = _format_combo(combo)
        line = f" {label:<24} {rendered}" if label else f" {rendered}"
        typer.echo(line)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _format_combo(combo: dict[str, Any]) -> str:
|
|
197
|
+
"""Format a combo dict as ``key1=value1, key2=value2``."""
|
|
198
|
+
parts = []
|
|
199
|
+
for key in sorted(combo.keys()):
|
|
200
|
+
value = combo[key]
|
|
201
|
+
# Truncate long lists for readability.
|
|
202
|
+
if isinstance(value, list) and len(value) > 4:
|
|
203
|
+
shown = f"[{value[0]}, ..., {value[-1]} ({len(value)} items)]"
|
|
204
|
+
else:
|
|
205
|
+
shown = repr(value)
|
|
206
|
+
parts.append(f"{key}={shown}")
|
|
207
|
+
return ", ".join(parts)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _format_job_summary(job: JobData) -> str:
|
|
211
|
+
"""Format a JobData line for the dry-run output."""
|
|
212
|
+
if job.processes > 1 or job.threads > 1 or job.cores > 1:
|
|
213
|
+
# rsc-style site
|
|
214
|
+
return (
|
|
215
|
+
f" job : partition={job.partition or '(default)'} "
|
|
216
|
+
f"p={job.processes} t={job.threads} c={job.cores} "
|
|
217
|
+
f"walltime={job.walltime}"
|
|
218
|
+
)
|
|
219
|
+
return (
|
|
220
|
+
f" job : partition={job.partition or '(default)'} "
|
|
221
|
+
f"nodes={job.nodes} ntasks={job.ntasks} walltime={job.walltime}"
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _format_resource_estimate(job: JobData, n_runs: int) -> str:
    """Build the ``estimate`` line: total core-hours for the whole sweep.

    The per-run core count is the larger of ``job.processes`` and
    ``job.ntasks``.  When that count is 1 or less, or the walltime cannot
    be turned into a positive number of hours, a "cannot estimate" line is
    returned instead.
    """
    # Either field may carry the parallel width, depending on the site mode.
    cores_per_run = max(job.processes, job.ntasks)
    walltime_hours = _walltime_to_hours(job.walltime)

    estimable = cores_per_run > 1 and walltime_hours > 0
    if not estimable:
        return " estimate : (cannot estimate — incomplete job spec)"

    total_core_hours = cores_per_run * walltime_hours * n_runs
    head = f" estimate : {n_runs} runs x {cores_per_run} cores x "
    tail = f"{walltime_hours:.1f} h walltime ~= {total_core_hours:,.0f} core-hours"
    return head + tail
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _walltime_to_hours(walltime: str) -> float:
|
|
241
|
+
"""Parse a walltime string to hours. Returns 0.0 on failure."""
|
|
242
|
+
if not walltime:
|
|
243
|
+
return 0.0
|
|
244
|
+
# Strip optional ``D-`` day prefix.
|
|
245
|
+
days = 0
|
|
246
|
+
rest = walltime
|
|
247
|
+
if "-" in walltime:
|
|
248
|
+
head, _, rest = walltime.partition("-")
|
|
249
|
+
try:
|
|
250
|
+
days = int(head)
|
|
251
|
+
except ValueError:
|
|
252
|
+
return 0.0
|
|
253
|
+
parts = rest.split(":")
|
|
254
|
+
try:
|
|
255
|
+
if len(parts) == 3:
|
|
256
|
+
h, m, s = (int(p) for p in parts)
|
|
257
|
+
elif len(parts) == 2:
|
|
258
|
+
h, m, s = 0, int(parts[0]), int(parts[1])
|
|
259
|
+
else:
|
|
260
|
+
return 0.0
|
|
261
|
+
except ValueError:
|
|
262
|
+
return 0.0
|
|
263
|
+
return days * 24 + h + m / 60 + s / 3600
|
runops/cli/dashboard.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""CLI command for the multi-run dashboard view."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated, Optional
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
|
|
11
|
+
from runops.cli.run_lookup import resolve_run_targets
|
|
12
|
+
from runops.core.exceptions import SimctlError
|
|
13
|
+
from runops.core.manifest import read_manifest
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def dashboard(
    targets: Annotated[
        Optional[list[str]],
        typer.Argument(
            help=(
                "Run identifiers, run directories, or directories containing "
                "runs (recursive). Defaults to cwd / project runs/."
            ),
        ),
    ] = None,
    watch: Annotated[
        Optional[float],
        typer.Option(
            "--watch",
            "-w",
            help=(
                "Refresh the dashboard every N seconds (clears screen between "
                "refreshes). Press Ctrl-C to stop."
            ),
        ),
    ] = None,
    all_states: Annotated[
        bool,
        typer.Option(
            "--all",
            "-a",
            help="Show all runs, not just the ones in submitted/running state.",
        ),
    ] = False,
) -> None:
    """Multi-run progress dashboard.

    Aggregates per-run progress (state, step, %, last diagnostic) into a
    single table. Useful while a survey of dozens of runs is in flight:
    instead of opening ``runops runs log`` for each run individually,
    one ``runops runs dashboard`` call shows the whole survey.

    Examples:
        runops runs dashboard runs/series_A # one survey
        runops runs dashboard -w 30 runs/series_A # auto-refresh every 30 s
        runops runs dashboard --all runs/ # whole project, including
        # completed/failed runs
    """
    # The target list is resolved once, relative to the working directory.
    run_dirs = resolve_run_targets(targets, search_dir=Path.cwd().resolve())

    # Only a strictly positive interval enables watch mode; anything else
    # renders the table a single time.
    watching = watch is not None and watch > 0
    if not watching:
        _print_dashboard(run_dirs, all_states=all_states)
        return
    _watch_loop(run_dirs, all_states=all_states, interval=watch)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _print_dashboard(run_dirs: list[Path], *, all_states: bool) -> None:
    """Echo one table row per run, followed by an active/total count.

    Runs whose manifest cannot be read are skipped.  Unless ``all_states``
    is set, only runs whose status is ``submitted`` or ``running`` are
    listed.
    """
    if not run_dirs:
        typer.echo("No runs found.")
        return

    active_states = {"submitted", "running"}
    table: list[tuple[str, str, str, str, str, str]] = []

    for run_dir in run_dirs:
        try:
            manifest = read_manifest(run_dir)
        except SimctlError:
            # Directories without a readable manifest are left out.
            continue

        state = str(manifest.run.get("status", "unknown"))
        if not (all_states or state in active_states):
            continue

        ident = str(manifest.run.get("id", run_dir.name))
        label = str(manifest.run.get("display_name", ""))
        step_cell, pct_cell = _progress_for_run(run_dir, manifest.simulator)
        # Latest recorded Slurm state, or a dash when none is stored.
        slurm_cell = str(manifest.run.get("last_slurm_state", "")) or "-"

        table.append((ident, label, state, step_cell, pct_cell, slurm_cell))

    if not table:
        typer.echo("No active runs found.")
        return

    # Order by run id so repeated renders list runs in the same order.
    table = sorted(table, key=lambda entry: entry[0])

    headers = ("RUN_ID", "NAME", "STATE", "STEP", "%", "SLURM")
    # Each column is as wide as its longest cell, header included.
    widths = [max(len(cell) for cell in column) for column in zip(headers, *table)]
    layout = " ".join(f"{{:<{width}}}" for width in widths)

    typer.echo(layout.format(*headers))
    typer.echo(layout.format(*["-" * width for width in widths]))
    for entry in table:
        typer.echo(layout.format(*entry))

    n_active = len([entry for entry in table if entry[2] in active_states])
    typer.echo(f"\n{n_active} active, {len(table)} total")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _progress_for_run(
|
|
121
|
+
run_dir: Path,
|
|
122
|
+
simulator: dict[str, str],
|
|
123
|
+
) -> tuple[str, str]:
|
|
124
|
+
"""Best-effort progress lookup for a single run.
|
|
125
|
+
|
|
126
|
+
Returns ``(step_str, pct_str)`` where each part is a short string
|
|
127
|
+
suitable for table cells. Returns ``("-", "-")`` when the adapter
|
|
128
|
+
has no progress to report.
|
|
129
|
+
"""
|
|
130
|
+
adapter_name = simulator.get("adapter", "") or simulator.get("name", "")
|
|
131
|
+
if not adapter_name:
|
|
132
|
+
return "-", "-"
|
|
133
|
+
|
|
134
|
+
try:
|
|
135
|
+
import runops.adapters # noqa: F401 (registers adapters)
|
|
136
|
+
from runops.adapters.registry import get as get_adapter
|
|
137
|
+
|
|
138
|
+
adapter_cls = get_adapter(adapter_name)
|
|
139
|
+
adapter = adapter_cls()
|
|
140
|
+
summary = adapter.summarize(run_dir)
|
|
141
|
+
except Exception:
|
|
142
|
+
return "-", "-"
|
|
143
|
+
|
|
144
|
+
last_step = summary.get("last_step")
|
|
145
|
+
nstep = summary.get("nstep")
|
|
146
|
+
if last_step is None or not nstep:
|
|
147
|
+
return "-", "-"
|
|
148
|
+
|
|
149
|
+
pct = float(last_step) / float(nstep) * 100
|
|
150
|
+
return f"{int(last_step):d}/{int(nstep):d}", f"{pct:5.1f}%"
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _watch_loop(
    run_dirs: list[Path],
    *,
    all_states: bool,
    interval: float,
) -> None:
    """Redraw the dashboard every ``interval`` seconds until Ctrl-C.

    Each pass clears the terminal, prints a header carrying the current
    local time and renders the table again for the same ``run_dirs``, so
    status changes recorded in the manifests show up on the next pass.
    Ctrl-C prints ``Stopped.`` and exits with code 0.
    """
    from datetime import datetime

    # ANSI: erase the whole screen, then move the cursor to the top-left.
    clear_and_home = "\x1b[2J\x1b[H"

    try:
        while True:
            typer.echo(clear_and_home, nl=False)
            stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
            typer.echo(f"runops runs dashboard (watch every {interval:g}s) — {stamp}")
            typer.echo("")
            _print_dashboard(run_dirs, all_states=all_states)
            time.sleep(interval)
    except KeyboardInterrupt:
        typer.echo("\nStopped.")
        raise typer.Exit(code=0) from None
|
runops/cli/extend.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""CLI command for extending/continuing a simulation from a snapshot."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shutil
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated, Any, Optional
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
|
|
11
|
+
from runops.cli.run_lookup import resolve_run_or_cwd
|
|
12
|
+
from runops.core.discovery import collect_existing_run_ids
|
|
13
|
+
from runops.core.exceptions import SimctlError
|
|
14
|
+
from runops.core.manifest import ManifestData, read_manifest, write_manifest
|
|
15
|
+
from runops.core.project import find_project_root, load_project
|
|
16
|
+
from runops.core.run import create_run
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def extend(
    run: Annotated[
        Optional[str],
        typer.Argument(help="Source run to continue from (defaults to cwd)."),
    ] = None,
    dest: Annotated[
        Optional[Path],
        typer.Option(
            "--dest", "-d", help="Destination directory (defaults to source's parent)."
        ),
    ] = None,
    nstep: Annotated[
        Optional[int],
        typer.Option("--nstep", help="Override total step count for continuation."),
    ] = None,
    submit: Annotated[
        bool,
        typer.Option("--run", help="Automatically submit the continuation run."),
    ] = False,
) -> None:
    """Create a continuation run from a completed simulation's snapshot.

    Copies input files, links snapshots, and updates restart parameters.
    The adapter handles simulator-specific continuation setup.

    Examples:
        runops runs extend # continue cwd run
        runops runs extend R0001 --nstep 200000 # continue with more steps
        runops runs extend --run # continue and submit
    """
    # Overall sequence: resolve the source run, create a fresh run directory,
    # copy input/ and submit/job.sh, let the adapter prepare the
    # continuation, write the new manifest, then optionally submit.
    # Every failure path below prints to stderr and exits with code 1.
    source_dir = resolve_run_or_cwd(run, search_dir=Path.cwd())

    # Read source manifest
    try:
        source_manifest = read_manifest(source_dir)
    except SimctlError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(code=1) from None

    # Fall back to the directory name when the manifest carries no id.
    source_id = source_manifest.run.get("id", source_dir.name)
    source_status = source_manifest.run.get("status", "")

    # Advisory only: any status outside completed/running/failed produces a
    # warning (on stdout) but the continuation still proceeds.
    if source_status not in ("completed", "running", "failed"):
        typer.echo(
            f"Warning: source run {source_id} is '{source_status}'. "
            "Continuation is typically from completed runs."
        )

    # Determine destination
    target_dir = dest or source_dir.parent
    target_dir = target_dir.resolve()
    # The destination is created before the project lookup below, so it
    # exists even if that lookup fails.
    target_dir.mkdir(parents=True, exist_ok=True)

    # Load project for adapter/launcher
    try:
        project_root = find_project_root(target_dir)
        project = load_project(project_root)
    except SimctlError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(code=1) from None

    # Get adapter
    # The manifest's "adapter" entry wins; "name" is the fallback.
    adapter_name = source_manifest.simulator.get("adapter", "")
    if not adapter_name:
        adapter_name = source_manifest.simulator.get("name", "")

    try:
        from runops.adapters.registry import get as get_adapter
        from runops.adapters.registry import load_from_config

        load_from_config(project.simulators)
        adapter_cls = get_adapter(adapter_name)
        adapter = adapter_cls()
    # NOTE(review): KeyError is a subclass of Exception, so this tuple is
    # equivalent to a plain ``except Exception``.
    except (KeyError, Exception) as e:
        typer.echo(f"Error loading adapter '{adapter_name}': {e}", err=True)
        raise typer.Exit(code=1) from None

    # Collect existing run IDs
    # Gathered from the project's runs/ directory and handed to create_run.
    runs_dir = project_root / "runs"
    existing_ids = collect_existing_run_ids(runs_dir)

    # Create new run directory
    # The parameter snapshot is copied so the new manifest gets its own dict.
    params = dict(source_manifest.params_snapshot)
    try:
        run_info = create_run(
            target_dir,
            existing_ids,
            display_name=f"extend_{source_id}",
            params=params,
        )
    except SimctlError as e:
        typer.echo(f"Error creating run: {e}", err=True)
        raise typer.Exit(code=1) from None

    new_dir = run_info.run_dir
    new_input = new_dir / "input"
    new_input.mkdir(parents=True, exist_ok=True)

    # Copy input files from source
    # Top-level files are copied with metadata; sub-directories are copied
    # recursively. Entries that are neither a file nor a directory are skipped.
    source_input = source_dir / "input"
    if source_input.is_dir():
        for item in source_input.iterdir():
            dest_item = new_input / item.name
            if item.is_file():
                shutil.copy2(item, dest_item)
            elif item.is_dir():
                shutil.copytree(item, dest_item, dirs_exist_ok=True)

    # Let adapter set up continuation (snapshot links, parameter updates)
    # The hook is optional: adapters without ``setup_continuation`` leave
    # ``continuation_info`` empty.
    # NOTE(review): a failure here exits after the new run directory has been
    # created but before its manifest is written — confirm that leftover
    # directory is acceptable.
    continuation_info: dict[str, Any] = {}
    if hasattr(adapter, "setup_continuation"):
        try:
            continuation_info = adapter.setup_continuation(
                source_dir=source_dir,
                new_dir=new_dir,
                nstep_override=nstep,
            )
        except Exception as e:
            typer.echo(f"Error in adapter continuation setup: {e}", err=True)
            raise typer.Exit(code=1) from None

    # Copy job script from source if exists
    source_submit = source_dir / "submit"
    new_submit = new_dir / "submit"
    new_submit.mkdir(parents=True, exist_ok=True)
    source_job = source_submit / "job.sh"
    if source_job.is_file():
        shutil.copy2(source_job, new_submit / "job.sh")

    # Create work directory
    (new_dir / "work").mkdir(exist_ok=True)

    # Write manifest
    # Classification, simulator, launcher and simulator_source are copied from
    # the source manifest; the new run starts in the "created" state and
    # records the source run as its parent.
    new_manifest = ManifestData(
        run={
            "id": run_info.run_id,
            "display_name": run_info.display_name,
            "status": "created",
            "created_at": run_info.created_at,
        },
        path={"run_dir": str(new_dir)},
        origin={
            "case": source_manifest.origin.get("case", ""),
            "survey": "",
            "parent_run": source_id,
        },
        classification=dict(source_manifest.classification),
        simulator=dict(source_manifest.simulator),
        launcher=dict(source_manifest.launcher),
        simulator_source=dict(source_manifest.simulator_source),
        # Only partition/nodes/ntasks/walltime are carried over from the
        # source job; the scheduler is fixed to "slurm" and the submission
        # fields are reset.
        # NOTE(review): any other keys in the source job table are not
        # copied — confirm none are needed for the continuation.
        job={
            "scheduler": "slurm",
            "job_id": "",
            "partition": source_manifest.job.get("partition", ""),
            "nodes": source_manifest.job.get("nodes", 1),
            "ntasks": source_manifest.job.get("ntasks", 1),
            "walltime": source_manifest.job.get("walltime", "01:00:00"),
            "submitted_at": "",
        },
        variation={"changed_keys": []},
        params_snapshot=params,
        files={
            "input_dir": "input",
            "submit_dir": "submit",
            "work_dir": "work",
            "analysis_dir": "analysis",
            "status_dir": "status",
        },
    )
    write_manifest(new_dir, new_manifest)

    typer.echo(f"Created continuation run: {run_info.run_id}")
    typer.echo(f" Source: {source_id}")
    typer.echo(f" Path: {new_dir}")
    # Whatever the adapter returned is echoed as extra key/value lines.
    if continuation_info:
        for key, val in continuation_info.items():
            typer.echo(f" {key}: {val}")

    # Auto-submit if requested
    if submit:
        # Imported here rather than at module level.
        from runops.cli.submit import _submit_single_run

        # A ``None`` job id is treated as a failed submission.
        job_id = _submit_single_run(new_dir)
        if job_id is None:
            typer.echo("Warning: auto-submit failed")
            raise typer.Exit(code=1)
|