runops 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. runops/__init__.py +5 -0
  2. runops/_data/README.md +476 -0
  3. runops/adapters/__init__.py +29 -0
  4. runops/adapters/_utils/__init__.py +36 -0
  5. runops/adapters/_utils/toml_utils.py +81 -0
  6. runops/adapters/base.py +335 -0
  7. runops/adapters/contrib/__init__.py +5 -0
  8. runops/adapters/contrib/beach.py +837 -0
  9. runops/adapters/contrib/emses.py +1010 -0
  10. runops/adapters/generic.py +439 -0
  11. runops/adapters/registry.py +244 -0
  12. runops/cli/__init__.py +3 -0
  13. runops/cli/analyze.py +222 -0
  14. runops/cli/clone.py +104 -0
  15. runops/cli/config.py +217 -0
  16. runops/cli/context.py +56 -0
  17. runops/cli/create.py +263 -0
  18. runops/cli/dashboard.py +179 -0
  19. runops/cli/extend.py +204 -0
  20. runops/cli/history.py +105 -0
  21. runops/cli/init.py +1432 -0
  22. runops/cli/jobs.py +145 -0
  23. runops/cli/knowledge.py +1017 -0
  24. runops/cli/list.py +102 -0
  25. runops/cli/log.py +163 -0
  26. runops/cli/main.py +96 -0
  27. runops/cli/manage.py +231 -0
  28. runops/cli/new.py +343 -0
  29. runops/cli/notes.py +257 -0
  30. runops/cli/run_lookup.py +148 -0
  31. runops/cli/setup.py +174 -0
  32. runops/cli/status.py +187 -0
  33. runops/cli/submit.py +297 -0
  34. runops/cli/update.py +113 -0
  35. runops/cli/update_harness.py +245 -0
  36. runops/cli/update_refs.py +370 -0
  37. runops/core/__init__.py +3 -0
  38. runops/core/actions.py +1186 -0
  39. runops/core/analysis.py +1090 -0
  40. runops/core/campaign.py +156 -0
  41. runops/core/case.py +307 -0
  42. runops/core/context.py +426 -0
  43. runops/core/discovery.py +192 -0
  44. runops/core/environment.py +266 -0
  45. runops/core/exceptions.py +93 -0
  46. runops/core/knowledge.py +595 -0
  47. runops/core/knowledge_source.py +1204 -0
  48. runops/core/manifest.py +219 -0
  49. runops/core/project.py +171 -0
  50. runops/core/provenance.py +147 -0
  51. runops/core/retry.py +193 -0
  52. runops/core/run.py +170 -0
  53. runops/core/run_creation.py +456 -0
  54. runops/core/site.py +337 -0
  55. runops/core/state.py +197 -0
  56. runops/core/survey.py +380 -0
  57. runops/core/validation.py +40 -0
  58. runops/harness/__init__.py +27 -0
  59. runops/harness/builder.py +327 -0
  60. runops/harness/claude.py +189 -0
  61. runops/jobgen/__init__.py +3 -0
  62. runops/jobgen/generator.py +295 -0
  63. runops/launchers/__init__.py +17 -0
  64. runops/launchers/base.py +313 -0
  65. runops/launchers/mpiexec.py +131 -0
  66. runops/launchers/mpirun.py +132 -0
  67. runops/launchers/srun.py +126 -0
  68. runops/sites/__init__.py +0 -0
  69. runops/sites/camphor.md +98 -0
  70. runops/sites/camphor.toml +27 -0
  71. runops/slurm/__init__.py +3 -0
  72. runops/slurm/query.py +384 -0
  73. runops/slurm/submit.py +203 -0
  74. runops/templates/__init__.py +29 -0
  75. runops/templates/adapters/beach/agent_guide.md +50 -0
  76. runops/templates/adapters/beach/beach.toml +19 -0
  77. runops/templates/adapters/beach/case.toml +16 -0
  78. runops/templates/adapters/beach/summarize.py +272 -0
  79. runops/templates/adapters/emses/agent_guide.md +39 -0
  80. runops/templates/adapters/emses/case.toml +18 -0
  81. runops/templates/adapters/emses/plasma.toml +118 -0
  82. runops/templates/adapters/emses/summarize.py +413 -0
  83. runops/templates/adapters/generic/case.toml.j2 +13 -0
  84. runops/templates/adapters/generic/summarize.py +21 -0
  85. runops/templates/agent.md +156 -0
  86. runops/templates/rules/cookbook.md +22 -0
  87. runops/templates/scaffold/campaign.toml.j2 +10 -0
  88. runops/templates/scaffold/cases_claude.md +22 -0
  89. runops/templates/scaffold/facts.toml +2 -0
  90. runops/templates/scaffold/gitignore.txt +30 -0
  91. runops/templates/scaffold/notes/README.md +69 -0
  92. runops/templates/scaffold/rules/plan-before-act.md +17 -0
  93. runops/templates/scaffold/rules/runops-workflow.md +84 -0
  94. runops/templates/scaffold/rules/upstream-feedback.md +85 -0
  95. runops/templates/scaffold/runs_claude.md +24 -0
  96. runops/templates/scaffold/vscode_settings.json +9 -0
  97. runops/templates/skills/analyze/SKILL.md +40 -0
  98. runops/templates/skills/check-status/SKILL.md +29 -0
  99. runops/templates/skills/cleanup/SKILL.md +43 -0
  100. runops/templates/skills/create-run/SKILL.md +135 -0
  101. runops/templates/skills/debug-failed/SKILL.md +38 -0
  102. runops/templates/skills/learn/SKILL.md +54 -0
  103. runops/templates/skills/new-case/SKILL.md +108 -0
  104. runops/templates/skills/note/SKILL.md +107 -0
  105. runops/templates/skills/run-all/SKILL.md +47 -0
  106. runops/templates/skills/runops-reference/SKILL.md +203 -0
  107. runops/templates/skills/setup-campaign/SKILL.md +111 -0
  108. runops/templates/skills/setup-env/SKILL.md +32 -0
  109. runops/templates/skills/survey-design/SKILL.md +73 -0
  110. runops/templates/survey.toml.j2 +22 -0
  111. runops-0.2.0.dist-info/METADATA +491 -0
  112. runops-0.2.0.dist-info/RECORD +115 -0
  113. runops-0.2.0.dist-info/WHEEL +4 -0
  114. runops-0.2.0.dist-info/entry_points.txt +2 -0
  115. runops-0.2.0.dist-info/licenses/LICENSE +201 -0
runops/cli/create.py ADDED
@@ -0,0 +1,263 @@
1
+ """CLI commands for run and survey creation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated, Any, Optional
7
+
8
+ import typer
9
+
10
+ from runops.core.actions import ActionStatus, execute_action
11
+ from runops.core.case import JobData, load_case, resolve_case
12
+ from runops.core.exceptions import SimctlError
13
+ from runops.core.project import find_project_root, load_project
14
+ from runops.core.survey import expand_survey, generate_display_name, load_survey
15
+
16
+
17
+ def _echo_warnings(warnings: list[str], *, context: str = "") -> None:
18
+ """Print non-fatal validation warnings emitted during run creation."""
19
+ prefix = f"{context}: " if context else ""
20
+ for warning in warnings:
21
+ typer.echo(f" Warning: {prefix}{warning}", err=True)
22
+
23
+
24
def create(
    case_name: Annotated[
        str,
        typer.Argument(
            help="Case name to create a run from.",
        ),
    ],
    dest: Annotated[
        Optional[Path],
        typer.Option("--dest", "-d", help="Destination directory (defaults to cwd)."),
    ] = None,
) -> None:
    """Create a run in the current directory.

    Examples:
        cd runs/experiment && runops runs create flat_surface
    """
    # NOTE: the docstring above doubles as the CLI help text, so it is
    # left untouched; implementation notes live in comments instead.
    target_dir = (dest if dest else Path.cwd()).resolve()

    # The old "create survey" spelling is only rejected when the target
    # really holds a survey.toml; otherwise "survey" is an ordinary case name.
    legacy_survey_call = (
        case_name == "survey" and (target_dir / "survey.toml").is_file()
    )
    if not legacy_survey_call:
        _create_single(case_name, target_dir)
        return

    typer.echo(
        "Error: 'runops runs create survey' has been removed. "
        "Use 'runops runs sweep [DIR]' instead.",
        err=True,
    )
    raise typer.Exit(code=1)
52
+
53
+
54
def _create_single(case_name: str, target_dir: Path) -> None:
    """Run the ``create_run`` action for one case and report the result.

    The project root is located starting from ``target_dir``. A
    ``SimctlError`` or a non-success action status is printed to stderr
    and ends the command with exit code 1. On success the warnings, run
    id and run path taken from the action's data are printed.
    """
    try:
        outcome = execute_action(
            "create_run",
            project_root=find_project_root(target_dir),
            case_name=case_name,
            dest_dir=target_dir,
        )
    except SimctlError as err:
        typer.echo(f"Error creating run: {err}", err=True)
        raise typer.Exit(code=1) from err

    if outcome.status is not ActionStatus.SUCCESS:
        typer.echo(f"Error creating run: {outcome.message}", err=True)
        raise typer.Exit(code=1)

    payload = outcome.data
    _echo_warnings(list(payload.get("warnings", [])))
    run_id = payload.get("run_id", "???")
    typer.echo(f"Created run: {run_id}")
    run_path = payload.get("run_dir", target_dir)
    typer.echo(f" Path: {run_path}")
75
+
76
+
77
def _create_survey(survey_dir: Path) -> None:
    """Run the ``create_survey`` action for ``survey_dir`` and list the runs.

    Failures (a ``SimctlError`` or a non-success status) are printed to
    stderr and exit with code 1. An empty result exits with code 0 after
    a short notice. Otherwise per-run warnings are printed first, then a
    summary line, then one line per created run.
    """
    try:
        outcome = execute_action(
            "create_survey",
            project_root=find_project_root(survey_dir),
            survey_dir=survey_dir,
        )
    except SimctlError as err:
        typer.echo(f"Error: {err}", err=True)
        raise typer.Exit(code=1) from err

    if outcome.status is not ActionStatus.SUCCESS:
        typer.echo(f"Error: {outcome.message}", err=True)
        raise typer.Exit(code=1)

    runs = list(outcome.data.get("runs", []))
    if len(runs) == 0:
        typer.echo("No parameter combinations to expand.")
        raise typer.Exit(code=0)

    # First pass: all warnings, so they are not interleaved with the listing.
    for entry in runs:
        entry_warnings = list(entry.get("warnings", []))
        entry_label = str(entry.get("display_name", ""))
        _echo_warnings(entry_warnings, context=entry_label)

    # Second pass: the summary and one line per run.
    typer.echo(f"Created {len(runs)} runs in {survey_dir}")
    for entry in runs:
        label = str(entry.get("display_name", ""))
        if label:
            suffix = f" ({label})"
        else:
            suffix = ""
        typer.echo(f" {entry.get('run_id', '???')}{suffix}")
110
+
111
+
112
def sweep(
    survey_dir: Annotated[
        Optional[Path],
        typer.Argument(help="Directory containing survey.toml (defaults to cwd)."),
    ] = None,
    dry_run: Annotated[
        bool,
        typer.Option(
            "--dry-run",
            "-n",
            help=(
                "Print the runs that would be generated (count, parameter "
                "combinations, estimated total resource cost) without "
                "writing any files."
            ),
        ),
    ] = False,
) -> None:
    """Generate all runs from a survey.toml parameter sweep.

    With ``--dry-run`` the survey is parsed and expanded but no run
    directories are created — useful to verify the parameter combinations
    and total resource cost before committing files / queue time.
    """
    # The docstring is the CLI help text and is intentionally unchanged.
    base = survey_dir if survey_dir else Path.cwd()
    target = base.resolve()

    # Pick the worker once, then call it with the resolved directory.
    handler = _sweep_dry_run if dry_run else _create_survey
    handler(target)
141
+
142
+
143
def _sweep_dry_run(survey_dir: Path) -> None:
    """Describe the runs a sweep would create, without creating anything.

    Loads the survey, locates the project, resolves and loads the base
    case, expands the parameter combinations, and prints a header (base
    case, simulator, launcher, optional naming template, job line,
    resource estimate) followed by one line per combination. Any
    ``SimctlError`` along the way is printed to stderr and exits with
    code 1; an empty expansion exits with code 0.
    """
    # Both lookups report failures identically, so they share one handler.
    try:
        survey = load_survey(survey_dir)
        root = find_project_root(survey_dir)
    except SimctlError as err:
        typer.echo(f"Error: {err}", err=True)
        raise typer.Exit(code=1) from err

    # The base case is needed to show the resolved job (the survey's own
    # [job] block, when present, overrides the case's).
    try:
        project = load_project(root)
        base_case_dir = resolve_case(survey.base_case, project.root_dir)
        base_case = load_case(base_case_dir)
    except SimctlError as err:
        typer.echo(f"Error resolving base case: {err}", err=True)
        raise typer.Exit(code=1) from err

    # A survey job counts as "present" when it names a partition.
    if survey.job.partition:
        job = survey.job
    else:
        job = base_case.job

    combos = expand_survey(survey.axes, survey.linked)
    if not combos:
        typer.echo("No parameter combinations to expand.")
        raise typer.Exit(code=0)

    total = len(combos)
    say = typer.echo
    say(f"[dry-run] {total} runs would be created in {survey_dir}")
    say(f" base case : {survey.base_case}")
    say(f" simulator : {survey.simulator}")
    say(f" launcher : {survey.launcher}")
    if survey.naming_template:
        say(f" display : {survey.naming_template}")
    say(_format_job_summary(job))
    say(_format_resource_estimate(job, total))

    # One line per combination so the parameters can be scanned quickly.
    say("")
    say("Planned runs:")
    for combo in combos:
        label = generate_display_name(survey.naming_template, combo)
        rendered = _format_combo(combo)
        if not label:
            say(f" {rendered}")
            continue
        say(f" {label:<24} {rendered}")
194
+
195
+
196
+ def _format_combo(combo: dict[str, Any]) -> str:
197
+ """Format a combo dict as ``key1=value1, key2=value2``."""
198
+ parts = []
199
+ for key in sorted(combo.keys()):
200
+ value = combo[key]
201
+ # Truncate long lists for readability.
202
+ if isinstance(value, list) and len(value) > 4:
203
+ shown = f"[{value[0]}, ..., {value[-1]} ({len(value)} items)]"
204
+ else:
205
+ shown = repr(value)
206
+ parts.append(f"{key}={shown}")
207
+ return ", ".join(parts)
208
+
209
+
210
+ def _format_job_summary(job: JobData) -> str:
211
+ """Format a JobData line for the dry-run output."""
212
+ if job.processes > 1 or job.threads > 1 or job.cores > 1:
213
+ # rsc-style site
214
+ return (
215
+ f" job : partition={job.partition or '(default)'} "
216
+ f"p={job.processes} t={job.threads} c={job.cores} "
217
+ f"walltime={job.walltime}"
218
+ )
219
+ return (
220
+ f" job : partition={job.partition or '(default)'} "
221
+ f"nodes={job.nodes} ntasks={job.ntasks} walltime={job.walltime}"
222
+ )
223
+
224
+
225
def _format_resource_estimate(job: JobData, n_runs: int) -> str:
    """Build the ``estimate`` line: total core-hours for the whole sweep.

    The per-run core count is the larger of ``job.processes`` and
    ``job.ntasks`` so either job layout yields a figure. When that count
    is at most 1, or the walltime does not parse to a positive number of
    hours, a "cannot estimate" line is returned instead.
    """
    per_run = max(job.processes, job.ntasks)
    hours = _walltime_to_hours(job.walltime)

    can_estimate = per_run > 1 and hours > 0
    if can_estimate:
        core_hours = per_run * hours * n_runs
        return (
            f" estimate : {n_runs} runs x {per_run} cores x "
            f"{hours:.1f} h walltime ~= {core_hours:,.0f} core-hours"
        )
    return " estimate : (cannot estimate — incomplete job spec)"
238
+
239
+
240
+ def _walltime_to_hours(walltime: str) -> float:
241
+ """Parse a walltime string to hours. Returns 0.0 on failure."""
242
+ if not walltime:
243
+ return 0.0
244
+ # Strip optional ``D-`` day prefix.
245
+ days = 0
246
+ rest = walltime
247
+ if "-" in walltime:
248
+ head, _, rest = walltime.partition("-")
249
+ try:
250
+ days = int(head)
251
+ except ValueError:
252
+ return 0.0
253
+ parts = rest.split(":")
254
+ try:
255
+ if len(parts) == 3:
256
+ h, m, s = (int(p) for p in parts)
257
+ elif len(parts) == 2:
258
+ h, m, s = 0, int(parts[0]), int(parts[1])
259
+ else:
260
+ return 0.0
261
+ except ValueError:
262
+ return 0.0
263
+ return days * 24 + h + m / 60 + s / 3600
runops/cli/dashboard.py ADDED
@@ -0,0 +1,179 @@
1
+ """CLI command for the multi-run dashboard view."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Annotated, Optional
8
+
9
+ import typer
10
+
11
+ from runops.cli.run_lookup import resolve_run_targets
12
+ from runops.core.exceptions import SimctlError
13
+ from runops.core.manifest import read_manifest
14
+
15
+
16
def dashboard(
    targets: Annotated[
        Optional[list[str]],
        typer.Argument(
            help=(
                "Run identifiers, run directories, or directories containing "
                "runs (recursive). Defaults to cwd / project runs/."
            ),
        ),
    ] = None,
    watch: Annotated[
        Optional[float],
        typer.Option(
            "--watch",
            "-w",
            help=(
                "Refresh the dashboard every N seconds (clears screen between "
                "refreshes). Press Ctrl-C to stop."
            ),
        ),
    ] = None,
    all_states: Annotated[
        bool,
        typer.Option(
            "--all",
            "-a",
            help="Show all runs, not just the ones in submitted/running state.",
        ),
    ] = False,
) -> None:
    """Multi-run progress dashboard.

    Aggregates per-run progress (state, step, %, last diagnostic) into a
    single table. Useful while a survey of dozens of runs is in flight:
    instead of opening ``runops runs log`` for each run individually,
    one ``runops runs dashboard`` call shows the whole survey.

    Examples:
        runops runs dashboard runs/series_A # one survey
        runops runs dashboard -w 30 runs/series_A # auto-refresh every 30 s
        runops runs dashboard --all runs/ # whole project, including
        # completed/failed runs
    """
    # The docstring is the CLI help text and is intentionally unchanged.
    run_dirs = resolve_run_targets(targets, search_dir=Path.cwd().resolve())

    # Without a positive interval the table is rendered exactly once.
    if watch is None or not watch > 0:
        _print_dashboard(run_dirs, all_states=all_states)
        return

    _watch_loop(run_dirs, all_states=all_states, interval=watch)
66
+
67
+
68
def _print_dashboard(run_dirs: list[Path], *, all_states: bool) -> None:
    """Print one dashboard table for ``run_dirs``.

    Runs whose manifest cannot be read (``SimctlError``) are skipped.
    Unless ``all_states`` is set, only runs whose status is
    ``submitted`` or ``running`` are listed. Rows are ordered by run id
    and followed by an "active / total" footer.
    """
    if not run_dirs:
        typer.echo("No runs found.")
        return

    active_states = {"submitted", "running"}
    table: list[tuple[str, str, str, str, str, str]] = []

    for run_dir in run_dirs:
        try:
            manifest = read_manifest(run_dir)
        except SimctlError:
            # Unreadable manifest: leave the run out of the table.
            continue

        run_meta = manifest.run
        state = str(run_meta.get("status", "unknown"))
        if not (all_states or state in active_states):
            continue

        identifier = str(run_meta.get("id", run_dir.name))
        label = str(run_meta.get("display_name", ""))
        step_cell, pct_cell = _progress_for_run(run_dir, manifest.simulator)

        # Latest Slurm state recorded in the manifest, "-" when absent/empty.
        slurm_cell = str(run_meta.get("last_slurm_state", ""))
        if not slurm_cell:
            slurm_cell = "-"

        table.append((identifier, label, state, step_cell, pct_cell, slurm_cell))

    if not table:
        typer.echo("No active runs found.")
        return

    # Sort by run id for stable output.
    table = sorted(table, key=lambda entry: entry[0])

    headers = ("RUN_ID", "NAME", "STATE", "STEP", "%", "SLURM")
    # Column width = widest cell in that column, header included.
    widths = [
        max(len(cell) for cell in column) for column in zip(headers, *table)
    ]
    template = " ".join("{:<" + str(width) + "}" for width in widths)

    typer.echo(template.format(*headers))
    typer.echo(template.format(*["-" * width for width in widths]))
    for entry in table:
        typer.echo(template.format(*entry))

    active_count = len([entry for entry in table if entry[2] in active_states])
    typer.echo(f"\n{active_count} active, {len(table)} total")
118
+
119
+
120
+ def _progress_for_run(
121
+ run_dir: Path,
122
+ simulator: dict[str, str],
123
+ ) -> tuple[str, str]:
124
+ """Best-effort progress lookup for a single run.
125
+
126
+ Returns ``(step_str, pct_str)`` where each part is a short string
127
+ suitable for table cells. Returns ``("-", "-")`` when the adapter
128
+ has no progress to report.
129
+ """
130
+ adapter_name = simulator.get("adapter", "") or simulator.get("name", "")
131
+ if not adapter_name:
132
+ return "-", "-"
133
+
134
+ try:
135
+ import runops.adapters # noqa: F401 (registers adapters)
136
+ from runops.adapters.registry import get as get_adapter
137
+
138
+ adapter_cls = get_adapter(adapter_name)
139
+ adapter = adapter_cls()
140
+ summary = adapter.summarize(run_dir)
141
+ except Exception:
142
+ return "-", "-"
143
+
144
+ last_step = summary.get("last_step")
145
+ nstep = summary.get("nstep")
146
+ if last_step is None or not nstep:
147
+ return "-", "-"
148
+
149
+ pct = float(last_step) / float(nstep) * 100
150
+ return f"{int(last_step):d}/{int(nstep):d}", f"{pct:5.1f}%"
151
+
152
+
153
def _watch_loop(
    run_dirs: list[Path],
    *,
    all_states: bool,
    interval: float,
) -> None:
    """Redraw the dashboard every ``interval`` seconds until Ctrl-C.

    Each cycle clears the screen, prints a title line with the current
    local time, renders the table for the given ``run_dirs`` (the
    manifests are read again on every cycle, so status changes show up)
    and then sleeps. A ``KeyboardInterrupt`` prints "Stopped." and exits
    with code 0.
    """
    from datetime import datetime

    # ANSI: erase the whole screen, then move the cursor to the top-left.
    clear_screen = "\x1b[2J\x1b[H"

    try:
        while True:
            typer.echo(clear_screen, nl=False)
            now = datetime.now()
            stamp = now.strftime("%Y-%m-%d %H:%M:%S")
            title = f"runops runs dashboard (watch every {interval:g}s) — {stamp}"
            typer.echo(title)
            typer.echo("")
            _print_dashboard(run_dirs, all_states=all_states)
            time.sleep(interval)
    except KeyboardInterrupt:
        typer.echo("\nStopped.")
        raise typer.Exit(code=0) from None
runops/cli/extend.py ADDED
@@ -0,0 +1,204 @@
1
+ """CLI command for extending/continuing a simulation from a snapshot."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ from pathlib import Path
7
+ from typing import Annotated, Any, Optional
8
+
9
+ import typer
10
+
11
+ from runops.cli.run_lookup import resolve_run_or_cwd
12
+ from runops.core.discovery import collect_existing_run_ids
13
+ from runops.core.exceptions import SimctlError
14
+ from runops.core.manifest import ManifestData, read_manifest, write_manifest
15
+ from runops.core.project import find_project_root, load_project
16
+ from runops.core.run import create_run
17
+
18
+
19
def extend(
    run: Annotated[
        Optional[str],
        typer.Argument(help="Source run to continue from (defaults to cwd)."),
    ] = None,
    dest: Annotated[
        Optional[Path],
        typer.Option(
            "--dest", "-d", help="Destination directory (defaults to source's parent)."
        ),
    ] = None,
    nstep: Annotated[
        Optional[int],
        typer.Option("--nstep", help="Override total step count for continuation."),
    ] = None,
    submit: Annotated[
        bool,
        typer.Option("--run", help="Automatically submit the continuation run."),
    ] = False,
) -> None:
    """Create a continuation run from a completed simulation's snapshot.

    Copies input files, links snapshots, and updates restart parameters.
    The adapter handles simulator-specific continuation setup.

    Examples:
        runops runs extend # continue cwd run
        runops runs extend R0001 --nstep 200000 # continue with more steps
        runops runs extend --run # continue and submit
    """
    # The docstring above is the CLI help text and is kept as-is.
    #
    # Steps: read the source manifest, pick the destination, load the
    # project and adapter, allocate a new run directory, copy inputs,
    # let the adapter prepare the continuation, copy the job script,
    # write the new manifest, and optionally submit.
    source_dir = resolve_run_or_cwd(run, search_dir=Path.cwd())

    # Read source manifest
    try:
        source_manifest = read_manifest(source_dir)
    except SimctlError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(code=1) from None

    source_id = source_manifest.run.get("id", source_dir.name)
    source_status = source_manifest.run.get("status", "")

    # Any other status is allowed but flagged, since a snapshot may not exist.
    if source_status not in ("completed", "running", "failed"):
        typer.echo(
            f"Warning: source run {source_id} is '{source_status}'. "
            "Continuation is typically from completed runs."
        )

    # Determine destination
    target_dir = dest or source_dir.parent
    target_dir = target_dir.resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    # Load project for adapter/launcher
    try:
        project_root = find_project_root(target_dir)
        project = load_project(project_root)
    except SimctlError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(code=1) from None

    # Get adapter (manifest key "adapter", falling back to "name")
    adapter_name = source_manifest.simulator.get("adapter", "")
    if not adapter_name:
        adapter_name = source_manifest.simulator.get("name", "")

    try:
        from runops.adapters.registry import get as get_adapter
        from runops.adapters.registry import load_from_config

        load_from_config(project.simulators)
        adapter_cls = get_adapter(adapter_name)
        adapter = adapter_cls()
    except Exception as e:
        # ``Exception`` already covers ``KeyError``; any failure while
        # importing, registering or instantiating the adapter ends here.
        typer.echo(f"Error loading adapter '{adapter_name}': {e}", err=True)
        raise typer.Exit(code=1) from None

    # Collect existing run IDs
    runs_dir = project_root / "runs"
    existing_ids = collect_existing_run_ids(runs_dir)

    # Create new run directory
    params = dict(source_manifest.params_snapshot)
    try:
        run_info = create_run(
            target_dir,
            existing_ids,
            display_name=f"extend_{source_id}",
            params=params,
        )
    except SimctlError as e:
        typer.echo(f"Error creating run: {e}", err=True)
        raise typer.Exit(code=1) from None

    new_dir = run_info.run_dir
    new_input = new_dir / "input"
    new_input.mkdir(parents=True, exist_ok=True)

    # Copy input files from source
    _copy_input_tree(source_dir / "input", new_input)

    # Let adapter set up continuation (snapshot links, parameter updates)
    continuation_info: dict[str, Any] = {}
    if hasattr(adapter, "setup_continuation"):
        try:
            continuation_info = adapter.setup_continuation(
                source_dir=source_dir,
                new_dir=new_dir,
                nstep_override=nstep,
            )
        except Exception as e:
            typer.echo(f"Error in adapter continuation setup: {e}", err=True)
            raise typer.Exit(code=1) from None

    # Copy job script from source if exists
    source_submit = source_dir / "submit"
    new_submit = new_dir / "submit"
    new_submit.mkdir(parents=True, exist_ok=True)
    source_job = source_submit / "job.sh"
    if source_job.is_file():
        shutil.copy2(source_job, new_submit / "job.sh")

    # Create work directory
    (new_dir / "work").mkdir(exist_ok=True)

    # Write manifest
    new_manifest = _build_continuation_manifest(
        source_manifest, run_info, new_dir, source_id, params
    )
    write_manifest(new_dir, new_manifest)

    typer.echo(f"Created continuation run: {run_info.run_id}")
    typer.echo(f" Source: {source_id}")
    typer.echo(f" Path: {new_dir}")
    if continuation_info:
        for key, val in continuation_info.items():
            typer.echo(f" {key}: {val}")

    # Auto-submit if requested
    if submit:
        from runops.cli.submit import _submit_single_run

        job_id = _submit_single_run(new_dir)
        if job_id is None:
            typer.echo("Warning: auto-submit failed")
            raise typer.Exit(code=1)


def _copy_input_tree(source_input: Path, new_input: Path) -> None:
    """Copy every file and sub-directory of ``source_input`` into ``new_input``.

    Does nothing when ``source_input`` is not a directory. Entries that
    are neither regular files nor directories are skipped.
    """
    if not source_input.is_dir():
        return
    for item in source_input.iterdir():
        dest_item = new_input / item.name
        if item.is_file():
            shutil.copy2(item, dest_item)
        elif item.is_dir():
            shutil.copytree(item, dest_item, dirs_exist_ok=True)


def _build_continuation_manifest(
    source_manifest: ManifestData,
    run_info: Any,
    new_dir: Path,
    source_id: Any,
    params: dict[str, Any],
) -> ManifestData:
    """Assemble the manifest of a continuation run.

    Classification, simulator, launcher and simulator-source sections
    are copied from the source manifest; the job section keeps the
    source's partition/nodes/ntasks/walltime with a blank job id; the
    origin records ``source_id`` as ``parent_run``.
    """
    source_job = source_manifest.job
    return ManifestData(
        run={
            "id": run_info.run_id,
            "display_name": run_info.display_name,
            "status": "created",
            "created_at": run_info.created_at,
        },
        path={"run_dir": str(new_dir)},
        origin={
            "case": source_manifest.origin.get("case", ""),
            "survey": "",
            "parent_run": source_id,
        },
        classification=dict(source_manifest.classification),
        simulator=dict(source_manifest.simulator),
        launcher=dict(source_manifest.launcher),
        simulator_source=dict(source_manifest.simulator_source),
        job={
            "scheduler": "slurm",
            "job_id": "",
            "partition": source_job.get("partition", ""),
            "nodes": source_job.get("nodes", 1),
            "ntasks": source_job.get("ntasks", 1),
            "walltime": source_job.get("walltime", "01:00:00"),
            "submitted_at": "",
        },
        variation={"changed_keys": []},
        params_snapshot=params,
        files={
            "input_dir": "input",
            "submit_dir": "submit",
            "work_dir": "work",
            "analysis_dir": "analysis",
            "status_dir": "status",
        },
    )