@zigrivers/scaffold 3.13.0 → 3.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/README.md +32 -10
  2. package/content/knowledge/research/research-architecture.md +385 -0
  3. package/content/knowledge/research/research-conventions.md +248 -0
  4. package/content/knowledge/research/research-dev-environment.md +303 -0
  5. package/content/knowledge/research/research-experiment-loop.md +429 -0
  6. package/content/knowledge/research/research-experiment-tracking.md +336 -0
  7. package/content/knowledge/research/research-ml-architecture-search.md +383 -0
  8. package/content/knowledge/research/research-ml-evaluation.md +407 -0
  9. package/content/knowledge/research/research-ml-experiment-tracking.md +466 -0
  10. package/content/knowledge/research/research-ml-training-patterns.md +413 -0
  11. package/content/knowledge/research/research-observability.md +395 -0
  12. package/content/knowledge/research/research-overfitting-prevention.md +306 -0
  13. package/content/knowledge/research/research-project-structure.md +264 -0
  14. package/content/knowledge/research/research-quant-backtesting.md +326 -0
  15. package/content/knowledge/research/research-quant-market-data.md +366 -0
  16. package/content/knowledge/research/research-quant-metrics.md +335 -0
  17. package/content/knowledge/research/research-quant-requirements.md +223 -0
  18. package/content/knowledge/research/research-quant-risk.md +469 -0
  19. package/content/knowledge/research/research-quant-strategy-patterns.md +412 -0
  20. package/content/knowledge/research/research-requirements.md +201 -0
  21. package/content/knowledge/research/research-security.md +374 -0
  22. package/content/knowledge/research/research-sim-compute-management.md +538 -0
  23. package/content/knowledge/research/research-sim-engine-patterns.md +448 -0
  24. package/content/knowledge/research/research-sim-parameter-spaces.md +425 -0
  25. package/content/knowledge/research/research-sim-validation.md +456 -0
  26. package/content/knowledge/research/research-testing.md +334 -0
  27. package/content/methodology/research-ml-research.yml +23 -0
  28. package/content/methodology/research-overlay.yml +65 -0
  29. package/content/methodology/research-quant-finance.yml +29 -0
  30. package/content/methodology/research-simulation.yml +23 -0
  31. package/dist/cli/commands/adopt.d.ts.map +1 -1
  32. package/dist/cli/commands/adopt.js +30 -8
  33. package/dist/cli/commands/adopt.js.map +1 -1
  34. package/dist/cli/commands/adopt.serialization.test.js +49 -0
  35. package/dist/cli/commands/adopt.serialization.test.js.map +1 -1
  36. package/dist/cli/commands/adopt.test.js +8 -0
  37. package/dist/cli/commands/adopt.test.js.map +1 -1
  38. package/dist/cli/commands/build.d.ts.map +1 -1
  39. package/dist/cli/commands/build.js +191 -180
  40. package/dist/cli/commands/build.js.map +1 -1
  41. package/dist/cli/commands/complete.d.ts.map +1 -1
  42. package/dist/cli/commands/complete.js +16 -12
  43. package/dist/cli/commands/complete.js.map +1 -1
  44. package/dist/cli/commands/complete.test.js +14 -5
  45. package/dist/cli/commands/complete.test.js.map +1 -1
  46. package/dist/cli/commands/init.d.ts +4 -0
  47. package/dist/cli/commands/init.d.ts.map +1 -1
  48. package/dist/cli/commands/init.js +75 -51
  49. package/dist/cli/commands/init.js.map +1 -1
  50. package/dist/cli/commands/init.test.js +33 -27
  51. package/dist/cli/commands/init.test.js.map +1 -1
  52. package/dist/cli/commands/reset.d.ts.map +1 -1
  53. package/dist/cli/commands/reset.js +44 -40
  54. package/dist/cli/commands/reset.js.map +1 -1
  55. package/dist/cli/commands/reset.test.js +42 -20
  56. package/dist/cli/commands/reset.test.js.map +1 -1
  57. package/dist/cli/commands/rework.d.ts.map +1 -1
  58. package/dist/cli/commands/rework.js +16 -12
  59. package/dist/cli/commands/rework.js.map +1 -1
  60. package/dist/cli/commands/rework.test.js +12 -3
  61. package/dist/cli/commands/rework.test.js.map +1 -1
  62. package/dist/cli/commands/run.d.ts.map +1 -1
  63. package/dist/cli/commands/run.js +318 -298
  64. package/dist/cli/commands/run.js.map +1 -1
  65. package/dist/cli/commands/run.test.js +92 -120
  66. package/dist/cli/commands/run.test.js.map +1 -1
  67. package/dist/cli/commands/skip.d.ts.map +1 -1
  68. package/dist/cli/commands/skip.js +19 -15
  69. package/dist/cli/commands/skip.js.map +1 -1
  70. package/dist/cli/commands/skip.test.js +22 -11
  71. package/dist/cli/commands/skip.test.js.map +1 -1
  72. package/dist/cli/commands/update.d.ts.map +1 -1
  73. package/dist/cli/commands/update.js +3 -1
  74. package/dist/cli/commands/update.js.map +1 -1
  75. package/dist/cli/commands/update.test.js +8 -4
  76. package/dist/cli/commands/update.test.js.map +1 -1
  77. package/dist/cli/commands/version.d.ts.map +1 -1
  78. package/dist/cli/commands/version.js +3 -1
  79. package/dist/cli/commands/version.js.map +1 -1
  80. package/dist/cli/commands/version.test.js +9 -5
  81. package/dist/cli/commands/version.test.js.map +1 -1
  82. package/dist/cli/index.d.ts.map +1 -1
  83. package/dist/cli/index.js +2 -0
  84. package/dist/cli/index.js.map +1 -1
  85. package/dist/cli/init-flag-families.d.ts +6 -1
  86. package/dist/cli/init-flag-families.d.ts.map +1 -1
  87. package/dist/cli/init-flag-families.js +32 -1
  88. package/dist/cli/init-flag-families.js.map +1 -1
  89. package/dist/cli/init-flag-families.test.js +47 -0
  90. package/dist/cli/init-flag-families.test.js.map +1 -1
  91. package/dist/cli/output/interactive.d.ts +1 -0
  92. package/dist/cli/output/interactive.d.ts.map +1 -1
  93. package/dist/cli/output/interactive.js +5 -0
  94. package/dist/cli/output/interactive.js.map +1 -1
  95. package/dist/cli/shutdown.d.ts +51 -0
  96. package/dist/cli/shutdown.d.ts.map +1 -0
  97. package/dist/cli/shutdown.js +199 -0
  98. package/dist/cli/shutdown.js.map +1 -0
  99. package/dist/cli/shutdown.test.d.ts +2 -0
  100. package/dist/cli/shutdown.test.d.ts.map +1 -0
  101. package/dist/cli/shutdown.test.js +316 -0
  102. package/dist/cli/shutdown.test.js.map +1 -0
  103. package/dist/config/schema.d.ts +272 -16
  104. package/dist/config/schema.d.ts.map +1 -1
  105. package/dist/config/schema.js +25 -1
  106. package/dist/config/schema.js.map +1 -1
  107. package/dist/config/schema.test.js +103 -3
  108. package/dist/config/schema.test.js.map +1 -1
  109. package/dist/core/assembly/overlay-loader.d.ts +12 -0
  110. package/dist/core/assembly/overlay-loader.d.ts.map +1 -1
  111. package/dist/core/assembly/overlay-loader.js +30 -0
  112. package/dist/core/assembly/overlay-loader.js.map +1 -1
  113. package/dist/core/assembly/overlay-loader.test.js +66 -1
  114. package/dist/core/assembly/overlay-loader.test.js.map +1 -1
  115. package/dist/core/assembly/overlay-state-resolver.d.ts.map +1 -1
  116. package/dist/core/assembly/overlay-state-resolver.js +48 -19
  117. package/dist/core/assembly/overlay-state-resolver.js.map +1 -1
  118. package/dist/core/assembly/overlay-state-resolver.test.js +80 -0
  119. package/dist/core/assembly/overlay-state-resolver.test.js.map +1 -1
  120. package/dist/e2e/init.test.js +5 -4
  121. package/dist/e2e/init.test.js.map +1 -1
  122. package/dist/e2e/project-type-overlays.test.js +119 -0
  123. package/dist/e2e/project-type-overlays.test.js.map +1 -1
  124. package/dist/project/adopt.d.ts.map +1 -1
  125. package/dist/project/adopt.js +3 -1
  126. package/dist/project/adopt.js.map +1 -1
  127. package/dist/project/detectors/disambiguate.js +1 -1
  128. package/dist/project/detectors/disambiguate.js.map +1 -1
  129. package/dist/project/detectors/index.d.ts.map +1 -1
  130. package/dist/project/detectors/index.js +2 -1
  131. package/dist/project/detectors/index.js.map +1 -1
  132. package/dist/project/detectors/ml.d.ts.map +1 -1
  133. package/dist/project/detectors/ml.js +2 -6
  134. package/dist/project/detectors/ml.js.map +1 -1
  135. package/dist/project/detectors/research.d.ts +4 -0
  136. package/dist/project/detectors/research.d.ts.map +1 -0
  137. package/dist/project/detectors/research.js +141 -0
  138. package/dist/project/detectors/research.js.map +1 -0
  139. package/dist/project/detectors/research.test.d.ts +2 -0
  140. package/dist/project/detectors/research.test.d.ts.map +1 -0
  141. package/dist/project/detectors/research.test.js +235 -0
  142. package/dist/project/detectors/research.test.js.map +1 -0
  143. package/dist/project/detectors/shared-signals.d.ts +3 -0
  144. package/dist/project/detectors/shared-signals.d.ts.map +1 -0
  145. package/dist/project/detectors/shared-signals.js +9 -0
  146. package/dist/project/detectors/shared-signals.js.map +1 -0
  147. package/dist/project/detectors/types.d.ts +6 -2
  148. package/dist/project/detectors/types.d.ts.map +1 -1
  149. package/dist/project/detectors/types.js.map +1 -1
  150. package/dist/state/lock-manager.d.ts +1 -0
  151. package/dist/state/lock-manager.d.ts.map +1 -1
  152. package/dist/state/lock-manager.js +1 -1
  153. package/dist/state/lock-manager.js.map +1 -1
  154. package/dist/types/config.d.ts +7 -1
  155. package/dist/types/config.d.ts.map +1 -1
  156. package/dist/wizard/copy/core.d.ts.map +1 -1
  157. package/dist/wizard/copy/core.js +4 -0
  158. package/dist/wizard/copy/core.js.map +1 -1
  159. package/dist/wizard/copy/index.d.ts.map +1 -1
  160. package/dist/wizard/copy/index.js +2 -0
  161. package/dist/wizard/copy/index.js.map +1 -1
  162. package/dist/wizard/copy/research.d.ts +3 -0
  163. package/dist/wizard/copy/research.d.ts.map +1 -0
  164. package/dist/wizard/copy/research.js +27 -0
  165. package/dist/wizard/copy/research.js.map +1 -0
  166. package/dist/wizard/copy/types.d.ts +5 -1
  167. package/dist/wizard/copy/types.d.ts.map +1 -1
  168. package/dist/wizard/flags.d.ts +7 -1
  169. package/dist/wizard/flags.d.ts.map +1 -1
  170. package/dist/wizard/questions.d.ts +4 -2
  171. package/dist/wizard/questions.d.ts.map +1 -1
  172. package/dist/wizard/questions.js +27 -1
  173. package/dist/wizard/questions.js.map +1 -1
  174. package/dist/wizard/questions.test.js +51 -0
  175. package/dist/wizard/questions.test.js.map +1 -1
  176. package/dist/wizard/wizard.d.ts +3 -2
  177. package/dist/wizard/wizard.d.ts.map +1 -1
  178. package/dist/wizard/wizard.js +3 -1
  179. package/dist/wizard/wizard.js.map +1 -1
  180. package/package.json +1 -1
@@ -0,0 +1,538 @@
1
+ ---
2
+ name: research-sim-compute-management
3
+ description: Compute resource management for simulations including wall-clock budgets, job scheduling with SLURM and PBS, parallelization strategies, checkpoint/restart for long simulations, resource monitoring, and cost estimation
4
+ topics: [research, simulation, compute-management, slurm, pbs, parallelization, checkpoint-restart, resource-monitoring, cost-estimation, hpc, wall-clock-budget]
5
+ ---
6
+
7
+ Simulation-based research consumes significant compute resources -- a single CFD run can take hours, and parameter sweeps multiply that by hundreds or thousands of evaluations. Effective compute management determines whether a research budget yields 50 useful results or 500. The key challenges are allocating wall-clock budgets across the experiment campaign, scheduling jobs efficiently on shared HPC resources, choosing the right parallelization level (across parameters vs. within simulations), implementing checkpoint/restart for runs that exceed time limits, and monitoring resource usage to prevent waste.
8
+
9
+ ## Summary
10
+
11
+ Set explicit wall-clock budgets at the campaign level and enforce them through job-level time limits. Use SLURM or PBS job arrays for parameter sweeps, with dependency chains for multi-stage workflows. Choose between parameter-level parallelism (many independent simulations) and simulation-level parallelism (MPI domain decomposition within one run) based on problem characteristics. Implement checkpoint/restart so long simulations survive scheduler preemption and time limits. Monitor resource utilization (CPU, memory, I/O) to right-size allocations and detect inefficient runs early. Estimate costs before launching campaigns to avoid budget overruns.
12
+
13
+ ## Deep Guidance
14
+
15
+ ### Wall-Clock Budget Management
16
+
17
+ Define budgets at multiple levels and enforce them programmatically:
18
+
19
+ ```python
20
+ # src/simulation/compute/budget.py
21
+ from dataclasses import dataclass, field
22
+ from datetime import datetime, timedelta
23
+ from typing import Any
24
+
25
@dataclass
class ComputeBudget:
    """Hierarchical compute budget for a research campaign.

    ``reserve_fraction`` of ``total_core_hours`` is held back, so the
    spendable budget is ``total_core_hours * (1 - reserve_fraction)``.
    The tracking fields are updated by ``record_completion`` and by whoever
    increments ``jobs_submitted`` when a job is handed to the scheduler.
    """

    total_core_hours: float
    max_wall_time_per_job: timedelta = timedelta(hours=24)
    max_concurrent_jobs: int = 50
    reserve_fraction: float = 0.1  # Hold back 10% for follow-up runs

    # Tracking
    used_core_hours: float = 0.0
    jobs_submitted: int = 0
    jobs_completed: int = 0
    jobs_failed: int = 0

    @property
    def remaining_core_hours(self) -> float:
        """Spendable core-hours left; negative once the budget is overspent."""
        spendable = self.total_core_hours * (1 - self.reserve_fraction)
        return spendable - self.used_core_hours

    @property
    def utilization(self) -> float:
        """Fraction of the total budget (reserve included) consumed so far."""
        if self.total_core_hours > 0:
            return self.used_core_hours / self.total_core_hours
        return 0.0

    def can_submit(self, estimated_core_hours: float) -> bool:
        """Check if budget allows submitting a new job.

        A job is rejected when its estimate exceeds the remaining spendable
        core-hours, or when the number of in-flight jobs (submitted minus
        completed) has reached ``max_concurrent_jobs``.
        """
        if estimated_core_hours > self.remaining_core_hours:
            return False
        in_flight = self.jobs_submitted - self.jobs_completed
        return in_flight < self.max_concurrent_jobs

    def record_completion(self, actual_core_hours: float, success: bool) -> None:
        """Record a completed job and charge its core-hours to the budget."""
        self.used_core_hours += actual_core_hours
        self.jobs_completed += 1
        if not success:
            self.jobs_failed += 1

    def estimate_remaining_runs(self, avg_core_hours_per_run: float) -> int:
        """Estimate how many more runs the budget supports.

        Returns 0 for a non-positive average and never returns a negative
        count, even when the budget is already overspent.
        """
        if avg_core_hours_per_run <= 0:
            return 0
        # Clamp: remaining_core_hours goes negative after an overrun.
        return max(0, int(self.remaining_core_hours / avg_core_hours_per_run))
67
+
68
+
69
class BudgetEnforcer:
    """Enforces budget constraints on job submission.

    Granted allocations are appended to an internal history. Observed
    runtimes recorded through ``record_runtime`` are what
    ``adaptive_time_limit`` learns from.
    """

    def __init__(self, budget: "ComputeBudget"):
        self.budget = budget
        self._history: list[dict[str, Any]] = []

    def request_allocation(self, job_spec: dict[str, Any]) -> bool:
        """Request permission to submit a job. Returns True if allowed.

        ``job_spec`` must provide ``"num_cores"`` and ``"wall_hours"``; their
        product is the core-hour estimate checked against the budget. On
        success the request is logged and ``budget.jobs_submitted`` is bumped.
        """
        est_hours = job_spec["num_cores"] * job_spec["wall_hours"]
        if not self.budget.can_submit(est_hours):
            return False
        self._history.append({
            "timestamp": datetime.now().isoformat(),
            "job_spec": job_spec,
            "estimated_hours": est_hours,
        })
        self.budget.jobs_submitted += 1
        return True

    def record_runtime(self, actual_hours: float) -> None:
        """Record the observed wall-clock hours of a finished job.

        The value is attached to the oldest history entry that has no
        runtime yet; if every entry already has one, a standalone entry is
        appended. Only the distribution of runtimes matters to
        ``adaptive_time_limit``, so the pairing with a particular request is
        approximate.
        """
        for entry in self._history:
            if "actual_hours" not in entry:
                entry["actual_hours"] = actual_hours
                return
        self._history.append({
            "timestamp": datetime.now().isoformat(),
            "actual_hours": actual_hours,
        })

    def adaptive_time_limit(self, base_hours: float, iteration: int) -> float:
        """Adjust time limits based on observed runtimes.

        Returns ``base_hours`` until at least five runtimes have been
        recorded; afterwards returns the smaller of ``base_hours`` and the
        90th-percentile runtime plus a 20% margin. ``iteration`` is accepted
        for interface compatibility and is not used.
        """
        completed_times = [
            h["actual_hours"] for h in self._history
            if "actual_hours" in h
        ]
        if len(completed_times) < 5:
            return base_hours  # Not enough data yet

        # Use 90th percentile of observed times + 20% margin
        import numpy as np
        p90 = np.percentile(completed_times, 90)
        # float(): hand back a plain Python float rather than a numpy scalar.
        return float(min(base_hours, p90 * 1.2))
102
+ ```
103
+
104
+ ### SLURM Job Scheduling
105
+
106
+ Submit and manage jobs on SLURM-based HPC clusters:
107
+
108
+ ```python
109
+ # src/simulation/compute/slurm.py
110
+ import subprocess
111
+ from dataclasses import dataclass
112
+ from pathlib import Path
113
+ from typing import Any
114
+
115
+ from src.simulation.batch import JobSubmitter, JobSpec, JobStatus
116
+
117
+ @dataclass
118
+ class SlurmConfig:
119
+ """SLURM-specific configuration."""
120
+ partition: str = "standard"
121
+ account: str = ""
122
+ qos: str = "normal"
123
+ modules: list[str] = None # Modules to load before running
124
+
125
+ def __post_init__(self):
126
+ if self.modules is None:
127
+ self.modules = []
128
+
129
class SlurmSubmitter(JobSubmitter):
    """Submit and manage SLURM jobs through the ``sbatch`` and ``sacct`` commands."""

    def __init__(self, config: SlurmConfig):
        self.config = config

    def submit(self, spec: JobSpec) -> str:
        """Submit job to SLURM, return job ID.

        Writes ``job.slurm`` into ``spec.case_dir`` and passes it to
        ``sbatch``. Raises ``subprocess.CalledProcessError`` when sbatch
        exits with a non-zero status.
        """
        script = self._generate_script(spec)
        script_path = spec.case_dir / "job.slurm"
        script_path.write_text(script)

        result = subprocess.run(
            ["sbatch", str(script_path)],
            capture_output=True, text=True, check=True,
        )
        # Parse job ID from "Submitted batch job 12345"
        job_id = result.stdout.strip().split()[-1]
        return job_id

    def status(self, job_id: str) -> JobStatus:
        """Query SLURM for job status.

        Only the first line reported by ``sacct`` is considered. States that
        are not recognised (including empty output, e.g. before the job is
        visible to accounting) are reported as ``JobStatus.PENDING``.
        """
        result = subprocess.run(
            ["sacct", "-j", job_id, "--format=State", "--noheader", "--parsable2"],
            capture_output=True, text=True,
        )
        lines = result.stdout.strip().splitlines()
        raw_state = lines[0].strip() if lines else ""
        # sacct can decorate the state (e.g. "CANCELLED by 1234"); keep only
        # the leading state name so such jobs are not mistaken for pending.
        state = raw_state.split()[0] if raw_state else ""

        status_map = {
            "PENDING": JobStatus.PENDING,
            "REQUEUED": JobStatus.PENDING,
            "RUNNING": JobStatus.RUNNING,
            "COMPLETED": JobStatus.COMPLETED,
            "FAILED": JobStatus.FAILED,
            "TIMEOUT": JobStatus.TIMEOUT,
            "CANCELLED": JobStatus.FAILED,
            "OUT_OF_MEMORY": JobStatus.FAILED,
            # Other terminal states: without these, wait() would poll forever.
            "NODE_FAIL": JobStatus.FAILED,
            "BOOT_FAIL": JobStatus.FAILED,
            "DEADLINE": JobStatus.FAILED,
            "PREEMPTED": JobStatus.FAILED,
        }
        return status_map.get(state, JobStatus.PENDING)

    def wait(self, job_id: str, poll_interval: float = 30.0) -> JobStatus:
        """Wait for job completion.

        Polls ``status`` every ``poll_interval`` seconds and returns the first
        terminal status (COMPLETED, FAILED or TIMEOUT). There is no overall
        timeout.
        """
        import time
        while True:
            s = self.status(job_id)
            if s in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.TIMEOUT):
                return s
            time.sleep(poll_interval)

    def submit_array(self, specs: list[JobSpec], array_size: int | None = None) -> str:
        """Submit a SLURM job array for parameter sweeps.

        One array task is created per spec. Resources and the command are
        taken from ``specs[0]``; each task only differs in the case directory
        it changes into. ``array_size`` is accepted for interface
        compatibility and is currently unused.

        Raises ``ValueError`` for an empty spec list and
        ``subprocess.CalledProcessError`` when sbatch fails.
        """
        if not specs:
            raise ValueError("Empty spec list")

        # Write parameter files (one case directory per line)
        base_dir = specs[0].case_dir.parent
        param_file = base_dir / "array_params.txt"
        param_file.write_text("\n".join(str(s.case_dir) for s in specs) + "\n")

        # The array script logs into <base_dir>/logs; create it up front so
        # the tasks have somewhere to write their output.
        (base_dir / "logs").mkdir(parents=True, exist_ok=True)

        script = self._generate_array_script(specs[0], len(specs))
        script_path = base_dir / "array_job.slurm"
        script_path.write_text(script)

        result = subprocess.run(
            ["sbatch", str(script_path)],
            capture_output=True, text=True, check=True,
        )
        return result.stdout.strip().split()[-1]

    @staticmethod
    def _format_wall_time(wall_time_hours: float) -> str:
        """Format fractional hours as ``HH:MM:00``, rounded to the nearest minute.

        Rounding (rather than truncating) avoids losing a minute to float
        error, e.g. 2.3 h must become 02:18:00, not 02:17:00.
        """
        total_minutes = round(wall_time_hours * 60)
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02d}:{minutes:02d}:00"

    def _generate_script(self, spec: JobSpec) -> str:
        """Generate SLURM batch script."""
        import shlex

        modules_str = "\n".join(f"module load {m}" for m in self.config.modules)
        lines = [
            "#!/bin/bash",
            f"#SBATCH --job-name=sim_{spec.case_dir.name}",
            f"#SBATCH --partition={self.config.partition}",
            f"#SBATCH --account={self.config.account}",
            f"#SBATCH --qos={self.config.qos}",
            f"#SBATCH --ntasks={spec.num_cores}",
            f"#SBATCH --mem={spec.memory_gb}G",
            f"#SBATCH --time={self._format_wall_time(spec.wall_time_hours)}",
            f"#SBATCH --output={spec.case_dir}/slurm_%j.out",
            f"#SBATCH --error={spec.case_dir}/slurm_%j.err",
            "",
            modules_str,
            "",
            # Quote the path so directories containing spaces survive the shell.
            f"cd {shlex.quote(str(spec.case_dir))}",
            # NOTE: command tokens are joined verbatim (no shell quoting);
            # callers must pre-quote any argument that contains whitespace.
            " ".join(spec.command),
            "",
        ]
        return "\n".join(lines)

    def _generate_array_script(self, template_spec: JobSpec, array_count: int) -> str:
        """Generate SLURM array job script.

        Paths to the parameter file and the log directory are derived from
        ``template_spec.case_dir.parent``, the same directory
        ``submit_array`` writes ``array_params.txt`` into.
        """
        import shlex

        base_dir = template_spec.case_dir.parent
        param_file = shlex.quote(str(base_dir / "array_params.txt"))
        modules_str = "\n".join(f"module load {m}" for m in self.config.modules)
        lines = [
            "#!/bin/bash",
            "#SBATCH --job-name=sweep",
            f"#SBATCH --partition={self.config.partition}",
            f"#SBATCH --account={self.config.account}",
            f"#SBATCH --qos={self.config.qos}",
            f"#SBATCH --array=0-{array_count - 1}",
            f"#SBATCH --ntasks={template_spec.num_cores}",
            f"#SBATCH --mem={template_spec.memory_gb}G",
            f"#SBATCH --time={self._format_wall_time(template_spec.wall_time_hours)}",
            f"#SBATCH --output={base_dir}/logs/slurm_%A_%a.out",
            "",
            modules_str,
            "",
            # Task IDs start at 0 but sed line addresses start at 1.
            f'CASE_DIR=$(sed -n "$((SLURM_ARRAY_TASK_ID + 1))p" {param_file})',
            'cd "$CASE_DIR"',
            " ".join(template_spec.command),
            "",
        ]
        return "\n".join(lines)
239
+ ```
240
+
241
+ ### Parallelization Strategies
242
+
243
+ Choose between parameter-level and simulation-level parallelism:
244
+
245
+ ```python
246
+ # src/simulation/compute/parallel.py
247
+ from dataclasses import dataclass
248
+ from enum import Enum
249
+ from typing import Any
250
+
251
class ParallelStrategy(Enum):
    """Ways of spreading a core allocation over a set of simulations."""

    # Many independent single-core runs
    PARAMETER_LEVEL = "parameter"
    # Few multi-core runs (MPI)
    SIMULATION_LEVEL = "simulation"
    # Some of each
    HYBRID = "hybrid"
255
+
256
@dataclass
class ParallelConfig:
    """Describes how a pool of cores is divided among simulations."""

    strategy: ParallelStrategy
    total_cores: int
    # For simulation-level parallelism
    cores_per_simulation: int = 1
    # For parameter-level parallelism
    max_concurrent: int | None = None

    @property
    def concurrent_simulations(self) -> int:
        """Number of simulations that run side by side under this config.

        Simulation-level: ``total_cores // cores_per_simulation``.
        Otherwise a truthy ``max_concurrent`` wins; without one,
        parameter-level falls back to ``total_cores`` and hybrid to
        ``total_cores // cores_per_simulation``.
        """
        mode = self.strategy
        if mode == ParallelStrategy.SIMULATION_LEVEL:
            return self.total_cores // self.cores_per_simulation
        if self.max_concurrent:
            return self.max_concurrent
        if mode == ParallelStrategy.PARAMETER_LEVEL:
            return self.total_cores
        # Hybrid without an explicit cap
        return self.total_cores // self.cores_per_simulation
272
+
273
def recommend_strategy(
    single_run_time_hours: float,
    num_evaluations: int,
    available_cores: int,
    strong_scaling_efficiency: float = 0.7,
) -> ParallelConfig:
    """Recommend parallelization strategy based on problem characteristics.

    Rules:
    - If single run < 10 min: parameter-level (overhead of MPI not worth it)
    - If single run > 4 hours and scales well (efficiency > 0.6):
      simulation-level, capped at 64 cores per simulation
    - Otherwise: hybrid (medium MPI + some parameter parallelism)

    Every returned config uses at least one core per simulation, so small
    allocations (fewer than four cores) no longer trigger a division by zero.
    """
    if single_run_time_hours < 1/6:  # < 10 minutes
        return ParallelConfig(
            strategy=ParallelStrategy.PARAMETER_LEVEL,
            total_cores=available_cores,
            cores_per_simulation=1,
            max_concurrent=min(available_cores, num_evaluations),
        )

    if single_run_time_hours > 4 and strong_scaling_efficiency > 0.6:
        # Cap at 64 for scaling; floor at 1 so downstream divisions stay valid.
        cores_per_sim = max(1, min(available_cores, 64))
        return ParallelConfig(
            strategy=ParallelStrategy.SIMULATION_LEVEL,
            total_cores=available_cores,
            cores_per_simulation=cores_per_sim,
        )

    # Hybrid: balance between parallel sims and cores per sim.
    # available_cores // 4 is 0 for fewer than 4 cores, hence the floor of 1.
    cores_per_sim = max(1, min(8, available_cores // 4))
    return ParallelConfig(
        strategy=ParallelStrategy.HYBRID,
        total_cores=available_cores,
        cores_per_simulation=cores_per_sim,
        max_concurrent=available_cores // cores_per_sim,
    )
310
+ ```
311
+
312
+ ### Checkpoint/Restart
313
+
314
+ Enable long simulations to survive time limits and preemption:
315
+
316
+ ```python
317
+ # src/simulation/compute/checkpoint.py
318
+ from dataclasses import dataclass
319
+ from pathlib import Path
320
+ from typing import Any
321
+ import json
322
+ import shutil
323
+ import time
324
+
325
@dataclass
class CheckpointConfig:
    """Tunable settings for checkpoint/restart."""

    # Minimum minutes between two checkpoints
    checkpoint_interval_minutes: float = 30.0
    # How many checkpoints to retain before the oldest are pruned
    max_checkpoints_kept: int = 3
    # Checkpoint directory (joined onto the case directory by the manager)
    checkpoint_dir: Path = Path("checkpoints")
    compress: bool = True
332
+
333
+ class CheckpointManager:
334
+ """Manages checkpoint creation and restoration for simulations."""
335
+
336
+ def __init__(self, case_dir: Path, config: CheckpointConfig):
337
+ self.case_dir = case_dir
338
+ self.config = config
339
+ self.checkpoint_base = case_dir / config.checkpoint_dir
340
+ self.checkpoint_base.mkdir(parents=True, exist_ok=True)
341
+ self._last_checkpoint_time = time.time()
342
+
343
+ def should_checkpoint(self) -> bool:
344
+ """Check if enough time has elapsed for a new checkpoint."""
345
+ elapsed = (time.time() - self._last_checkpoint_time) / 60
346
+ return elapsed >= self.config.checkpoint_interval_minutes
347
+
348
+ def save_checkpoint(self, state: dict[str, Any], iteration: int) -> Path:
349
+ """Save simulation state to a checkpoint."""
350
+ checkpoint_name = f"checkpoint_{iteration:06d}"
351
+ checkpoint_path = self.checkpoint_base / checkpoint_name
352
+
353
+ if checkpoint_path.exists():
354
+ shutil.rmtree(checkpoint_path)
355
+ checkpoint_path.mkdir()
356
+
357
+ # Save metadata
358
+ metadata = {
359
+ "iteration": iteration,
360
+ "timestamp": time.time(),
361
+ "state_keys": list(state.keys()),
362
+ }
363
+ (checkpoint_path / "metadata.json").write_text(json.dumps(metadata))
364
+
365
+ # Save state (simulation-specific files)
366
+ for key, value in state.items():
367
+ if isinstance(value, Path) and value.exists():
368
+ # Copy simulation output files
369
+ shutil.copy2(value, checkpoint_path / value.name)
370
+ else:
371
+ # Serialize scalar/dict state
372
+ (checkpoint_path / f"{key}.json").write_text(json.dumps(value, default=str))
373
+
374
+ self._last_checkpoint_time = time.time()
375
+ self._cleanup_old_checkpoints()
376
+
377
+ return checkpoint_path
378
+
379
+ def latest_checkpoint(self) -> Path | None:
380
+ """Find the most recent valid checkpoint."""
381
+ checkpoints = sorted(self.checkpoint_base.iterdir(), reverse=True)
382
+ for cp in checkpoints:
383
+ metadata_file = cp / "metadata.json"
384
+ if metadata_file.exists():
385
+ return cp
386
+ return None
387
+
388
+ def restore_checkpoint(self, checkpoint_path: Path) -> dict[str, Any]:
389
+ """Restore state from a checkpoint."""
390
+ metadata = json.loads((checkpoint_path / "metadata.json").read_text())
391
+ state = {"_iteration": metadata["iteration"]}
392
+
393
+ for key in metadata["state_keys"]:
394
+ json_file = checkpoint_path / f"{key}.json"
395
+ if json_file.exists():
396
+ state[key] = json.loads(json_file.read_text())
397
+
398
+ return state
399
+
400
+ def _cleanup_old_checkpoints(self) -> None:
401
+ """Remove oldest checkpoints beyond max_checkpoints_kept."""
402
+ checkpoints = sorted(self.checkpoint_base.iterdir())
403
+ while len(checkpoints) > self.config.max_checkpoints_kept:
404
+ oldest = checkpoints.pop(0)
405
+ shutil.rmtree(oldest)
406
+ ```
407
+
408
+ ### Resource Monitoring
409
+
410
+ Track resource utilization to optimize allocations and detect waste:
411
+
412
+ ```python
413
+ # src/simulation/compute/monitoring.py
414
+ from dataclasses import dataclass
415
+ import time
416
+ from typing import Any
417
+
418
@dataclass
class ResourceSnapshot:
    """A single resource-usage sample taken at one moment."""

    timestamp: float
    cpu_percent: float
    # Memory in GB: amount in use and amount available in total
    memory_used_gb: float
    memory_total_gb: float
    # Disk I/O in MB: reads and writes
    disk_io_read_mb: float
    disk_io_write_mb: float
427
+
428
@dataclass
class JobResourceReport:
    """Resource-usage summary for a finished job, with sizing suggestions."""

    job_id: str
    wall_time_hours: float
    # actual CPU time / (wall time * cores)
    cpu_efficiency: float
    peak_memory_gb: float
    avg_memory_gb: float
    total_io_gb: float
    # Suggested allocation for the next run of a similar job
    recommended_cores: int
    recommended_memory_gb: float
439
+
440
def analyze_job_efficiency(
    snapshots: list[ResourceSnapshot],
    allocated_cores: int,
    allocated_memory_gb: float,
    job_id: str = "unknown",
) -> JobResourceReport:
    """Analyze resource usage to recommend better allocations.

    Args:
        snapshots: Samples collected while the job ran, in time order.
        allocated_cores: Cores the job was given.
        allocated_memory_gb: Memory the job was given (currently not used in
            the recommendation, which is based on observed peak usage).
        job_id: Identifier copied into the report.

    Returns:
        A report with average/peak usage and suggested core and memory
        allocations. With no snapshots, a zeroed report recommending one core
        and 4 GB is returned.
    """
    if not snapshots:
        return JobResourceReport(
            job_id=job_id, wall_time_hours=0,
            cpu_efficiency=0, peak_memory_gb=0, avg_memory_gb=0,
            total_io_gb=0, recommended_cores=1, recommended_memory_gb=4,
        )

    cpu_values = [s.cpu_percent for s in snapshots]
    mem_values = [s.memory_used_gb for s in snapshots]

    avg_cpu = sum(cpu_values) / len(cpu_values)
    peak_mem = max(mem_values)
    avg_mem = sum(mem_values) / len(mem_values)

    # CPU efficiency: how much of allocated CPU was actually used.
    # NOTE(review): this treats cpu_percent as summed over cores (100 = one
    # fully busy core) -- confirm against whatever produces the snapshots.
    cpu_efficiency = avg_cpu / (allocated_cores * 100) if allocated_cores > 0 else 0.0

    # Recommendations
    # Cores: size for ~80% utilisation of the cores that were actually busy.
    # Uses the same cpu_percent convention as cpu_efficiency above, so a job
    # at 50% efficiency is told to shrink rather than to grow.
    busy_cores = avg_cpu / 100
    recommended_cores = max(1, int(busy_cores / 0.8))
    # Memory: peak + 20% headroom
    recommended_memory_gb = round(peak_mem * 1.2, 1)

    wall_time = (snapshots[-1].timestamp - snapshots[0].timestamp) / 3600

    # Total I/O covers reads and writes.
    # NOTE(review): assumes each snapshot holds per-interval MB, not a
    # cumulative counter -- confirm with the collector.
    total_io_mb = sum(s.disk_io_read_mb + s.disk_io_write_mb for s in snapshots)

    return JobResourceReport(
        job_id=job_id,
        wall_time_hours=wall_time,
        cpu_efficiency=cpu_efficiency,
        peak_memory_gb=peak_mem,
        avg_memory_gb=avg_mem,
        total_io_gb=total_io_mb / 1024,
        recommended_cores=recommended_cores,
        recommended_memory_gb=recommended_memory_gb,
    )
481
+ ```
482
+
483
+ ### Cost Estimation
484
+
485
+ Estimate campaign costs before committing resources:
486
+
487
+ ```python
488
+ # src/simulation/compute/cost.py
489
+ from dataclasses import dataclass
490
+
491
@dataclass
class CostEstimate:
    """Projected core-hours and dollar cost of a simulation campaign."""

    # Campaign totals
    total_core_hours: float
    total_cost_usd: float
    # Figures for a single run
    per_run_core_hours: float
    per_run_cost_usd: float
    num_runs: int
    # "high", "medium", "low"
    confidence: str
    # Human-readable statements the estimate is based on
    assumptions: list[str]
501
+
502
def estimate_campaign_cost(
    single_run_hours: float,
    cores_per_run: int,
    num_runs: int,
    cost_per_core_hour: float = 0.05,  # Typical HPC rate
    overhead_factor: float = 1.3,  # Queue wait, failed runs, restarts
) -> CostEstimate:
    """Project the core-hours and cost of a parameter sweep.

    Per-run core-hours are ``single_run_hours * cores_per_run``; the campaign
    total multiplies that by ``num_runs`` and ``overhead_factor``. Per-run
    figures in the result exclude the overhead factor.

    Confidence is "high" below 100 runs (with a positive run time), "medium"
    below 1000 runs, and "low" otherwise.
    """
    core_hours_each = single_run_hours * cores_per_run
    padded_core_hours = core_hours_each * num_runs * overhead_factor

    notes = [
        f"Single run: {single_run_hours:.1f}h on {cores_per_run} cores",
        f"Overhead factor: {overhead_factor}x (failed runs, queue inefficiency)",
        f"Cost rate: ${cost_per_core_hour}/core-hour",
    ]

    # Grade confidence by campaign size
    if num_runs < 100 and single_run_hours > 0:
        level = "high"
    elif not (num_runs < 1000):
        level = "low"
        notes.append("Large campaign: actual costs may vary significantly")
    else:
        level = "medium"

    return CostEstimate(
        total_core_hours=padded_core_hours,
        total_cost_usd=padded_core_hours * cost_per_core_hour,
        per_run_core_hours=core_hours_each,
        per_run_cost_usd=core_hours_each * cost_per_core_hour,
        num_runs=num_runs,
        confidence=level,
        assumptions=notes,
    )
538
+ ```