@zigrivers/scaffold 3.13.0 → 3.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -10
- package/content/knowledge/research/research-architecture.md +385 -0
- package/content/knowledge/research/research-conventions.md +248 -0
- package/content/knowledge/research/research-dev-environment.md +303 -0
- package/content/knowledge/research/research-experiment-loop.md +429 -0
- package/content/knowledge/research/research-experiment-tracking.md +336 -0
- package/content/knowledge/research/research-ml-architecture-search.md +383 -0
- package/content/knowledge/research/research-ml-evaluation.md +407 -0
- package/content/knowledge/research/research-ml-experiment-tracking.md +466 -0
- package/content/knowledge/research/research-ml-training-patterns.md +413 -0
- package/content/knowledge/research/research-observability.md +395 -0
- package/content/knowledge/research/research-overfitting-prevention.md +306 -0
- package/content/knowledge/research/research-project-structure.md +264 -0
- package/content/knowledge/research/research-quant-backtesting.md +326 -0
- package/content/knowledge/research/research-quant-market-data.md +366 -0
- package/content/knowledge/research/research-quant-metrics.md +335 -0
- package/content/knowledge/research/research-quant-requirements.md +223 -0
- package/content/knowledge/research/research-quant-risk.md +469 -0
- package/content/knowledge/research/research-quant-strategy-patterns.md +412 -0
- package/content/knowledge/research/research-requirements.md +201 -0
- package/content/knowledge/research/research-security.md +374 -0
- package/content/knowledge/research/research-sim-compute-management.md +538 -0
- package/content/knowledge/research/research-sim-engine-patterns.md +448 -0
- package/content/knowledge/research/research-sim-parameter-spaces.md +425 -0
- package/content/knowledge/research/research-sim-validation.md +456 -0
- package/content/knowledge/research/research-testing.md +334 -0
- package/content/methodology/research-ml-research.yml +23 -0
- package/content/methodology/research-overlay.yml +65 -0
- package/content/methodology/research-quant-finance.yml +29 -0
- package/content/methodology/research-simulation.yml +23 -0
- package/dist/cli/commands/adopt.d.ts.map +1 -1
- package/dist/cli/commands/adopt.js +30 -8
- package/dist/cli/commands/adopt.js.map +1 -1
- package/dist/cli/commands/adopt.serialization.test.js +49 -0
- package/dist/cli/commands/adopt.serialization.test.js.map +1 -1
- package/dist/cli/commands/adopt.test.js +8 -0
- package/dist/cli/commands/adopt.test.js.map +1 -1
- package/dist/cli/commands/build.d.ts.map +1 -1
- package/dist/cli/commands/build.js +191 -180
- package/dist/cli/commands/build.js.map +1 -1
- package/dist/cli/commands/complete.d.ts.map +1 -1
- package/dist/cli/commands/complete.js +16 -12
- package/dist/cli/commands/complete.js.map +1 -1
- package/dist/cli/commands/complete.test.js +14 -5
- package/dist/cli/commands/complete.test.js.map +1 -1
- package/dist/cli/commands/init.d.ts +4 -0
- package/dist/cli/commands/init.d.ts.map +1 -1
- package/dist/cli/commands/init.js +75 -51
- package/dist/cli/commands/init.js.map +1 -1
- package/dist/cli/commands/init.test.js +33 -27
- package/dist/cli/commands/init.test.js.map +1 -1
- package/dist/cli/commands/reset.d.ts.map +1 -1
- package/dist/cli/commands/reset.js +44 -40
- package/dist/cli/commands/reset.js.map +1 -1
- package/dist/cli/commands/reset.test.js +42 -20
- package/dist/cli/commands/reset.test.js.map +1 -1
- package/dist/cli/commands/rework.d.ts.map +1 -1
- package/dist/cli/commands/rework.js +16 -12
- package/dist/cli/commands/rework.js.map +1 -1
- package/dist/cli/commands/rework.test.js +12 -3
- package/dist/cli/commands/rework.test.js.map +1 -1
- package/dist/cli/commands/run.d.ts.map +1 -1
- package/dist/cli/commands/run.js +318 -298
- package/dist/cli/commands/run.js.map +1 -1
- package/dist/cli/commands/run.test.js +92 -120
- package/dist/cli/commands/run.test.js.map +1 -1
- package/dist/cli/commands/skip.d.ts.map +1 -1
- package/dist/cli/commands/skip.js +19 -15
- package/dist/cli/commands/skip.js.map +1 -1
- package/dist/cli/commands/skip.test.js +22 -11
- package/dist/cli/commands/skip.test.js.map +1 -1
- package/dist/cli/commands/update.d.ts.map +1 -1
- package/dist/cli/commands/update.js +3 -1
- package/dist/cli/commands/update.js.map +1 -1
- package/dist/cli/commands/update.test.js +8 -4
- package/dist/cli/commands/update.test.js.map +1 -1
- package/dist/cli/commands/version.d.ts.map +1 -1
- package/dist/cli/commands/version.js +3 -1
- package/dist/cli/commands/version.js.map +1 -1
- package/dist/cli/commands/version.test.js +9 -5
- package/dist/cli/commands/version.test.js.map +1 -1
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +2 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/init-flag-families.d.ts +6 -1
- package/dist/cli/init-flag-families.d.ts.map +1 -1
- package/dist/cli/init-flag-families.js +32 -1
- package/dist/cli/init-flag-families.js.map +1 -1
- package/dist/cli/init-flag-families.test.js +47 -0
- package/dist/cli/init-flag-families.test.js.map +1 -1
- package/dist/cli/output/interactive.d.ts +1 -0
- package/dist/cli/output/interactive.d.ts.map +1 -1
- package/dist/cli/output/interactive.js +5 -0
- package/dist/cli/output/interactive.js.map +1 -1
- package/dist/cli/shutdown.d.ts +51 -0
- package/dist/cli/shutdown.d.ts.map +1 -0
- package/dist/cli/shutdown.js +199 -0
- package/dist/cli/shutdown.js.map +1 -0
- package/dist/cli/shutdown.test.d.ts +2 -0
- package/dist/cli/shutdown.test.d.ts.map +1 -0
- package/dist/cli/shutdown.test.js +316 -0
- package/dist/cli/shutdown.test.js.map +1 -0
- package/dist/config/schema.d.ts +272 -16
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +25 -1
- package/dist/config/schema.js.map +1 -1
- package/dist/config/schema.test.js +103 -3
- package/dist/config/schema.test.js.map +1 -1
- package/dist/core/assembly/overlay-loader.d.ts +12 -0
- package/dist/core/assembly/overlay-loader.d.ts.map +1 -1
- package/dist/core/assembly/overlay-loader.js +30 -0
- package/dist/core/assembly/overlay-loader.js.map +1 -1
- package/dist/core/assembly/overlay-loader.test.js +66 -1
- package/dist/core/assembly/overlay-loader.test.js.map +1 -1
- package/dist/core/assembly/overlay-state-resolver.d.ts.map +1 -1
- package/dist/core/assembly/overlay-state-resolver.js +48 -19
- package/dist/core/assembly/overlay-state-resolver.js.map +1 -1
- package/dist/core/assembly/overlay-state-resolver.test.js +80 -0
- package/dist/core/assembly/overlay-state-resolver.test.js.map +1 -1
- package/dist/e2e/init.test.js +5 -4
- package/dist/e2e/init.test.js.map +1 -1
- package/dist/e2e/project-type-overlays.test.js +119 -0
- package/dist/e2e/project-type-overlays.test.js.map +1 -1
- package/dist/project/adopt.d.ts.map +1 -1
- package/dist/project/adopt.js +3 -1
- package/dist/project/adopt.js.map +1 -1
- package/dist/project/detectors/disambiguate.js +1 -1
- package/dist/project/detectors/disambiguate.js.map +1 -1
- package/dist/project/detectors/index.d.ts.map +1 -1
- package/dist/project/detectors/index.js +2 -1
- package/dist/project/detectors/index.js.map +1 -1
- package/dist/project/detectors/ml.d.ts.map +1 -1
- package/dist/project/detectors/ml.js +2 -6
- package/dist/project/detectors/ml.js.map +1 -1
- package/dist/project/detectors/research.d.ts +4 -0
- package/dist/project/detectors/research.d.ts.map +1 -0
- package/dist/project/detectors/research.js +141 -0
- package/dist/project/detectors/research.js.map +1 -0
- package/dist/project/detectors/research.test.d.ts +2 -0
- package/dist/project/detectors/research.test.d.ts.map +1 -0
- package/dist/project/detectors/research.test.js +235 -0
- package/dist/project/detectors/research.test.js.map +1 -0
- package/dist/project/detectors/shared-signals.d.ts +3 -0
- package/dist/project/detectors/shared-signals.d.ts.map +1 -0
- package/dist/project/detectors/shared-signals.js +9 -0
- package/dist/project/detectors/shared-signals.js.map +1 -0
- package/dist/project/detectors/types.d.ts +6 -2
- package/dist/project/detectors/types.d.ts.map +1 -1
- package/dist/project/detectors/types.js.map +1 -1
- package/dist/state/lock-manager.d.ts +1 -0
- package/dist/state/lock-manager.d.ts.map +1 -1
- package/dist/state/lock-manager.js +1 -1
- package/dist/state/lock-manager.js.map +1 -1
- package/dist/types/config.d.ts +7 -1
- package/dist/types/config.d.ts.map +1 -1
- package/dist/wizard/copy/core.d.ts.map +1 -1
- package/dist/wizard/copy/core.js +4 -0
- package/dist/wizard/copy/core.js.map +1 -1
- package/dist/wizard/copy/index.d.ts.map +1 -1
- package/dist/wizard/copy/index.js +2 -0
- package/dist/wizard/copy/index.js.map +1 -1
- package/dist/wizard/copy/research.d.ts +3 -0
- package/dist/wizard/copy/research.d.ts.map +1 -0
- package/dist/wizard/copy/research.js +27 -0
- package/dist/wizard/copy/research.js.map +1 -0
- package/dist/wizard/copy/types.d.ts +5 -1
- package/dist/wizard/copy/types.d.ts.map +1 -1
- package/dist/wizard/flags.d.ts +7 -1
- package/dist/wizard/flags.d.ts.map +1 -1
- package/dist/wizard/questions.d.ts +4 -2
- package/dist/wizard/questions.d.ts.map +1 -1
- package/dist/wizard/questions.js +27 -1
- package/dist/wizard/questions.js.map +1 -1
- package/dist/wizard/questions.test.js +51 -0
- package/dist/wizard/questions.test.js.map +1 -1
- package/dist/wizard/wizard.d.ts +3 -2
- package/dist/wizard/wizard.d.ts.map +1 -1
- package/dist/wizard/wizard.js +3 -1
- package/dist/wizard/wizard.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,538 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: research-sim-compute-management
|
|
3
|
+
description: Compute resource management for simulations including wall-clock budgets, job scheduling with SLURM and PBS, parallelization strategies, checkpoint/restart for long simulations, resource monitoring, and cost estimation
|
|
4
|
+
topics: [research, simulation, compute-management, slurm, pbs, parallelization, checkpoint-restart, resource-monitoring, cost-estimation, hpc, wall-clock-budget]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
Simulation-based research consumes significant compute resources -- a single CFD run can take hours, and parameter sweeps multiply that by hundreds or thousands of evaluations. Effective compute management determines whether a research budget yields 50 useful results or 500. The key challenges are: allocating wall-clock budgets across the experiment campaign, scheduling jobs efficiently on shared HPC resources, choosing the right parallelization level (across parameters vs within simulations), implementing checkpoint/restart for runs that exceed time limits, and monitoring resource usage to prevent waste.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
Set explicit wall-clock budgets at the campaign level and enforce them through job-level time limits. Use SLURM or PBS job arrays for parameter sweeps, with dependency chains for multi-stage workflows. Choose between parameter-level parallelism (many independent simulations) and simulation-level parallelism (MPI domain decomposition within one run) based on problem characteristics. Implement checkpoint/restart so long simulations survive scheduler preemption and time limits. Monitor resource utilization (CPU, memory, I/O) to right-size allocations and detect inefficient runs early. Estimate costs before launching campaigns to avoid budget overruns.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### Wall-Clock Budget Management
|
|
16
|
+
|
|
17
|
+
Define budgets at multiple levels and enforce them programmatically:
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
# src/simulation/compute/budget.py
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from datetime import datetime, timedelta
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
@dataclass
class ComputeBudget:
    """Hierarchical compute budget for a research campaign.

    Holds the campaign-wide core-hour allowance together with running
    counters of what has been consumed and how many jobs were submitted,
    completed and failed. A fraction of the allowance
    (``reserve_fraction``) is withheld from ``remaining_core_hours``.
    """

    total_core_hours: float
    max_wall_time_per_job: timedelta = timedelta(hours=24)
    max_concurrent_jobs: int = 50
    reserve_fraction: float = 0.1  # Hold back 10% for follow-up runs

    # Tracking
    used_core_hours: float = 0.0
    jobs_submitted: int = 0
    jobs_completed: int = 0
    jobs_failed: int = 0

    @property
    def remaining_core_hours(self) -> float:
        """Core-hours still spendable after the reserve is set aside.

        The value goes negative once usage exceeds the non-reserved share.
        """
        return self.total_core_hours * (1 - self.reserve_fraction) - self.used_core_hours

    @property
    def utilization(self) -> float:
        """Fraction of the total allowance used so far (0 for an empty budget)."""
        return self.used_core_hours / self.total_core_hours if self.total_core_hours > 0 else 0

    def can_submit(self, estimated_core_hours: float) -> bool:
        """Check if budget allows submitting a new job.

        A job is refused when its estimate exceeds the remaining core-hours
        or when the number of in-flight jobs (submitted minus completed)
        has reached ``max_concurrent_jobs``.
        """
        if estimated_core_hours > self.remaining_core_hours:
            return False
        if self.jobs_submitted - self.jobs_completed >= self.max_concurrent_jobs:
            return False
        return True

    def record_completion(self, actual_core_hours: float, success: bool) -> None:
        """Record a completed job.

        Failed jobs still count as completed and still consume their
        core-hours; they are additionally tallied in ``jobs_failed``.
        """
        self.used_core_hours += actual_core_hours
        self.jobs_completed += 1
        if not success:
            self.jobs_failed += 1

    def estimate_remaining_runs(self, avg_core_hours_per_run: float) -> int:
        """Estimate how many more runs the budget supports.

        Returns 0 for a non-positive average and never returns a negative
        count, even when the budget is already overspent.
        """
        if avg_core_hours_per_run <= 0:
            return 0
        # remaining_core_hours can be negative after an overrun; a negative
        # number of runs is meaningless to callers, so floor the result at 0.
        return max(0, int(self.remaining_core_hours / avg_core_hours_per_run))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class BudgetEnforcer:
    """Enforces budget constraints on job submission.

    Wraps a budget object and keeps an in-memory history of granted
    allocations and of observed runtimes. The runtimes feed
    ``adaptive_time_limit``.
    """

    def __init__(self, budget: ComputeBudget):
        self.budget = budget
        # Chronological log; entries carry either "estimated_hours"
        # (allocation granted) or "actual_hours" (runtime observed).
        self._history: list[dict[str, Any]] = []

    def request_allocation(self, job_spec: dict[str, Any]) -> bool:
        """Request permission to submit a job. Returns True if allowed.

        ``job_spec`` must provide ``"num_cores"`` and ``"wall_hours"``;
        their product is the core-hour estimate checked against the budget.
        On success the request is logged and the budget's submitted-job
        counter is incremented.
        """
        est_hours = job_spec["num_cores"] * job_spec["wall_hours"]
        if not self.budget.can_submit(est_hours):
            return False
        self._history.append({
            "timestamp": datetime.now().isoformat(),
            "job_spec": job_spec,
            "estimated_hours": est_hours,
        })
        self.budget.jobs_submitted += 1
        return True

    def record_actual_hours(self, actual_hours: float) -> None:
        """Log the observed wall-clock runtime of a finished job.

        These observations are the only input to ``adaptive_time_limit``;
        without them the adaptive limit can never engage.

        Raises:
            ValueError: if ``actual_hours`` is negative.
        """
        if actual_hours < 0:
            raise ValueError("actual_hours must be non-negative")
        self._history.append({
            "timestamp": datetime.now().isoformat(),
            "actual_hours": actual_hours,
        })

    def adaptive_time_limit(self, base_hours: float, iteration: int) -> float:
        """Adjust time limits based on observed runtimes.

        Returns ``base_hours`` until at least five runtimes have been
        recorded. After that it returns the smaller of ``base_hours`` and
        1.2 times the 90th percentile of the recorded runtimes, so the
        limit is only ever tightened. ``iteration`` is currently unused.
        """
        completed_times = [
            h["actual_hours"] for h in self._history
            if "actual_hours" in h
        ]
        if len(completed_times) < 5:
            return base_hours  # Not enough data yet

        # Use 90th percentile of observed times + 20% margin
        import numpy as np
        p90 = float(np.percentile(completed_times, 90))
        return min(base_hours, p90 * 1.2)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### SLURM Job Scheduling
|
|
105
|
+
|
|
106
|
+
Submit and manage jobs on SLURM-based HPC clusters:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# src/simulation/compute/slurm.py
|
|
110
|
+
import subprocess
|
|
111
|
+
from dataclasses import dataclass
|
|
112
|
+
from pathlib import Path
|
|
113
|
+
from typing import Any
|
|
114
|
+
|
|
115
|
+
from src.simulation.batch import JobSubmitter, JobSpec, JobStatus
|
|
116
|
+
|
|
117
|
+
@dataclass
|
|
118
|
+
class SlurmConfig:
|
|
119
|
+
"""SLURM-specific configuration."""
|
|
120
|
+
partition: str = "standard"
|
|
121
|
+
account: str = ""
|
|
122
|
+
qos: str = "normal"
|
|
123
|
+
modules: list[str] = None # Modules to load before running
|
|
124
|
+
|
|
125
|
+
def __post_init__(self):
|
|
126
|
+
if self.modules is None:
|
|
127
|
+
self.modules = []
|
|
128
|
+
|
|
129
|
+
class SlurmSubmitter(JobSubmitter):
    """Submit and manage SLURM jobs.

    Batch scripts are generated from a job spec's ``case_dir``,
    ``num_cores``, ``memory_gb``, ``wall_time_hours`` and ``command``
    attributes plus the partition/account/QOS/modules held in the config.
    """

    def __init__(self, config: SlurmConfig):
        self.config = config

    def submit(self, spec: JobSpec) -> str:
        """Submit job to SLURM, return job ID.

        Writes ``job.slurm`` into the case directory and hands it to
        ``sbatch``. Raises ``subprocess.CalledProcessError`` if ``sbatch``
        exits non-zero.
        """
        script_path = spec.case_dir / "job.slurm"
        script_path.write_text(self._generate_script(spec))

        result = subprocess.run(
            ["sbatch", str(script_path)],
            capture_output=True, text=True, check=True,
        )
        # Parse job ID from "Submitted batch job 12345"
        return result.stdout.strip().split()[-1]

    def status(self, job_id: str) -> JobStatus:
        """Query SLURM for job status.

        Only the first line of ``sacct`` output is considered. States that
        are not recognised, and empty output, are reported as PENDING.
        """
        result = subprocess.run(
            ["sacct", "-j", job_id, "--format=State", "--noheader", "--parsable2"],
            capture_output=True, text=True,
        )
        lines = result.stdout.strip().splitlines()
        tokens = lines[0].split() if lines else []
        # sacct decorates some states ("CANCELLED by 1234", "CANCELLED+").
        # Keep only the bare state name, otherwise the lookup misses and a
        # finished job is reported as PENDING forever.
        state = tokens[0].rstrip("+") if tokens else ""

        status_map = {
            "PENDING": JobStatus.PENDING,
            "RUNNING": JobStatus.RUNNING,
            "COMPLETED": JobStatus.COMPLETED,
            "FAILED": JobStatus.FAILED,
            "TIMEOUT": JobStatus.TIMEOUT,
            "CANCELLED": JobStatus.FAILED,
            "OUT_OF_MEMORY": JobStatus.FAILED,
            # Further terminal failure states; left unmapped they would
            # make wait() poll indefinitely.
            "NODE_FAIL": JobStatus.FAILED,
            "BOOT_FAIL": JobStatus.FAILED,
            "DEADLINE": JobStatus.FAILED,
        }
        return status_map.get(state, JobStatus.PENDING)

    def wait(self, job_id: str, poll_interval: float = 30.0) -> JobStatus:
        """Wait for job completion.

        Polls ``status`` every ``poll_interval`` seconds until the job is
        COMPLETED, FAILED or TIMEOUT, and returns that status. There is no
        overall timeout.
        """
        import time
        while True:
            s = self.status(job_id)
            if s in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.TIMEOUT):
                return s
            time.sleep(poll_interval)

    def submit_array(self, specs: list[JobSpec], array_size: int | None = None) -> str:
        """Submit a SLURM job array for parameter sweeps.

        One array task is created per spec. Resources and the command come
        from the first spec; each task runs in its own case directory.
        ``array_size`` is currently unused and kept for interface
        compatibility.

        Raises:
            ValueError: if ``specs`` is empty.
            subprocess.CalledProcessError: if ``sbatch`` exits non-zero.
        """
        if not specs:
            raise ValueError("Empty spec list")

        # Resolve everything to absolute paths: the batch job starts in the
        # directory sbatch was invoked from, not where the script lives.
        base_dir = specs[0].case_dir.parent.resolve()
        param_file = base_dir / "array_params.txt"
        param_file.write_text("\n".join(str(s.case_dir.resolve()) for s in specs))
        # SLURM does not create the directory named in --output.
        (base_dir / "logs").mkdir(exist_ok=True)

        script_path = base_dir / "array_job.slurm"
        script_path.write_text(self._generate_array_script(specs[0], len(specs)))

        result = subprocess.run(
            ["sbatch", str(script_path)],
            capture_output=True, text=True, check=True,
        )
        return result.stdout.strip().split()[-1]

    @staticmethod
    def _format_wall_time(wall_time_hours: float) -> str:
        """Render fractional hours as HH:MM:SS, rounded to the nearest second."""
        # Rounding avoids losing a minute to float error (2.3 h -> 02:18:00).
        total_seconds = max(0, round(wall_time_hours * 3600))
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def _module_lines(self) -> str:
        """Return one ``module load`` line per configured module."""
        return "\n".join(f"module load {m}" for m in self.config.modules)

    def _generate_script(self, spec: JobSpec) -> str:
        """Generate SLURM batch script."""
        time_limit = self._format_wall_time(spec.wall_time_hours)
        modules_str = self._module_lines()
        command_line = " ".join(spec.command)

        return f"""#!/bin/bash
#SBATCH --job-name=sim_{spec.case_dir.name}
#SBATCH --partition={self.config.partition}
#SBATCH --account={self.config.account}
#SBATCH --qos={self.config.qos}
#SBATCH --ntasks={spec.num_cores}
#SBATCH --mem={spec.memory_gb}G
#SBATCH --time={time_limit}
#SBATCH --output={spec.case_dir}/slurm_%j.out
#SBATCH --error={spec.case_dir}/slurm_%j.err

{modules_str}

cd "{spec.case_dir}"
{command_line}
"""

    def _generate_array_script(self, template_spec: JobSpec, array_count: int) -> str:
        """Generate SLURM array job script.

        The script looks up its case directory in ``array_params.txt``
        (one path per line, next to the template's case directory) using
        the array task index.
        """
        time_limit = self._format_wall_time(template_spec.wall_time_hours)
        modules_str = self._module_lines()
        command_line = " ".join(template_spec.command)
        base_dir = template_spec.case_dir.parent.resolve()
        param_file = base_dir / "array_params.txt"

        return f"""#!/bin/bash
#SBATCH --job-name=sweep
#SBATCH --partition={self.config.partition}
#SBATCH --account={self.config.account}
#SBATCH --qos={self.config.qos}
#SBATCH --array=0-{array_count - 1}
#SBATCH --ntasks={template_spec.num_cores}
#SBATCH --mem={template_spec.memory_gb}G
#SBATCH --time={time_limit}
#SBATCH --output={base_dir}/logs/slurm_%A_%a.out

{modules_str}

# Array task IDs start at 0 but sed line numbers start at 1.
CASE_DIR=$(sed -n "$((SLURM_ARRAY_TASK_ID + 1))p" "{param_file}")
cd "$CASE_DIR"
{command_line}
"""
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Parallelization Strategies
|
|
242
|
+
|
|
243
|
+
Choose between parameter-level and simulation-level parallelism:
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
# src/simulation/compute/parallel.py
|
|
247
|
+
from dataclasses import dataclass
|
|
248
|
+
from enum import Enum
|
|
249
|
+
from typing import Any
|
|
250
|
+
|
|
251
|
+
class ParallelStrategy(Enum):
    """Level at which a simulation campaign is parallelized."""

    # Many independent single-core runs
    PARAMETER_LEVEL = "parameter"
    # Few multi-core runs (MPI)
    SIMULATION_LEVEL = "simulation"
    # Some of each
    HYBRID = "hybrid"
|
|
255
|
+
|
|
256
|
+
@dataclass
class ParallelConfig:
    """Configuration for parallel execution strategy."""

    strategy: ParallelStrategy
    total_cores: int
    # For simulation-level parallelism
    cores_per_simulation: int = 1
    # For parameter-level parallelism
    max_concurrent: int | None = None

    @property
    def concurrent_simulations(self) -> int:
        """Number of simulations that run at the same time.

        Simulation-level: total cores divided by cores per simulation.
        Otherwise a truthy ``max_concurrent`` wins; failing that the value
        is ``total_cores`` (parameter-level) or the same integer division
        (hybrid).
        """
        if self.strategy == ParallelStrategy.SIMULATION_LEVEL:
            return self.total_cores // self.cores_per_simulation
        if self.max_concurrent:
            return self.max_concurrent
        if self.strategy == ParallelStrategy.PARAMETER_LEVEL:
            return self.total_cores
        # Hybrid without an explicit cap
        return self.total_cores // self.cores_per_simulation
|
|
272
|
+
|
|
273
|
+
def recommend_strategy(
    single_run_time_hours: float,
    num_evaluations: int,
    available_cores: int,
    strong_scaling_efficiency: float = 0.7,
) -> ParallelConfig:
    """Recommend parallelization strategy based on problem characteristics.

    Rules:
    - If single run < 10 min: parameter-level (overhead of MPI not worth it)
    - If single run > 4 hours and scales well: simulation-level
    - Otherwise: hybrid (medium MPI + some parameter parallelism)

    "Scales well" means ``strong_scaling_efficiency`` above 0.6. Every
    returned config has at least one core per simulation, including on
    machines with fewer than four cores.
    """
    if single_run_time_hours < 1/6:  # < 10 minutes
        return ParallelConfig(
            strategy=ParallelStrategy.PARAMETER_LEVEL,
            total_cores=available_cores,
            cores_per_simulation=1,
            max_concurrent=min(available_cores, num_evaluations),
        )

    if single_run_time_hours > 4 and strong_scaling_efficiency > 0.6:
        # Cap at 64 for scaling; never go below one core per simulation.
        cores_per_sim = max(1, min(available_cores, 64))
        return ParallelConfig(
            strategy=ParallelStrategy.SIMULATION_LEVEL,
            total_cores=available_cores,
            cores_per_simulation=cores_per_sim,
        )

    # Hybrid: balance between parallel sims and cores per sim.
    # available_cores // 4 is 0 below four cores, which would divide by
    # zero on the next line, so floor the per-simulation share at 1.
    cores_per_sim = max(1, min(8, available_cores // 4))
    return ParallelConfig(
        strategy=ParallelStrategy.HYBRID,
        total_cores=available_cores,
        cores_per_simulation=cores_per_sim,
        max_concurrent=available_cores // cores_per_sim,
    )
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Checkpoint/Restart
|
|
313
|
+
|
|
314
|
+
Enable long simulations to survive time limits and preemption:
|
|
315
|
+
|
|
316
|
+
```python
|
|
317
|
+
# src/simulation/compute/checkpoint.py
|
|
318
|
+
from dataclasses import dataclass
|
|
319
|
+
from pathlib import Path
|
|
320
|
+
from typing import Any
|
|
321
|
+
import json
|
|
322
|
+
import shutil
|
|
323
|
+
import time
|
|
324
|
+
|
|
325
|
+
@dataclass
class CheckpointConfig:
    """Settings that control how checkpoints are taken and retained."""

    # Minimum time between two checkpoints, in minutes
    checkpoint_interval_minutes: float = 30.0
    # How many checkpoints to keep on disk
    max_checkpoints_kept: int = 3
    # Checkpoint directory
    checkpoint_dir: Path = Path("checkpoints")
    # Compression flag
    compress: bool = True
|
|
332
|
+
|
|
333
|
+
class CheckpointManager:
    """Manages checkpoint creation and restoration for simulations.

    Each checkpoint is a directory ``checkpoint_<iteration>`` under
    ``case_dir / config.checkpoint_dir``. A checkpoint counts as valid
    only once its ``metadata.json`` exists, and that file is written last.
    """

    def __init__(self, case_dir: Path, config: CheckpointConfig):
        self.case_dir = case_dir
        self.config = config
        self.checkpoint_base = case_dir / config.checkpoint_dir
        self.checkpoint_base.mkdir(parents=True, exist_ok=True)
        # The interval is measured from construction until the first save.
        self._last_checkpoint_time = time.time()

    def should_checkpoint(self) -> bool:
        """Check if enough time has elapsed for a new checkpoint."""
        elapsed = (time.time() - self._last_checkpoint_time) / 60
        return elapsed >= self.config.checkpoint_interval_minutes

    def save_checkpoint(self, state: dict[str, Any], iteration: int) -> Path:
        """Save simulation state to a checkpoint.

        Values that are existing ``Path`` objects are copied into the
        checkpoint; everything else is written as ``<key>.json`` (objects
        JSON cannot encode are stringified). An existing checkpoint for the
        same iteration is replaced. Returns the checkpoint directory.
        """
        checkpoint_path = self.checkpoint_base / f"checkpoint_{iteration:06d}"

        if checkpoint_path.exists():
            shutil.rmtree(checkpoint_path)
        checkpoint_path.mkdir()

        # Save state (simulation-specific files)
        file_keys: dict[str, str] = {}
        for key, value in state.items():
            if isinstance(value, Path) and value.exists():
                # Copy simulation output files and remember the file name
                # so restore_checkpoint can hand the copy back.
                shutil.copy2(value, checkpoint_path / value.name)
                file_keys[key] = value.name
            else:
                # Serialize scalar/dict state
                (checkpoint_path / f"{key}.json").write_text(json.dumps(value, default=str))

        # Metadata goes last: its presence is what marks the checkpoint as
        # complete, so an interrupted save is never mistaken for a valid one.
        metadata = {
            "iteration": iteration,
            "timestamp": time.time(),
            "state_keys": list(state.keys()),
            "file_keys": file_keys,
        }
        (checkpoint_path / "metadata.json").write_text(json.dumps(metadata))

        self._last_checkpoint_time = time.time()
        self._cleanup_old_checkpoints()

        return checkpoint_path

    def latest_checkpoint(self) -> Path | None:
        """Find the most recent valid checkpoint.

        Returns ``None`` when no checkpoint directory contains a
        ``metadata.json``.
        """
        for cp in reversed(self._checkpoint_dirs()):
            if (cp / "metadata.json").exists():
                return cp
        return None

    def restore_checkpoint(self, checkpoint_path: Path) -> dict[str, Any]:
        """Restore state from a checkpoint.

        The result holds ``"_iteration"`` plus one entry per saved key:
        the decoded JSON value, or for copied files the ``Path`` of the
        copy inside the checkpoint. Keys with neither are omitted.
        """
        metadata = json.loads((checkpoint_path / "metadata.json").read_text())
        state: dict[str, Any] = {"_iteration": metadata["iteration"]}
        # Checkpoints written before file tracking have no "file_keys".
        file_keys = metadata.get("file_keys", {})

        for key in metadata["state_keys"]:
            json_file = checkpoint_path / f"{key}.json"
            if json_file.exists():
                state[key] = json.loads(json_file.read_text())
            elif key in file_keys:
                state[key] = checkpoint_path / file_keys[key]

        return state

    def _checkpoint_dirs(self) -> list[Path]:
        """Return checkpoint directories, oldest iteration first."""
        candidates = [
            entry for entry in self.checkpoint_base.iterdir()
            # Skip stray files and unrelated directories.
            if entry.is_dir() and entry.name.startswith("checkpoint_")
        ]
        # Length first keeps ordering right once the iteration number
        # outgrows its six-digit zero padding.
        return sorted(candidates, key=lambda entry: (len(entry.name), entry.name))

    def _cleanup_old_checkpoints(self) -> None:
        """Remove oldest checkpoints beyond max_checkpoints_kept."""
        checkpoints = self._checkpoint_dirs()
        excess = len(checkpoints) - self.config.max_checkpoints_kept
        for stale in checkpoints[:max(0, excess)]:
            shutil.rmtree(stale)
|
|
406
|
+
```
|
|
407
|
+
|
|
408
|
+
### Resource Monitoring
|
|
409
|
+
|
|
410
|
+
Track resource utilization to optimize allocations and detect waste:
|
|
411
|
+
|
|
412
|
+
```python
|
|
413
|
+
# src/simulation/compute/monitoring.py
|
|
414
|
+
from dataclasses import dataclass
|
|
415
|
+
import time
|
|
416
|
+
from typing import Any
|
|
417
|
+
|
|
418
|
+
@dataclass
class ResourceSnapshot:
    """A single sample of resource usage taken at one moment."""

    # When the sample was taken
    timestamp: float
    # CPU usage in percent
    cpu_percent: float
    # Memory in use and memory available in total, in GB
    memory_used_gb: float
    memory_total_gb: float
    # Disk traffic in MB
    disk_io_read_mb: float
    disk_io_write_mb: float
|
|
427
|
+
|
|
428
|
+
@dataclass
class JobResourceReport:
    """Resource-usage summary for one finished job, with sizing advice."""

    job_id: str
    wall_time_hours: float
    # actual CPU time / (wall time * cores)
    cpu_efficiency: float
    peak_memory_gb: float
    avg_memory_gb: float
    total_io_gb: float
    # Suggested allocation for the next run of a similar job
    recommended_cores: int
    recommended_memory_gb: float
|
|
439
|
+
|
|
440
|
+
def analyze_job_efficiency(
    snapshots: list[ResourceSnapshot],
    allocated_cores: int,
    allocated_memory_gb: float,
) -> JobResourceReport:
    """Analyze resource usage to recommend better allocations.

    Averages CPU and memory over the snapshots, takes the memory peak, and
    derives wall time from the first and last timestamps. With no
    snapshots a placeholder report (1 core, 4 GB) is returned.
    ``allocated_memory_gb`` is currently unused. The returned ``job_id``
    is empty; callers fill it in if they need it.
    """
    if not snapshots:
        return JobResourceReport(
            job_id="unknown", wall_time_hours=0,
            cpu_efficiency=0, peak_memory_gb=0, avg_memory_gb=0,
            total_io_gb=0, recommended_cores=1, recommended_memory_gb=4,
        )

    sample_count = len(snapshots)
    mem_values = [s.memory_used_gb for s in snapshots]

    avg_cpu = sum(s.cpu_percent for s in snapshots) / sample_count
    peak_mem = max(mem_values)
    avg_mem = sum(mem_values) / sample_count

    # CPU efficiency: how much of allocated CPU was actually used.
    # A zero-core allocation has no meaningful efficiency; report 0
    # instead of dividing by zero.
    cpu_efficiency = avg_cpu / (allocated_cores * 100) if allocated_cores > 0 else 0.0

    # Recommendations
    # Cores: scale the allocation by CPU usage, targeting 80% utilisation.
    # NOTE(review): this line reads cpu_percent as 0-100 of the whole
    # allocation, while cpu_efficiency above reads it as summed over cores
    # (100 == one busy core). Confirm which convention the collector uses.
    recommended_cores = max(1, int(allocated_cores * avg_cpu / 100 / 0.8))
    # Memory: peak + 20% headroom
    recommended_memory_gb = round(peak_mem * 1.2, 1)

    wall_time = (snapshots[-1].timestamp - snapshots[0].timestamp) / 3600

    # Total I/O covers reads as well as writes.
    # NOTE(review): summing assumes each snapshot holds per-interval
    # traffic, not a cumulative counter -- confirm against the collector.
    total_io_mb = sum(s.disk_io_read_mb + s.disk_io_write_mb for s in snapshots)

    return JobResourceReport(
        job_id="",
        wall_time_hours=wall_time,
        cpu_efficiency=cpu_efficiency,
        peak_memory_gb=peak_mem,
        avg_memory_gb=avg_mem,
        total_io_gb=total_io_mb / 1024,
        recommended_cores=recommended_cores,
        recommended_memory_gb=recommended_memory_gb,
    )
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
### Cost Estimation
|
|
484
|
+
|
|
485
|
+
Estimate campaign costs before committing resources:
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
# src/simulation/compute/cost.py
|
|
489
|
+
from dataclasses import dataclass
|
|
490
|
+
|
|
491
|
+
@dataclass
class CostEstimate:
    """Projected core-hours and spend for a simulation campaign."""

    # Campaign totals
    total_core_hours: float
    total_cost_usd: float
    # Figures for a single run
    per_run_core_hours: float
    per_run_cost_usd: float
    num_runs: int
    # "high", "medium", "low"
    confidence: str
    # Human-readable notes on what the estimate is based on
    assumptions: list[str]
|
|
501
|
+
|
|
502
|
+
def estimate_campaign_cost(
    single_run_hours: float,
    cores_per_run: int,
    num_runs: int,
    cost_per_core_hour: float = 0.05,  # Typical HPC rate
    overhead_factor: float = 1.3,  # Queue wait, failed runs, restarts
) -> CostEstimate:
    """Project core-hours and dollar cost of a parameter sweep.

    Campaign totals include ``overhead_factor``; the per-run figures do
    not. Confidence is "high" below 100 runs (given a positive run time),
    "medium" below 1000 runs, and "low" otherwise.
    """
    core_hours_each = single_run_hours * cores_per_run
    campaign_core_hours = core_hours_each * num_runs * overhead_factor

    notes = [
        f"Single run: {single_run_hours:.1f}h on {cores_per_run} cores",
        f"Overhead factor: {overhead_factor}x (failed runs, queue inefficiency)",
        f"Cost rate: ${cost_per_core_hour}/core-hour",
    ]

    # Larger campaigns get a less certain estimate
    if num_runs < 100 and single_run_hours > 0:
        level = "high"
    elif num_runs < 1000:
        level = "medium"
    else:
        level = "low"
        notes.append("Large campaign: actual costs may vary significantly")

    return CostEstimate(
        total_core_hours=campaign_core_hours,
        total_cost_usd=campaign_core_hours * cost_per_core_hour,
        per_run_core_hours=core_hours_each,
        per_run_cost_usd=core_hours_each * cost_per_core_hour,
        num_runs=num_runs,
        confidence=level,
        assumptions=notes,
    )
|
|
538
|
+
```
|