claude-turing 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/fork.md +40 -0
- package/commands/lit.md +47 -0
- package/commands/paper.md +44 -0
- package/commands/queue.md +48 -0
- package/commands/retry.md +41 -0
- package/commands/turing.md +10 -0
- package/config/failure_modes.yaml +74 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +6 -0
- package/templates/scripts/__pycache__/draft_paper_sections.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_queue.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/fork_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/literature_search.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/smart_retry.cpython-314.pyc +0 -0
- package/templates/scripts/draft_paper_sections.py +498 -0
- package/templates/scripts/experiment_queue.py +441 -0
- package/templates/scripts/fork_experiment.py +286 -0
- package/templates/scripts/generate_brief.py +25 -0
- package/templates/scripts/literature_search.py +421 -0
- package/templates/scripts/scaffold.py +10 -0
- package/templates/scripts/smart_retry.py +398 -0
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Smart failure recovery for ML experiments.
|
|
3
|
+
|
|
4
|
+
Auto-diagnoses crash type and retries with a targeted fix.
|
|
5
|
+
OOM → reduce batch size. NaN → add gradient clipping. Timeout → more patience.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python scripts/smart_retry.py exp-042 # Auto-diagnose and retry
|
|
9
|
+
python scripts/smart_retry.py exp-042 --max-attempts 5 # More retries
|
|
10
|
+
python scripts/smart_retry.py --classify "CUDA out of memory" # Just classify
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import subprocess
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
from scripts.turing_io import load_config, load_experiments
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Default number of automatic retry attempts before giving up.
DEFAULT_MAX_ATTEMPTS = 3
# Project-local override file for the failure taxonomy below.
DEFAULT_FAILURE_MODES_PATH = "config/failure_modes.yaml"

# Built-in failure taxonomy (overridden by config/failure_modes.yaml if present)
# Each entry maps a failure-type name to:
#   patterns       -- substrings matched case-insensitively against run output
#   fix            -- human-readable description of the remedy
#   config_changes -- hyperparam edits applied by apply_config_changes():
#                     "//N" divides, "*N" multiplies, other values assign literally
#   severity       -- "recoverable" (auto-retry) or "requires_intervention"
#   action         -- optional machine-readable follow-up (e.g. "pip_install")
BUILTIN_FAILURE_MODES = {
    "oom": {
        "patterns": ["CUDA out of memory", "MemoryError", "RuntimeError: out of memory",
                     "std::bad_alloc", "OutOfMemoryError"],
        "fix": "Reduce batch_size by 50%",
        "config_changes": {"batch_size": "//2"},
        "severity": "recoverable",
    },
    "nan_loss": {
        "patterns": ["loss is NaN", "loss is nan", "RuntimeWarning: invalid value",
                     "loss: nan", "NaN loss"],
        "fix": "Add gradient clipping at 1.0, reduce learning_rate by 10x",
        "config_changes": {"gradient_clip": 1.0, "learning_rate": "//10"},
        "severity": "recoverable",
    },
    "timeout": {
        "patterns": ["TimeoutError", "exceeded time limit", "timed out",
                     "TimeoutExpired"],
        "fix": "Double max_epochs or training timeout",
        "config_changes": {"max_epochs": "*2"},
        "severity": "recoverable",
    },
    "import_error": {
        "patterns": ["ModuleNotFoundError", "ImportError", "No module named"],
        "fix": "Install missing dependency",
        "config_changes": {},
        "severity": "requires_intervention",
        "action": "pip_install",
    },
    "convergence_failure": {
        "patterns": ["loss did not decrease", "no improvement", "early stopping",
                     "convergence warning"],
        "fix": "Increase learning_rate by 3x for warm-up",
        "config_changes": {"learning_rate": "*3"},
        "severity": "recoverable",
    },
    "data_error": {
        "patterns": ["FileNotFoundError", "No such file", "empty dataset",
                     "zero samples", "KeyError"],
        "fix": "Check data path and preprocessing",
        "config_changes": {},
        "severity": "requires_intervention",
    },
}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def load_failure_modes(config_path: str = DEFAULT_FAILURE_MODES_PATH) -> dict:
    """Return the failure taxonomy, preferring the on-disk config file.

    Falls back to ``BUILTIN_FAILURE_MODES`` when the config file is absent,
    empty, or does not parse to a non-empty mapping.
    """
    config_file = Path(config_path)
    if not config_file.exists():
        return BUILTIN_FAILURE_MODES
    with config_file.open() as handle:
        loaded = yaml.safe_load(handle) or {}
    # Only accept a non-empty mapping; anything else means a degenerate file.
    if isinstance(loaded, dict) and loaded:
        return loaded
    return BUILTIN_FAILURE_MODES
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def classify_failure(
    output: str,
    exit_code: int | None = None,
    failure_modes: dict | None = None,
) -> dict:
    """Classify an experiment failure against the taxonomy.

    Scans failure modes in insertion order; the first pattern that appears
    (case-insensitively) in ``output`` wins.

    Args:
        output: Combined stdout + stderr from the failed run.
        exit_code: Process exit code. Currently unused; reserved for
            future signal/exit-code based classification.
        failure_modes: Failure taxonomy dict; defaults to the built-in one.

    Returns:
        Dict with failure_type, matched_pattern, fix, config_changes,
        severity, and action (the same keys on every path).
    """
    if failure_modes is None:
        failure_modes = BUILTIN_FAILURE_MODES

    # Case-insensitive substring search keeps patterns easy to author.
    output_lower = output.lower()

    for failure_type, mode in failure_modes.items():
        for pattern in mode.get("patterns", []):
            if pattern.lower() in output_lower:
                return {
                    "failure_type": failure_type,
                    "matched_pattern": pattern,
                    "fix": mode.get("fix", "Unknown fix"),
                    "config_changes": mode.get("config_changes", {}),
                    "severity": mode.get("severity", "unknown"),
                    "action": mode.get("action"),
                }

    # Fix: include "action" here too -- previously the unknown-failure result
    # omitted it, so the result schema differed between matched and unmatched
    # classifications (a KeyError trap for callers indexing result["action"]).
    return {
        "failure_type": "unknown",
        "matched_pattern": None,
        "fix": "Manual investigation required",
        "config_changes": {},
        "severity": "requires_intervention",
        "action": None,
    }
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def apply_config_changes(
    config: dict,
    changes: dict[str, str | int | float],
) -> dict:
    """Apply fix changes to a config dict (mutates and returns it).

    Supported change values:
        "//N"  -- divide the current hyperparam by N (floor division clamped
                  to >= 1 for ints; true division for floats)
        "*N"   -- multiply the current hyperparam by N
        other  -- assign the value literally

    Operator changes are skipped when the hyperparam is missing or not
    numeric. Reads/writes ``config["model"]["hyperparams"]``, creating the
    nesting if absent.
    """
    hyperparams = config.get("model", {}).get("hyperparams", {})

    # Fix: the previous `if current and isinstance(...)` guard silently
    # skipped legitimate zero values; also exclude bool, which is an int
    # subclass and would otherwise be arithmetic-mangled.
    def _is_numeric(value) -> bool:
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    for key, value in changes.items():
        if isinstance(value, str) and value.startswith("//"):
            divisor = int(value[2:])
            current = hyperparams.get(key)
            if _is_numeric(current):
                if isinstance(current, int):
                    hyperparams[key] = max(1, current // divisor)
                else:
                    hyperparams[key] = current / divisor
        elif isinstance(value, str) and value.startswith("*"):
            multiplier = int(value[1:])
            current = hyperparams.get(key)
            if _is_numeric(current):
                hyperparams[key] = current * multiplier
        else:
            # Literal assignment (e.g. gradient_clip: 1.0).
            hyperparams[key] = value

    config.setdefault("model", {})["hyperparams"] = hyperparams
    return config
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _parse_metrics_block(stdout: str) -> dict:
    """Parse the ``---``-delimited ``key: value`` metrics block from stdout.

    Values that parse as floats are converted; anything else is kept as a
    stripped string. Returns {} when no block is present.
    """
    metrics: dict = {}
    in_block = False
    for line in stdout.splitlines():
        line = line.strip()
        if line == "---":
            if in_block:
                break  # second delimiter closes the block
            in_block = True
            continue
        if in_block and ":" in line:
            key, raw = line.split(":", 1)
            try:
                metrics[key.strip()] = float(raw.strip())
            except ValueError:
                metrics[key.strip()] = raw.strip()
    return metrics


def run_retry(
    output: str,
    exit_code: int = 1,
    seed: int = 42,
    timeout: int = 600,
    failure_modes: dict | None = None,
) -> dict:
    """Classify failure, apply fix, and re-run.

    Args:
        output: Combined stdout + stderr from the failed run.
        exit_code: Exit code of the failed run (forwarded to classification).
        seed: Seed passed to the re-run via ``--seed``.
        timeout: Seconds before the re-run is killed.
        failure_modes: Failure taxonomy; defaults to the built-in one.

    Returns:
        Dict with classification, fix applied, and retry result.
    """
    classification = classify_failure(output, exit_code, failure_modes)

    if classification["severity"] == "requires_intervention":
        return {
            "status": "cannot_auto_fix",
            "classification": classification,
            "message": f"Failure type '{classification['failure_type']}' requires manual intervention: {classification['fix']}",
        }

    # Apply fix and re-run
    # NOTE(review): the classified config_changes are not actually written to
    # any config before re-running -- train.py is invoked with only a new
    # seed. Confirm whether train.py picks the fix up elsewhere.
    cmd = ["python", "train.py", "--seed", str(seed)]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return {
            "status": "retry_timeout",
            "classification": classification,
            "message": "Retry also timed out",
        }

    if proc.returncode == 0:
        return {
            "status": "retry_succeeded",
            "classification": classification,
            "metrics": _parse_metrics_block(proc.stdout),
            "message": f"Retry succeeded after applying fix: {classification['fix']}",
        }
    else:
        return {
            "status": "retry_failed",
            "classification": classification,
            "message": f"Retry failed. Error persists after fix: {classification['fix']}",
            # Keep only the error tail so reports stay small.
            "retry_output": proc.stderr[-500:] if proc.stderr else "",
        }
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def retry_experiment(
    exp_id: str,
    max_attempts: int = DEFAULT_MAX_ATTEMPTS,
    config_path: str = "config.yaml",
    log_path: str = "experiments/log.jsonl",
    timeout: int = 600,
) -> dict:
    """Retry a failed experiment with auto-diagnosis and fix.

    Looks the experiment up in the JSONL log, classifies its captured
    failure output, and (for recoverable failures) re-runs up to
    ``max_attempts`` times via ``run_retry``. Progress is printed to stderr.

    Args:
        exp_id: Experiment ID to retry.
        max_attempts: Maximum retry attempts.
        config_path: Path to config.yaml.
            NOTE(review): currently unused by this function -- no config is
            loaded or modified here; confirm whether the fix is meant to be
            applied via this path.
        log_path: Path to experiment log.
        timeout: Per-run timeout.

    Returns:
        Retry result dict (or ``{"error": ...}`` if the ID is unknown).
    """
    # Linear scan of the log for the matching experiment record.
    experiments = load_experiments(log_path)
    target = None
    for exp in experiments:
        if exp.get("experiment_id") == exp_id:
            target = exp
            break

    if not target:
        return {"error": f"Experiment {exp_id} not found"}

    # Check if there's a run.log for this experiment; prefer its full output
    # over whatever snippet was stored in the log record.
    run_log = Path(f"experiments/logs/{exp_id}-run.log")
    if run_log.exists():
        output = run_log.read_text()
    else:
        output = target.get("error_output", target.get("description", ""))

    failure_modes = load_failure_modes()
    classification = classify_failure(output, failure_modes=failure_modes)

    print(f"Retrying {exp_id}", file=sys.stderr)
    print(f"Failure type: {classification['failure_type']}", file=sys.stderr)
    print(f"Fix: {classification['fix']}", file=sys.stderr)

    # Non-recoverable failures short-circuit before any retry attempt.
    if classification["severity"] == "requires_intervention":
        return {
            "experiment_id": exp_id,
            "status": "cannot_auto_fix",
            "classification": classification,
            "attempts": 0,
        }

    # Retry loop
    attempts = []
    for attempt in range(1, max_attempts + 1):
        print(f"\n Attempt {attempt}/{max_attempts}...", end=" ", flush=True, file=sys.stderr)

        result = run_retry(
            output=output,
            seed=42 + attempt,  # vary the seed per attempt
            timeout=timeout,
            failure_modes=failure_modes,
        )
        attempts.append(result)

        if result["status"] == "retry_succeeded":
            print("SUCCESS", file=sys.stderr)
            break
        else:
            print(f"FAILED ({result['status']})", file=sys.stderr)
            # Feed the newest error output into the next attempt so
            # run_retry re-classifies against the latest failure.
            if result.get("retry_output"):
                output = result["retry_output"]

    final_status = attempts[-1]["status"] if attempts else "no_attempts"

    return {
        "experiment_id": exp_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        # Note: this is the *initial* classification; per-attempt
        # classifications live in "history".
        "classification": classification,
        "attempts": len(attempts),
        "max_attempts": max_attempts,
        "final_status": final_status,
        "history": attempts,
    }
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def save_retry_report(report: dict, output_dir: str = "experiments/retries") -> Path:
    """Persist a retry report as ``<exp-id>-retry.yaml`` under *output_dir*.

    Creates the directory if needed and returns the written file's path.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    report_file = target_dir / f"{report.get('experiment_id', 'unknown')}-retry.yaml"
    with report_file.open("w") as handle:
        yaml.dump(report, handle, default_flow_style=False, sort_keys=False)
    return report_file
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def format_retry_report(report: dict) -> str:
    """Render a retry report as a short markdown summary string."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    classification = report.get("classification", {})
    final_status = report.get("final_status", "unknown")

    # Human-readable status label; unknown statuses pass through unchanged.
    marker = {
        "retry_succeeded": "RECOVERED",
        "retry_failed": "FAILED",
        "retry_timeout": "TIMEOUT",
        "cannot_auto_fix": "MANUAL FIX NEEDED",
    }.get(final_status, final_status)

    lines = [
        f"# Retry Report: {report.get('experiment_id', '?')}",
        "",
        f"**Status: {marker}**",
        "",
        f"- **Failure type:** {classification.get('failure_type', '?')}",
        f"- **Matched pattern:** {classification.get('matched_pattern', 'N/A')}",
        f"- **Fix applied:** {classification.get('fix', 'N/A')}",
        f"- **Attempts:** {report.get('attempts', 0)}/{report.get('max_attempts', 3)}",
    ]

    history = report.get("history")
    if history:
        lines += ["", "## Attempt History", ""]
        lines += [
            f"- **Attempt {idx}:** {entry.get('status', '?')} — {entry.get('message', '')}"
            for idx, entry in enumerate(history, 1)
        ]

    return "\n".join(lines)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def main() -> None:
    """CLI entry point.

    Modes:
      * ``--classify TEXT``: classify the given error text and exit.
      * ``EXP_ID``: run the full retry flow and save a report.
    """
    parser = argparse.ArgumentParser(description="Smart failure recovery")
    parser.add_argument("exp_id", nargs="?", default=None, help="Experiment ID to retry")
    parser.add_argument("--max-attempts", type=int, default=DEFAULT_MAX_ATTEMPTS)
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--timeout", type=int, default=600)
    parser.add_argument("--classify", default=None, help="Just classify error text")
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    if args.classify:
        # Consistency fix: honor config/failure_modes.yaml here too (as the
        # retry path does) instead of always using the built-in taxonomy.
        result = classify_failure(args.classify, failure_modes=load_failure_modes())
        if args.json:
            print(json.dumps(result, indent=2))
        else:
            print(f"Type: {result['failure_type']}")
            print(f"Fix: {result['fix']}")
            print(f"Severity: {result['severity']}")
        sys.exit(0)

    if not args.exp_id:
        print("Usage: smart_retry.py <exp-id> [--max-attempts 3]", file=sys.stderr)
        sys.exit(1)

    report = retry_experiment(
        args.exp_id, args.max_attempts, args.config, args.log, args.timeout,
    )

    # Persist the report unless the experiment ID was unknown.
    if "error" not in report:
        filepath = save_retry_report(report)
        print(f"Saved to {filepath}", file=sys.stderr)

    if args.json:
        # default=str covers Path and other non-JSON-native values.
        print(json.dumps(report, indent=2, default=str))
    else:
        print(format_retry_report(report))
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()
|