claude-turing 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +33 -2
  3. package/commands/ablate.md +47 -0
  4. package/commands/diagnose.md +52 -0
  5. package/commands/frontier.md +45 -0
  6. package/commands/reproduce.md +48 -0
  7. package/commands/seed.md +47 -0
  8. package/commands/turing.md +10 -0
  9. package/package.json +1 -1
  10. package/src/install.js +2 -1
  11. package/src/verify.js +5 -0
  12. package/templates/config.yaml +10 -0
  13. package/templates/program.md +5 -0
  14. package/templates/scripts/__pycache__/ablation_study.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/diagnose_errors.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/pareto_frontier.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  24. package/templates/scripts/ablation_study.py +487 -0
  25. package/templates/scripts/diagnose_errors.py +601 -0
  26. package/templates/scripts/generate_brief.py +117 -0
  27. package/templates/scripts/generate_model_card.py +25 -0
  28. package/templates/scripts/leaderboard.py +10 -0
  29. package/templates/scripts/pareto_frontier.py +470 -0
  30. package/templates/scripts/reproduce_experiment.py +548 -0
  31. package/templates/scripts/scaffold.py +11 -0
  32. package/templates/scripts/seed_runner.py +414 -0
  33. package/templates/scripts/show_metrics.py +17 -0
  34. package/templates/scripts/turing_io.py +36 -0
  35. package/templates/scripts/update_state.py +13 -0

package/templates/scripts/seed_runner.py

@@ -0,0 +1,414 @@
+ #!/usr/bin/env python3
+ """Multi-seed experiment runner for statistical rigor.
+
+ Runs the same experiment configuration across N random seeds, computes
+ mean/std/confidence intervals, and flags seed-sensitive results.
+
+ Prevents publishing lucky seeds by requiring distributional evidence
+ before claiming a result.
+
+ Usage:
+     python scripts/seed_runner.py                         # 5 seeds, best experiment
+     python scripts/seed_runner.py --quick                 # 3 seeds for fast checks
+     python scripts/seed_runner.py --seeds 10              # Custom seed count
+     python scripts/seed_runner.py --exp-id exp-042        # Specific experiment
+     python scripts/seed_runner.py --seed-list 42,123,456  # Custom seed values
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import math
+ import subprocess
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+
+ from scripts.turing_io import load_config, load_experiments
+
+ # Default seed values (diverse primes and powers for good coverage)
+ DEFAULT_SEEDS = [42, 123, 456, 789, 1024, 1337, 2048, 3141, 4096, 7919]
+ DEFAULT_N_SEEDS = 5
+ QUICK_N_SEEDS = 3
+ CV_THRESHOLD = 5.0  # Percent — above this, result is seed-sensitive
+
+
+ def get_experiment_config(
+     experiments: list[dict],
+     exp_id: str | None,
+     primary_metric: str,
+     lower_is_better: bool,
+ ) -> dict | None:
+     """Retrieve experiment config from log by ID, or find the best experiment."""
+     if exp_id:
+         for exp in experiments:
+             if exp.get("experiment_id") == exp_id:
+                 return exp
+         return None
+
+     # Find best kept experiment
+     best = None
+     best_val = float("inf") if lower_is_better else float("-inf")
+     for exp in experiments:
+         if exp.get("status") != "kept":
+             continue
+         val = exp.get("metrics", {}).get(primary_metric)
+         if val is None:
+             continue
+         if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
+             best_val = val
+             best = exp
+     return best
+
+
+ def run_single_seed(seed: int, timeout: int = 600) -> dict | None:
+     """Run train.py with a specific seed and parse metrics from output.
+
+     Returns dict of parsed metrics, or None on failure.
+     """
+     cmd = ["python", "train.py", "--seed", str(seed)]
+     try:
+         proc = subprocess.run(
+             cmd, capture_output=True, text=True, timeout=timeout,
+         )
+     except subprocess.TimeoutExpired:
+         return None
+
+     if proc.returncode != 0:
+         return None
+
+     # Parse metrics from --- delimited block
+     metrics = {}
+     in_block = False
+     metadata_keys = {"model_type", "train_seconds"}
+
+     for line in proc.stdout.splitlines():
+         line = line.strip()
+         if line == "---":
+             if in_block:
+                 break
+             in_block = True
+             continue
+         if in_block and ":" in line:
+             key, value = line.split(":", 1)
+             key = key.strip()
+             value = value.strip()
+             if key in metadata_keys:
+                 metrics[key] = value
+             else:
+                 try:
+                     metrics[key] = float(value)
+                 except ValueError:
+                     metrics[key] = value
+
+     return metrics if metrics else None
+
+
+ def compute_seed_statistics(
+     values: list[float],
+     seeds: list[int],
+     lower_is_better: bool = False,
+ ) -> dict:
+ """Compute statistical summary for seed study results.
116
+
117
+ Returns dict with mean, std, 95% CI, CV%, and sensitivity flag.
118
+ """
119
+ arr = np.array(values)
120
+ n = len(arr)
121
+ mean = float(np.mean(arr))
122
+ std = float(np.std(arr, ddof=1)) if n > 1 else 0.0
123
+
124
+ # 95% CI using t-distribution
125
+ if n > 1:
126
+ from scipy import stats as scipy_stats
127
+ t_crit = scipy_stats.t.ppf(0.975, df=n - 1)
128
+ margin = t_crit * std / np.sqrt(n)
129
+ else:
130
+ margin = 0.0
131
+
132
+ ci_lower = mean - margin
133
+ ci_upper = mean + margin
134
+
135
+ # Coefficient of variation
136
+ cv = (std / abs(mean) * 100) if mean != 0 else float("inf")
137
+
138
+     # Identify best and worst seeds (direction-aware)
+     best_idx = int(np.argmin(arr)) if lower_is_better else int(np.argmax(arr))
+     worst_idx = int(np.argmax(arr)) if lower_is_better else int(np.argmin(arr))
+
+     return {
+         "mean": round(mean, 6),
+         "std": round(std, 6),
+         "ci_95": [round(ci_lower, 6), round(ci_upper, 6)],
+         "cv_percent": round(cv, 2),
+         "seed_sensitive": cv > CV_THRESHOLD,
+         "best_seed": seeds[best_idx],
+         "best_value": round(float(arr[best_idx]), 6),
+         "worst_seed": seeds[worst_idx],
+         "worst_value": round(float(arr[worst_idx]), 6),
+         "range": round(float(np.max(arr) - np.min(arr)), 6),
+     }
+
+
+ def run_seed_study(
+     n_seeds: int = DEFAULT_N_SEEDS,
+     seed_list: list[int] | None = None,
+     exp_id: str | None = None,
+     config_path: str = "config.yaml",
+     log_path: str = "experiments/log.jsonl",
+     timeout: int = 600,
+ ) -> dict:
+     """Run a complete multi-seed study.
+
+     Args:
+         n_seeds: Number of seeds to run.
+         seed_list: Explicit seed values (overrides n_seeds if provided).
+         exp_id: Specific experiment ID to study (defaults to best).
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+         timeout: Per-run timeout in seconds.
+
+     Returns:
+         Complete seed study result dict.
+     """
+     config = load_config(config_path)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     # Determine seeds to use
+     configured_seeds = eval_cfg.get("seed_seeds", DEFAULT_SEEDS)
+     if seed_list:
+         seeds = seed_list
+     else:
+         seeds = configured_seeds[:n_seeds]
+
+     # Find the target experiment
+     experiments = load_experiments(log_path)
+     target_exp = get_experiment_config(experiments, exp_id, primary_metric, lower_is_better)
+
+     if not target_exp:
+         return {
+             "error": f"No experiment found{f' with ID {exp_id}' if exp_id else ''}",
+             "experiment_id": exp_id,
+         }
+
+     target_id = target_exp.get("experiment_id", "unknown")
+     print(f"Seed study for {target_id}", file=sys.stderr)
+     print(f"Primary metric: {primary_metric} ({'lower' if lower_is_better else 'higher'} is better)", file=sys.stderr)
+     print(f"Seeds: {seeds}", file=sys.stderr)
+     print(file=sys.stderr)
+
+     # Run each seed
+     results = []
+     failed_seeds = []
+     for i, seed in enumerate(seeds):
+         print(f" Run {i + 1}/{len(seeds)} (seed={seed})...", end=" ", flush=True, file=sys.stderr)
+         metrics = run_single_seed(seed, timeout=timeout)
+         if metrics and primary_metric in metrics:
+             value = metrics[primary_metric]
+             results.append({"seed": seed, "value": value, "metrics": metrics})
+             print(f"{primary_metric}={value:.4f}", file=sys.stderr)
+         else:
+             failed_seeds.append(seed)
+             print("FAILED", file=sys.stderr)
+
+     if len(results) < 2:
+         return {
+             "error": f"Only {len(results)} successful runs — need at least 2 for statistics",
+             "experiment_id": target_id,
+             "successful_runs": len(results),
+             "failed_seeds": failed_seeds,
+         }
+
+     # Compute statistics
+     values = [r["value"] for r in results]
+     seeds_run = [r["seed"] for r in results]
+     stats = compute_seed_statistics(values, seeds_run, lower_is_better)
+
+     # Build result
+     study = {
+         "experiment_id": target_id,
+         "timestamp": datetime.now(timezone.utc).isoformat(),
+         "metric": primary_metric,
+         "lower_is_better": lower_is_better,
+         "seeds_run": seeds_run,
+         "results": [round(v, 6) for v in values],
+         "failed_seeds": failed_seeds,
+         **stats,
+     }
+
+     return study
+
+
+ def save_seed_study(study: dict, output_dir: str = "experiments/seed_studies") -> Path:
+     """Save seed study results to YAML file.
+
+     Returns path to the saved file.
+     """
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+
+     exp_id = study.get("experiment_id", "unknown")
+     filename = f"{exp_id}-seeds.yaml"
+     filepath = out_path / filename
+
+     with open(filepath, "w") as f:
+         yaml.dump(study, f, default_flow_style=False, sort_keys=False)
+
+     return filepath
+
+
+ def format_seed_report(study: dict) -> str:
+     """Format seed study results as a human-readable report."""
+     if "error" in study:
+         return f"ERROR: {study['error']}"
+
+     exp_id = study["experiment_id"]
+     metric = study["metric"]
+     direction = "lower" if study.get("lower_is_better", False) else "higher"
+     sensitive = study["seed_sensitive"]
+
+     lines = [
+         f"# Seed Study: {exp_id}",
+         "",
+         f"*{metric} ({direction} is better) across {len(study['seeds_run'])} seeds*",
+         "",
+         "## Results",
+         "",
+         "| Seed | Value |",
+         "|------|-------|",
+     ]
+
+     for seed, value in zip(study["seeds_run"], study["results"]):
+         marker = ""
+         if seed == study["best_seed"]:
+             marker = " (best)"
+         elif seed == study["worst_seed"]:
+             marker = " (worst)"
+         lines.append(f"| {seed} | {value:.4f}{marker} |")
+
+     lines.extend([
+         "",
+         "## Statistics",
+         "",
+         f"| Statistic | Value |",
+         f"|-----------|-------|",
+         f"| Mean | {study['mean']:.4f} |",
+         f"| Std | {study['std']:.4f} |",
+         f"| 95% CI | [{study['ci_95'][0]:.4f}, {study['ci_95'][1]:.4f}] |",
+         f"| CV | {study['cv_percent']:.2f}% |",
+         f"| Range | {study['range']:.4f} |",
+         f"| Best seed | {study['best_seed']} ({study['best_value']:.4f}) |",
+         f"| Worst seed | {study['worst_seed']} ({study['worst_value']:.4f}) |",
+         "",
+         "## Verdict",
+         "",
+     ])
+
+     if sensitive:
+         lines.extend([
+             f"**SEED-SENSITIVE** (CV={study['cv_percent']:.2f}% > {CV_THRESHOLD}%)",
+             "",
+             "This result varies significantly across seeds. Do not report a single-seed result.",
+             "Report as: **{metric} = {mean:.4f} +/- {std:.4f}** (mean +/- std over {n} seeds)".format(
+                 metric=metric,
+                 mean=study["mean"],
+                 std=study["std"],
+                 n=len(study["seeds_run"]),
+             ),
+         ])
+     else:
+         lines.extend([
+             f"**STABLE** (CV={study['cv_percent']:.2f}% < {CV_THRESHOLD}%)",
+             "",
+             "Result is robust across seeds. Safe to report.",
+             "Report as: **{metric} = {mean:.4f} +/- {std:.4f}** (mean +/- std over {n} seeds, 95% CI [{ci_lo:.4f}, {ci_hi:.4f}])".format(
+                 metric=metric,
+                 mean=study["mean"],
+                 std=study["std"],
+                 n=len(study["seeds_run"]),
+                 ci_lo=study["ci_95"][0],
+                 ci_hi=study["ci_95"][1],
+             ),
+         ])
+
+     if study.get("failed_seeds"):
+         lines.extend([
+             "",
+             f"**Warning:** {len(study['failed_seeds'])} seeds failed: {study['failed_seeds']}",
+         ])
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Multi-seed experiment runner for statistical rigor",
+     )
+     parser.add_argument(
+         "--seeds", type=int, default=DEFAULT_N_SEEDS,
+         help=f"Number of seeds to run (default: {DEFAULT_N_SEEDS})",
+     )
+     parser.add_argument(
+         "--quick", action="store_true",
+         help=f"Quick mode: run {QUICK_N_SEEDS} seeds instead of {DEFAULT_N_SEEDS}",
+     )
+     parser.add_argument(
+         "--seed-list", type=str, default=None,
+         help="Comma-separated list of specific seed values",
+     )
+     parser.add_argument(
+         "--exp-id", type=str, default=None,
+         help="Experiment ID to study (defaults to best experiment)",
+     )
+     parser.add_argument(
+         "--config", default="config.yaml",
+         help="Path to config.yaml",
+     )
+     parser.add_argument(
+         "--log", default="experiments/log.jsonl",
+         help="Path to experiment log",
+     )
+     parser.add_argument(
+         "--timeout", type=int, default=600,
+         help="Per-run timeout in seconds (default: 600)",
+     )
+     parser.add_argument(
+         "--json", action="store_true",
+         help="Output raw JSON instead of formatted report",
+     )
+     args = parser.parse_args()
+
+     n_seeds = QUICK_N_SEEDS if args.quick else args.seeds
+     seed_list = None
+     if args.seed_list:
+         seed_list = [int(s.strip()) for s in args.seed_list.split(",")]
+
+     study = run_seed_study(
+         n_seeds=n_seeds,
+         seed_list=seed_list,
+         exp_id=args.exp_id,
+         config_path=args.config,
+         log_path=args.log,
+         timeout=args.timeout,
+     )
+
+     # Save results
+     if "error" not in study:
+         filepath = save_seed_study(study)
+         print(f"\nSaved to {filepath}", file=sys.stderr)
+
+     # Output
+     if args.json:
+         print(json.dumps(study, indent=2))
+     else:
+         print(format_seed_report(study))
+
+
+ if __name__ == "__main__":
+     main()
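
For orientation: `run_single_seed` shells out to `python train.py --seed N` and scrapes metrics from a `---` delimited key/value block on stdout, keeping `model_type` and `train_seconds` as strings and coercing everything else to float. The project's actual `train.py` is not part of this diff, so the snippet below is only a minimal sketch of that output contract, with hypothetical metric names.

```python
# Minimal sketch of the stdout contract seed_runner.run_single_seed() expects.
# train.py is not included in this diff; the metric names below are illustrative.
import argparse
import random


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    random.seed(args.seed)
    accuracy = 0.90 + random.uniform(-0.01, 0.01)  # stand-in for a real training run

    # Everything between the two "---" lines is parsed as "key: value".
    print("---")
    print("model_type: baseline")        # metadata key, kept as a string
    print(f"accuracy: {accuracy:.4f}")   # coerced to float by run_single_seed()
    print("train_seconds: 1.5")          # metadata key, kept as a string
    print("---")


if __name__ == "__main__":
    main()
```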

package/templates/scripts/show_metrics.py

@@ -151,6 +151,11 @@ def main() -> None:
          action="store_true",
          help="Include git diffs for discarded experiments",
      )
+     parser.add_argument(
+         "--with-seeds",
+         action="store_true",
+         help="Show seed study results alongside best experiment",
+     )
      args = parser.parse_args()

      config = load_config()
@@ -167,6 +172,18 @@ def main() -> None:
      best_id = find_best(experiments, primary_metric, lower_is_better)
      print(format_table(experiments, best_id, metric_names))

+     if args.with_seeds and best_id:
+         from scripts.turing_io import load_seed_study
+         study = load_seed_study(best_id)
+         if study and "mean" in study:
+             sensitive = "SEED-SENSITIVE" if study.get("seed_sensitive") else "STABLE"
+             print(f"\nSeed Study ({best_id}): {sensitive}")
+             print(f" {primary_metric} = {study['mean']:.4f} +/- {study.get('std', 0):.4f}")
+             if "ci_95" in study:
+                 ci = study["ci_95"]
+                 print(f" 95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]")
+             print(f" CV: {study.get('cv_percent', 0):.2f}%")
+
      if args.with_diffs:
          all_experiments = load_experiments(args.log)
          diffs = get_experiment_diffs(all_experiments)

package/templates/scripts/turing_io.py

@@ -74,3 +74,39 @@ def load_hypotheses(queue_path: str) -> list[dict]:
      with open(path) as f:
          data = yaml.safe_load(f)
      return data if isinstance(data, list) else []
+
+
+ def load_seed_study(exp_id: str, seed_dir: str = "experiments/seed_studies") -> dict | None:
+     """Load a seed study result for a specific experiment.
+
+     Args:
+         exp_id: Experiment ID (e.g., "exp-042").
+         seed_dir: Directory containing seed study YAML files.
+
+     Returns:
+         Seed study dict, or None if not found.
+     """
+     path = Path(seed_dir) / f"{exp_id}-seeds.yaml"
+     if not path.exists():
+         return None
+     with open(path) as f:
+         data = yaml.safe_load(f)
+     return data if isinstance(data, dict) else None
+
+
+ def load_reproduction(exp_id: str, repro_dir: str = "experiments/reproductions") -> dict | None:
+     """Load a reproduction report for a specific experiment.
+
+     Args:
+         exp_id: Experiment ID (e.g., "exp-042").
+         repro_dir: Directory containing reproduction YAML files.
+
+     Returns:
+         Reproduction report dict, or None if not found.
+     """
+     path = Path(repro_dir) / f"{exp_id}-repro.yaml"
+     if not path.exists():
+         return None
+     with open(path) as f:
+         data = yaml.safe_load(f)
+     return data if isinstance(data, dict) else None
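
For reference, `load_seed_study` returns the parsed contents of the `{exp_id}-seeds.yaml` file that `save_seed_study` wrote. The keys below mirror `run_seed_study` and `compute_seed_statistics` from this release; `exp-042` is the example ID the package's own docstrings use, and the numbers are made up but internally consistent (n=5, so the 95% CI uses t(0.975, df=4) of roughly 2.776).

```python
# Illustrative shape of experiments/seed_studies/exp-042-seeds.yaml once parsed.
# Keys come from run_seed_study()/compute_seed_statistics(); values are hypothetical.
example_study = {
    "experiment_id": "exp-042",
    "timestamp": "2025-01-01T00:00:00+00:00",
    "metric": "accuracy",
    "lower_is_better": False,
    "seeds_run": [42, 123, 456, 789, 1024],
    "results": [0.9012, 0.8998, 0.9044, 0.8976, 0.9021],
    "failed_seeds": [],
    "mean": 0.90102,                 # np.mean of results
    "std": 0.00254,                  # sample std (ddof=1)
    "ci_95": [0.897866, 0.904174],   # mean -/+ t_crit * std / sqrt(n)
    "cv_percent": 0.28,              # std / |mean| * 100
    "seed_sensitive": False,         # cv_percent is below CV_THRESHOLD (5.0)
    "best_seed": 456,
    "best_value": 0.9044,
    "worst_seed": 789,
    "worst_value": 0.8976,
    "range": 0.0068,
}
```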

package/templates/scripts/update_state.py

@@ -101,6 +101,19 @@ def set_best(state: dict, experiment_id: str, metrics: dict) -> dict:
          "metrics": metrics,
          "updated_at": datetime.now(timezone.utc).isoformat(),
      }
+
+     # Check for seed study data
+     from scripts.turing_io import load_seed_study
+     seed_study = load_seed_study(experiment_id)
+     if seed_study and "mean" in seed_study:
+         state["best_result"]["seed_study"] = {
+             "mean": seed_study["mean"],
+             "std": seed_study.get("std", 0),
+             "cv_percent": seed_study.get("cv_percent", 0),
+             "seed_sensitive": seed_study.get("seed_sensitive", False),
+             "seeds_tested": len(seed_study.get("seeds_run", [])),
+         }
+
      return state

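
Putting the last two hunks together: when an `experiments/seed_studies/<exp-id>-seeds.yaml` file exists, `set_best` embeds a compact summary in the project state, and the new `--with-seeds` flag prints the same numbers next to the leaderboard. A rough sketch of that embedded block, reusing the hypothetical exp-042 numbers above; only fields visible in this diff are shown.

```python
# Shape of the summary that set_best() stores under state["best_result"]["seed_study"]
# when a seed study file is found. Values reuse the hypothetical exp-042 study above.
seed_study_summary = {
    "mean": 0.90102,          # the headline number to report, not a single lucky seed
    "std": 0.00254,
    "cv_percent": 0.28,
    "seed_sensitive": False,  # True would mean CV exceeded CV_THRESHOLD (5.0 percent)
    "seeds_tested": 5,
}
```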