claude-turing 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +5 -2
  3. package/commands/fork.md +40 -0
  4. package/commands/queue.md +48 -0
  5. package/commands/retry.md +41 -0
  6. package/commands/turing.md +6 -0
  7. package/config/failure_modes.yaml +74 -0
  8. package/package.json +1 -1
  9. package/src/install.js +2 -1
  10. package/src/verify.js +4 -0
  11. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  12. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  13. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  14. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/experiment_queue.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/fork_experiment.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  24. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  25. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  26. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  27. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  28. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  29. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  30. package/templates/scripts/__pycache__/smart_retry.cpython-314.pyc +0 -0
  31. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  32. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  33. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  34. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  35. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  36. package/templates/scripts/experiment_queue.py +441 -0
  37. package/templates/scripts/fork_experiment.py +286 -0
  38. package/templates/scripts/generate_brief.py +25 -0
  39. package/templates/scripts/scaffold.py +6 -0
  40. package/templates/scripts/smart_retry.py +398 -0
  41. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  42. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  43. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  44. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
package/templates/scripts/fork_experiment.py (new file)
@@ -0,0 +1,286 @@
+ #!/usr/bin/env python3
+ """Experiment branching — run parallel tracks from a common parent.
+
+ "Try both A and B from this point" — creates child experiments,
+ runs both, reports which branch wins.
+
+ Usage:
+     python scripts/fork_experiment.py exp-042 --branches "LightGBM dart" "XGBoost deeper"
+     python scripts/fork_experiment.py exp-042 --branches "A" "B" --auto-promote
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import subprocess
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import yaml
+
+ from scripts.turing_io import load_config, load_experiments
+
+
+ def find_experiment(experiments: list[dict], exp_id: str) -> dict | None:
+     """Find experiment by ID."""
+     for exp in experiments:
+         if exp.get("experiment_id") == exp_id:
+             return exp
+     return None
+
+
+ def create_branch(
+     parent: dict,
+     branch_description: str,
+     branch_index: int,
+ ) -> dict:
+     """Create a branch descriptor from a parent experiment.
+
+     Returns dict with branch metadata (not yet executed).
+     """
+     parent_id = parent.get("experiment_id", "unknown")
+     return {
+         "branch_id": f"fork-{parent_id}-{branch_index + 1}",
+         "parent_id": parent_id,
+         "description": branch_description,
+         "status": "pending",
+         "created_at": datetime.now(timezone.utc).isoformat(),
+         "result_experiment": None,
+         "metrics": {},
+     }
+
+
+ def run_branch(branch: dict, seed: int = 42, timeout: int = 600) -> dict:
+     """Execute a single branch experiment.
+
+     Returns updated branch dict with status and metrics.
+     """
+     branch["status"] = "running"
+     branch["started_at"] = datetime.now(timezone.utc).isoformat()
+
+     cmd = ["python", "train.py", "--seed", str(seed)]
+     try:
+         proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+     except subprocess.TimeoutExpired:
+         branch["status"] = "failed"
+         branch["error"] = "timeout"
+         return branch
+
+     if proc.returncode != 0:
+         branch["status"] = "failed"
+         branch["error"] = proc.stderr[-300:] if proc.stderr else "unknown error"
+         return branch
+
+     # Parse metrics
+     metrics = {}
+     in_block = False
+     for line in proc.stdout.splitlines():
+         line = line.strip()
+         if line == "---":
+             if in_block:
+                 break
+             in_block = True
+             continue
+         if in_block and ":" in line:
+             key, value = line.split(":", 1)
+             try:
+                 metrics[key.strip()] = float(value.strip())
+             except ValueError:
+                 metrics[key.strip()] = value.strip()
+
+     branch["status"] = "completed"
+     branch["completed_at"] = datetime.now(timezone.utc).isoformat()
+     branch["metrics"] = metrics
+     return branch
+
+
+ def determine_winner(
+     branches: list[dict],
+     metric: str,
+     lower_is_better: bool,
+ ) -> dict | None:
+     """Determine the winning branch by primary metric.
+
+     Returns the winning branch dict, or None if no branches completed.
+     """
+     completed = [b for b in branches if b.get("status") == "completed" and metric in b.get("metrics", {})]
+     if not completed:
+         return None
+
+     if lower_is_better:
+         return min(completed, key=lambda b: b["metrics"][metric])
+     else:
+         return max(completed, key=lambda b: b["metrics"][metric])
+
+
+ def format_fork_report(
+     parent_id: str,
+     branches: list[dict],
+     winner: dict | None,
+     metric: str,
+ ) -> str:
+     """Format fork results as a comparison tree."""
+     lines = [
+         f"# Fork from {parent_id}",
+         "",
+     ]
+
+     if not branches:
+         lines.append("No branches executed.")
+         return "\n".join(lines)
+
+     winner_id = winner["branch_id"] if winner else None
+
+     for branch in branches:
+         status = branch.get("status", "?")
+         desc = branch.get("description", "?")
+         bid = branch.get("branch_id", "?")
+
+         if status == "completed":
+             metric_val = branch.get("metrics", {}).get(metric, "N/A")
+             is_winner = bid == winner_id
+             marker = "WINNER" if is_winner else ""
+             if isinstance(metric_val, float):
+                 lines.append(f"├── {bid}: {desc} → {metric}={metric_val:.4f} {marker}")
+             else:
+                 lines.append(f"├── {bid}: {desc} → {metric}={metric_val} {marker}")
+         elif status == "failed":
+             error = branch.get("error", "unknown")
+             lines.append(f"├── {bid}: {desc} → FAILED ({error})")
+         else:
+             lines.append(f"├── {bid}: {desc} → {status}")
+
+     if winner:
+         lines.extend([
+             "",
+             f"**Recommendation:** promote {winner['branch_id']}, abandon the rest.",
+         ])
+
+     return "\n".join(lines)
+
+
+ def save_fork_report(report: dict, output_dir: str = "experiments/forks") -> Path:
+     """Save fork report to YAML."""
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+     parent_id = report.get("parent_id", "unknown")
+     filepath = out_path / f"{parent_id}-fork.yaml"
+     with open(filepath, "w") as f:
+         yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+     return filepath
+
+
+ def run_fork(
+     exp_id: str,
+     branch_descriptions: list[str],
+     auto_promote: bool = False,
+     config_path: str = "config.yaml",
+     log_path: str = "experiments/log.jsonl",
+     timeout: int = 600,
+ ) -> dict:
+     """Fork an experiment into multiple branches and run all.
+
+     Args:
+         exp_id: Parent experiment ID.
+         branch_descriptions: List of branch descriptions.
+         auto_promote: Automatically keep winner and discard rest.
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+         timeout: Per-branch timeout.
+
+     Returns:
+         Fork result dict with branches, winner, and recommendation.
+     """
+     config = load_config(config_path)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     experiments = load_experiments(log_path)
+     parent = find_experiment(experiments, exp_id)
+
+     if not parent:
+         return {"error": f"Experiment {exp_id} not found"}
+
+     if not branch_descriptions:
+         return {"error": "No branches specified. Use --branches 'A' 'B'"}
+
+     # Create branches
+     branches = []
+     for i, desc in enumerate(branch_descriptions):
+         branches.append(create_branch(parent, desc, i))
+
+     print(f"Forking {exp_id} into {len(branches)} branches:", file=sys.stderr)
+     for b in branches:
+         print(f" {b['branch_id']}: {b['description']}", file=sys.stderr)
+     print(file=sys.stderr)
+
+     # Execute branches
+     for i, branch in enumerate(branches):
+         print(f" [{i+1}/{len(branches)}] Running {branch['branch_id']}...", end=" ",
+               flush=True, file=sys.stderr)
+         run_branch(branch, seed=42 + i, timeout=timeout)
+         if branch["status"] == "completed":
+             metric_val = branch.get("metrics", {}).get(primary_metric, "N/A")
+             print(f"{primary_metric}={metric_val}", file=sys.stderr)
+         else:
+             print("FAILED", file=sys.stderr)
+
+     # Determine winner
+     winner = determine_winner(branches, primary_metric, lower_is_better)
+
+     result = {
+         "parent_id": exp_id,
+         "timestamp": datetime.now(timezone.utc).isoformat(),
+         "metric": primary_metric,
+         "lower_is_better": lower_is_better,
+         "branches": branches,
+         "winner": winner["branch_id"] if winner else None,
+         "winner_metric": winner["metrics"].get(primary_metric) if winner else None,
+         "auto_promote": auto_promote,
+         "total_branches": len(branches),
+         "completed": sum(1 for b in branches if b["status"] == "completed"),
+         "failed": sum(1 for b in branches if b["status"] == "failed"),
+     }
+
+     return result
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(description="Fork experiment into parallel branches")
+     parser.add_argument("exp_id", help="Parent experiment ID")
+     parser.add_argument("--branches", nargs="+", required=True, help="Branch descriptions")
+     parser.add_argument("--auto-promote", action="store_true", help="Auto-keep winner")
+     parser.add_argument("--config", default="config.yaml")
+     parser.add_argument("--log", default="experiments/log.jsonl")
+     parser.add_argument("--timeout", type=int, default=600)
+     parser.add_argument("--json", action="store_true")
+     args = parser.parse_args()
+
+     result = run_fork(
+         args.exp_id, args.branches, args.auto_promote,
+         args.config, args.log, args.timeout,
+     )
+
+     if "error" not in result:
+         filepath = save_fork_report(result)
+         print(f"\nSaved to {filepath}", file=sys.stderr)
+
+     if args.json:
+         print(json.dumps(result, indent=2, default=str))
+     else:
+         if "error" in result:
+             print(f"ERROR: {result['error']}")
+         else:
+             print(format_fork_report(
+                 result["parent_id"], result["branches"],
+                 next((b for b in result["branches"] if b["branch_id"] == result.get("winner")), None),
+                 result["metric"],
+             ))
+
+
+ if __name__ == "__main__":
+     main()
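
For context on the contract above: `run_branch` shells out to `python train.py --seed N` and recovers metrics by scanning stdout for a block delimited by two `---` lines, parsing each `key: value` pair inside it (floats where possible, strings otherwise). Below is a minimal sketch of a `train.py` that satisfies this contract; it is hypothetical, since the real `train.py` is project-specific, and the metric names are invented.

```python
#!/usr/bin/env python3
# Hypothetical train.py stub showing the stdout format run_branch() parses.
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()

random.seed(args.seed)
accuracy = 0.90 + random.random() * 0.05  # stand-in for a real training run

print("epoch 3/3 done")             # lines outside the block are ignored
print("---")                        # opens the metrics block
print(f"accuracy: {accuracy:.4f}")  # parsed as a float
print("notes: smoke test")          # non-numeric values are kept as strings
print("---")                        # closes the block; parsing stops here
```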
package/templates/scripts/generate_brief.py
@@ -212,6 +212,18 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
      return warnings


+ def load_queue_summary(queue_path: str = "experiments/queue-summary.yaml") -> dict | None:
+     """Load the most recent queue execution summary."""
+     path = Path(queue_path)
+     if not path.exists():
+         return None
+     try:
+         with open(path) as f:
+             return yaml.safe_load(f)
+     except (yaml.YAMLError, OSError):
+         return None
+
+
  def load_profiles(profile_dir: str = "experiments/profiles") -> list[dict]:
      """Load all profiling results from YAML files."""
      path = Path(profile_dir)
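
`load_queue_summary` reads `experiments/queue-summary.yaml`, presumably written by the new `experiment_queue.py` (its source is not shown in this diff). Judging from the Queue Report section added further down, the summary carries at least `status`, `total`, `completed`, `failed`, and `skipped`. A round-trip sketch under that assumption, with invented values:

```python
# Sketch only: the schema is owned by experiment_queue.py; these keys are
# inferred from the Queue Report formatter, and the values are invented.
from pathlib import Path

import yaml

summary = {
    "status": "completed",
    "total": 6,
    "completed": 5,
    "failed": 1,
    "skipped": 0,
}

path = Path("experiments/queue-summary.yaml")
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w") as f:
    yaml.dump(summary, f, default_flow_style=False, sort_keys=False)

# load_queue_summary() should now return this dict, and generate_brief()
# would fold it into the briefing as a Queue Report section.
```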
@@ -296,6 +308,7 @@ def format_brief(
      reproductions: list[dict] | None = None,
      diagnoses: list[dict] | None = None,
      profiles: list[dict] | None = None,
+     queue_summary: dict | None = None,
  ) -> str:
      """Format the research briefing as markdown."""
      direction = "lower" if lower_is_better else "higher"
@@ -472,6 +485,16 @@ def format_brief(
      if failed:
          lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])

+     # Queue report
+     if queue_summary and queue_summary.get("total"):
+         qs = queue_summary
+         lines.extend(["", "## Queue Report", ""])
+         lines.append(
+             f"**{qs.get('status', '?')}** — {qs.get('completed', 0)} completed, "
+             f"{qs.get('failed', 0)} failed, {qs.get('skipped', 0)} skipped "
+             f"of {qs.get('total', 0)} queued"
+         )
+
      # Profiles
      if profiles:
          lines.extend(["", "## Performance Profile", ""])
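
With a summary like the sketch above, the rendered section would look roughly like this (counts illustrative):

```
## Queue Report

**completed** — 5 completed, 1 failed, 0 skipped of 6 queued
```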
@@ -569,6 +592,7 @@ def generate_brief(
      reproductions = load_reproductions()
      diagnoses = load_diagnoses()
      profiles = load_profiles()
+     queue_summary = load_queue_summary()

      return format_brief(
          campaign, best, trajectory, model_types, hypotheses,
@@ -579,6 +603,7 @@ def generate_brief(
          reproductions=reproductions if reproductions else None,
          diagnoses=diagnoses if diagnoses else None,
          profiles=profiles if profiles else None,
+         queue_summary=queue_summary,
      )


package/templates/scripts/scaffold.py
@@ -104,6 +104,9 @@ TEMPLATE_DIRS = {
          "export_card.py",
          "literature_search.py",
          "draft_paper_sections.py",
+         "experiment_queue.py",
+         "smart_retry.py",
+         "fork_experiment.py",
      ],
      "tests": ["__init__.py", "conftest.py"],
  }
@@ -122,6 +125,9 @@ DIRECTORIES_TO_CREATE = [
      "exports",
      "experiments/literature",
      "paper/sections",
+     "experiments/retries",
+     "experiments/forks",
+     "experiments/logs",
      "models/best",
      "models/archive",
  ]
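
For completeness, a `DIRECTORIES_TO_CREATE` list like this is typically consumed with a simple mkdir loop; the sketch below is an illustration, not scaffold.py's actual implementation:

```python
# Illustration only: ensure each scaffold directory exists.
from pathlib import Path

DIRECTORIES_TO_CREATE = [
    "experiments/retries",
    "experiments/forks",
    "experiments/logs",
]

for d in DIRECTORIES_TO_CREATE:
    Path(d).mkdir(parents=True, exist_ok=True)
```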