claude-turing 4.3.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/.claude-plugin/plugin.json +5 -5
  2. package/LICENSE +1 -1
  3. package/README.md +78 -552
  4. package/bin/cli.js +23 -4
  5. package/commands/doctor.md +31 -0
  6. package/commands/init.md +21 -3
  7. package/commands/plan.md +27 -0
  8. package/commands/postmortem.md +28 -0
  9. package/commands/turing.md +6 -0
  10. package/config/defaults.yaml +2 -0
  11. package/package.json +5 -5
  12. package/src/install.js +18 -2
  13. package/src/verify.js +45 -2
  14. package/templates/README.md +1 -1
  15. package/templates/__pycache__/evaluate.cpython-312.pyc +0 -0
  16. package/templates/__pycache__/prepare.cpython-312.pyc +0 -0
  17. package/templates/config.yaml +1 -1
  18. package/templates/features/__pycache__/__init__.cpython-312.pyc +0 -0
  19. package/templates/features/__pycache__/featurizers.cpython-312.pyc +0 -0
  20. package/templates/program.md +1 -1
  21. package/templates/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
  22. package/templates/scripts/__pycache__/ablation_study.cpython-312.pyc +0 -0
  23. package/templates/scripts/__pycache__/architecture_surgery.cpython-312.pyc +0 -0
  24. package/templates/scripts/__pycache__/budget_manager.cpython-312.pyc +0 -0
  25. package/templates/scripts/__pycache__/build_ensemble.cpython-312.pyc +0 -0
  26. package/templates/scripts/__pycache__/calibration.cpython-312.pyc +0 -0
  27. package/templates/scripts/__pycache__/check_convergence.cpython-312.pyc +0 -0
  28. package/templates/scripts/__pycache__/checkpoint_manager.cpython-312.pyc +0 -0
  29. package/templates/scripts/__pycache__/citation_manager.cpython-312.pyc +0 -0
  30. package/templates/scripts/__pycache__/cost_frontier.cpython-312.pyc +0 -0
  31. package/templates/scripts/__pycache__/counterfactual_explanation.cpython-312.pyc +0 -0
  32. package/templates/scripts/__pycache__/critique_hypothesis.cpython-312.pyc +0 -0
  33. package/templates/scripts/__pycache__/curriculum_optimizer.cpython-312.pyc +0 -0
  34. package/templates/scripts/__pycache__/diagnose_errors.cpython-312.pyc +0 -0
  35. package/templates/scripts/__pycache__/draft_paper_sections.cpython-312.pyc +0 -0
  36. package/templates/scripts/__pycache__/equivalence_checker.cpython-312.pyc +0 -0
  37. package/templates/scripts/__pycache__/experiment_annotations.cpython-312.pyc +0 -0
  38. package/templates/scripts/__pycache__/experiment_archive.cpython-312.pyc +0 -0
  39. package/templates/scripts/__pycache__/experiment_diff.cpython-312.pyc +0 -0
  40. package/templates/scripts/__pycache__/experiment_index.cpython-312.pyc +0 -0
  41. package/templates/scripts/__pycache__/experiment_queue.cpython-312.pyc +0 -0
  42. package/templates/scripts/__pycache__/experiment_replay.cpython-312.pyc +0 -0
  43. package/templates/scripts/__pycache__/experiment_search.cpython-312.pyc +0 -0
  44. package/templates/scripts/__pycache__/experiment_simulator.cpython-312.pyc +0 -0
  45. package/templates/scripts/__pycache__/experiment_templates.cpython-312.pyc +0 -0
  46. package/templates/scripts/__pycache__/export_card.cpython-312.pyc +0 -0
  47. package/templates/scripts/__pycache__/export_formats.cpython-312.pyc +0 -0
  48. package/templates/scripts/__pycache__/failure_postmortem.cpython-312.pyc +0 -0
  49. package/templates/scripts/__pycache__/failure_postmortem.cpython-314.pyc +0 -0
  50. package/templates/scripts/__pycache__/feature_intelligence.cpython-312.pyc +0 -0
  51. package/templates/scripts/__pycache__/fork_experiment.cpython-312.pyc +0 -0
  52. package/templates/scripts/__pycache__/generate_baselines.cpython-312.pyc +0 -0
  53. package/templates/scripts/__pycache__/generate_brief.cpython-312.pyc +0 -0
  54. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/generate_changelog.cpython-312.pyc +0 -0
  56. package/templates/scripts/__pycache__/generate_figures.cpython-312.pyc +0 -0
  57. package/templates/scripts/__pycache__/generate_logbook.cpython-312.pyc +0 -0
  58. package/templates/scripts/__pycache__/generate_model_card.cpython-312.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_onboarding.cpython-312.pyc +0 -0
  60. package/templates/scripts/__pycache__/harness_doctor.cpython-312.pyc +0 -0
  61. package/templates/scripts/__pycache__/harness_doctor.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/incremental_update.cpython-312.pyc +0 -0
  63. package/templates/scripts/__pycache__/knowledge_transfer.cpython-312.pyc +0 -0
  64. package/templates/scripts/__pycache__/latency_benchmark.cpython-312.pyc +0 -0
  65. package/templates/scripts/__pycache__/leakage_detector.cpython-312.pyc +0 -0
  66. package/templates/scripts/__pycache__/literature_search.cpython-312.pyc +0 -0
  67. package/templates/scripts/__pycache__/log_experiment.cpython-312.pyc +0 -0
  68. package/templates/scripts/__pycache__/manage_hypotheses.cpython-312.pyc +0 -0
  69. package/templates/scripts/__pycache__/methodology_audit.cpython-312.pyc +0 -0
  70. package/templates/scripts/__pycache__/model_distiller.cpython-312.pyc +0 -0
  71. package/templates/scripts/__pycache__/model_lifecycle.cpython-312.pyc +0 -0
  72. package/templates/scripts/__pycache__/model_merger.cpython-312.pyc +0 -0
  73. package/templates/scripts/__pycache__/model_pruning.cpython-312.pyc +0 -0
  74. package/templates/scripts/__pycache__/model_quantization.cpython-312.pyc +0 -0
  75. package/templates/scripts/__pycache__/model_xray.cpython-312.pyc +0 -0
  76. package/templates/scripts/__pycache__/novelty_guard.cpython-312.pyc +0 -0
  77. package/templates/scripts/__pycache__/package_experiments.cpython-312.pyc +0 -0
  78. package/templates/scripts/__pycache__/pareto_frontier.cpython-312.pyc +0 -0
  79. package/templates/scripts/__pycache__/parse_metrics.cpython-312.pyc +0 -0
  80. package/templates/scripts/__pycache__/pipeline_manager.cpython-312.pyc +0 -0
  81. package/templates/scripts/__pycache__/profile_training.cpython-312.pyc +0 -0
  82. package/templates/scripts/__pycache__/regression_gate.cpython-312.pyc +0 -0
  83. package/templates/scripts/__pycache__/reproduce_experiment.cpython-312.pyc +0 -0
  84. package/templates/scripts/__pycache__/research_planner.cpython-312.pyc +0 -0
  85. package/templates/scripts/__pycache__/research_planner.cpython-314.pyc +0 -0
  86. package/templates/scripts/__pycache__/sanity_checks.cpython-312.pyc +0 -0
  87. package/templates/scripts/__pycache__/scaffold.cpython-312.pyc +0 -0
  88. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  89. package/templates/scripts/__pycache__/scaling_estimator.cpython-312.pyc +0 -0
  90. package/templates/scripts/__pycache__/seed_runner.cpython-312.pyc +0 -0
  91. package/templates/scripts/__pycache__/sensitivity_analysis.cpython-312.pyc +0 -0
  92. package/templates/scripts/__pycache__/session_flashback.cpython-312.pyc +0 -0
  93. package/templates/scripts/__pycache__/show_experiment_tree.cpython-312.pyc +0 -0
  94. package/templates/scripts/__pycache__/show_families.cpython-312.pyc +0 -0
  95. package/templates/scripts/__pycache__/simulate_review.cpython-312.pyc +0 -0
  96. package/templates/scripts/__pycache__/smart_retry.cpython-312.pyc +0 -0
  97. package/templates/scripts/__pycache__/statistical_compare.cpython-312.pyc +0 -0
  98. package/templates/scripts/__pycache__/suggest_next.cpython-312.pyc +0 -0
  99. package/templates/scripts/__pycache__/sweep.cpython-312.pyc +0 -0
  100. package/templates/scripts/__pycache__/synthesize_decision.cpython-312.pyc +0 -0
  101. package/templates/scripts/__pycache__/training_monitor.cpython-312.pyc +0 -0
  102. package/templates/scripts/__pycache__/treequest_suggest.cpython-312.pyc +0 -0
  103. package/templates/scripts/__pycache__/trend_analysis.cpython-312.pyc +0 -0
  104. package/templates/scripts/__pycache__/turing_io.cpython-312.pyc +0 -0
  105. package/templates/scripts/__pycache__/update_state.cpython-312.pyc +0 -0
  106. package/templates/scripts/__pycache__/verify_placeholders.cpython-312.pyc +0 -0
  107. package/templates/scripts/__pycache__/warm_start.cpython-312.pyc +0 -0
  108. package/templates/scripts/__pycache__/whatif_engine.cpython-312.pyc +0 -0
  109. package/templates/scripts/failure_postmortem.py +510 -0
  110. package/templates/scripts/generate_brief.py +61 -0
  111. package/templates/scripts/harness_doctor.py +610 -0
  112. package/templates/scripts/research_planner.py +470 -0
  113. package/templates/scripts/scaffold.py +56 -28
@@ -0,0 +1,470 @@
1
+ #!/usr/bin/env python3
2
+ """Research planning assistant for the autoresearch pipeline.
3
+
4
+ Given the current project state, generates a strategic research plan
5
+ that allocates experiments across strategies by expected ROI. Operates
6
+ one level above individual hypotheses — designs campaigns.
7
+
8
+ Usage:
9
+ python scripts/research_planner.py --budget 20
10
+ python scripts/research_planner.py --budget 20 --goal "maximize F1"
11
+ python scripts/research_planner.py --json
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import sys
19
+ from datetime import datetime, timezone
20
+ from pathlib import Path
21
+
22
+ import numpy as np
23
+ import yaml
24
+
25
+ from scripts.turing_io import load_config, load_experiments
26
+
27
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
28
+ DEFAULT_BUDGET = 20
29
+
30
+ # Strategy definitions with base ROI and experiment templates
31
+ STRATEGIES = {
32
+ "feature_engineering": {
33
+ "label": "Feature Engineering",
34
+ "base_priority": 0.4,
35
+ "typical_gain": 0.005,
36
+ "templates": [
37
+ "Automated feature selection (top consensus features)",
38
+ "Interaction feature generation",
39
+ "Domain-specific feature engineering",
40
+ "Feature ablation to prune dead weight",
41
+ ],
42
+ },
43
+ "model_search": {
44
+ "label": "Model Architecture Search",
45
+ "base_priority": 0.25,
46
+ "typical_gain": 0.003,
47
+ "templates": [
48
+ "Try alternative model family",
49
+ "Hyperparameter optimization",
50
+ "Architecture modification",
51
+ ],
52
+ },
53
+ "ensemble": {
54
+ "label": "Ensemble & Composition",
55
+ "base_priority": 0.15,
56
+ "typical_gain": 0.008,
57
+ "templates": [
58
+ "Build stacking ensemble from top diverse models",
59
+ "Model soup from top checkpoints",
60
+ "Pipeline stitch: swap preprocessing into ensemble",
61
+ ],
62
+ },
63
+ "calibration": {
64
+ "label": "Production Readiness",
65
+ "base_priority": 0.1,
66
+ "typical_gain": 0.001,
67
+ "templates": [
68
+ "Probability calibration (Platt/isotonic)",
69
+ "Post-training quantization (INT8)",
70
+ "Weight pruning (find sparsity knee point)",
71
+ "Full seed study on final model",
72
+ ],
73
+ },
74
+ "verification": {
75
+ "label": "Verification & Documentation",
76
+ "base_priority": 0.1,
77
+ "typical_gain": 0.0,
78
+ "templates": [
79
+ "Reproduce final model",
80
+ "Run methodology audit",
81
+ "Generate model card",
82
+ ],
83
+ },
84
+ }
85
+
86
+
87
+ # --- ROI Analysis ---
88
+
89
+
90
def compute_family_roi(
    experiments: list[dict],
    primary_metric: str,
    lower_is_better: bool = False,
) -> dict[str, dict]:
    """Compute ROI (improvement per experiment) for each experiment family.

    Args:
        experiments: Experiment log entries. Family is read from the top-level
            "family" key, falling back to config.family, then "unknown".
        primary_metric: Metric name looked up in each experiment's "metrics".
        lower_is_better: If True, improvement means the metric decreased.

    Returns:
        Dict of {family: {n_experiments, total_improvement, roi_per_experiment,
        exhausted, best_metric}}.
    """
    families: dict[str, dict] = {}

    for exp in experiments:
        family = exp.get("family", exp.get("config", {}).get("family", "unknown"))
        if family not in families:
            families[family] = {"experiments": [], "metrics": []}
        families[family]["experiments"].append(exp)

        val = exp.get("metrics", {}).get(primary_metric)
        if val is not None:
            families[family]["metrics"].append(float(val))

    result = {}
    for family, data in families.items():
        metrics = data["metrics"]
        n_exps = len(data["experiments"])

        if len(metrics) < 2:
            roi = 0.0
            exhausted = False
        else:
            # Improvement is measured from the family's first logged metric.
            if lower_is_better:
                improvement = metrics[0] - min(metrics)
            else:
                improvement = max(metrics) - metrics[0]
            roi = improvement / n_exps if n_exps > 0 else 0.0

            # A family is "exhausted" when its last 3 runs are within noise.
            recent = metrics[-3:]
            exhausted = len(recent) >= 3 and (max(recent) - min(recent)) < 0.002

        result[family] = {
            "n_experiments": n_exps,
            "total_improvement": round(float(max(metrics) - min(metrics)) if metrics else 0, 6),
            "roi_per_experiment": round(float(roi), 6),
            "exhausted": exhausted,
            # Fix: respect metric direction — "best" is the minimum when lower is better.
            "best_metric": round(float(min(metrics) if lower_is_better else max(metrics)), 6) if metrics else None,
        }

    return result
140
+
141
+
142
+ def adjust_priorities(
143
+ base_strategies: dict,
144
+ family_roi: dict[str, dict],
145
+ experiments: list[dict],
146
+ primary_metric: str,
147
+ goal: str | None = None,
148
+ ) -> dict[str, float]:
149
+ """Adjust strategy priorities based on project state.
150
+
151
+ Returns:
152
+ Dict of {strategy_name: adjusted_priority}.
153
+ """
154
+ priorities = {name: s["base_priority"] for name, s in base_strategies.items()}
155
+
156
+ n_total = len(experiments)
157
+
158
+ # Boost feature engineering if it has high ROI
159
+ fe_families = [f for f, data in family_roi.items()
160
+ if "feature" in f.lower() and not data["exhausted"]]
161
+ if fe_families:
162
+ priorities["feature_engineering"] *= 1.3
163
+
164
+ # Reduce model search if exhausted
165
+ model_families = [f for f, data in family_roi.items()
166
+ if ("tuning" in f.lower() or "architecture" in f.lower()) and data["exhausted"]]
167
+ if model_families:
168
+ priorities["model_search"] *= 0.5
169
+
170
+ # Boost ensemble if enough diverse models exist
171
+ n_kept = sum(1 for e in experiments if e.get("status") == "kept")
172
+ if n_kept >= 5:
173
+ priorities["ensemble"] *= 1.2
174
+
175
+ # Boost verification if many experiments done
176
+ if n_total >= 20:
177
+ priorities["verification"] *= 1.5
178
+
179
+ # Goal-based adjustments
180
+ if goal:
181
+ goal_lower = goal.lower()
182
+ if "production" in goal_lower or "deploy" in goal_lower:
183
+ priorities["calibration"] *= 2.0
184
+ priorities["verification"] *= 1.5
185
+ if "f1" in goal_lower or "accuracy" in goal_lower:
186
+ priorities["feature_engineering"] *= 1.2
187
+ priorities["ensemble"] *= 1.2
188
+
189
+ # Normalize
190
+ total = sum(priorities.values())
191
+ if total > 0:
192
+ priorities = {k: round(v / total, 3) for k, v in priorities.items()}
193
+
194
+ return priorities
195
+
196
+
197
+ # --- Plan Generation ---
198
+
199
+
200
def allocate_budget(
    priorities: dict[str, float],
    budget: int,
    min_per_strategy: int = 1,
) -> dict[str, int]:
    """Allocate experiment budget across strategies.

    Args:
        priorities: Strategy priorities (sum to ~1.0).
        budget: Total experiment budget.
        min_per_strategy: Minimum experiments per active strategy.

    Returns:
        Dict of {strategy: n_experiments}. Allocations sum to ``budget``;
        strategies that received nothing may be absent from the dict.
    """
    # Fix: guard the empty case — max() below would raise on an empty dict.
    if not priorities:
        return {}

    if budget <= 0:
        return {k: 0 for k in priorities}

    # Greedy allocation in descending priority order.
    allocation: dict[str, int] = {}
    remaining = budget

    for strategy, priority in sorted(priorities.items(), key=lambda x: -x[1]):
        n = max(min_per_strategy, round(budget * priority))
        n = min(n, remaining)
        allocation[strategy] = n
        remaining -= n
        if remaining <= 0:
            break

    # Any leftover budget goes to the highest-priority strategy.
    if remaining > 0:
        top_strategy = max(priorities, key=priorities.get)
        allocation[top_strategy] = allocation.get(top_strategy, 0) + remaining

    return allocation
236
+
237
+
238
+ def generate_plan(
239
+ allocation: dict[str, int],
240
+ strategies: dict,
241
+ family_roi: dict[str, dict],
242
+ current_best: float | None = None,
243
+ primary_metric: str = "accuracy",
244
+ ) -> dict:
245
+ """Generate a structured research plan from budget allocation.
246
+
247
+ Returns:
248
+ Plan with phases, experiment descriptions, and expected outcome.
249
+ """
250
+ phases = []
251
+ exp_counter = 1
252
+
253
+ for strategy_name, n_exps in allocation.items():
254
+ if n_exps <= 0:
255
+ continue
256
+
257
+ strategy = strategies.get(strategy_name, {})
258
+ templates = strategy.get("templates", [])
259
+ typical_gain = strategy.get("typical_gain", 0)
260
+
261
+ experiments = []
262
+ for i in range(n_exps):
263
+ template = templates[i % len(templates)] if templates else f"Experiment {exp_counter}"
264
+ experiments.append({
265
+ "number": exp_counter,
266
+ "description": template,
267
+ })
268
+ exp_counter += 1
269
+
270
+ pct = round(n_exps / sum(allocation.values()) * 100) if sum(allocation.values()) > 0 else 0
271
+
272
+ phases.append({
273
+ "name": strategy_name,
274
+ "label": strategy.get("label", strategy_name),
275
+ "n_experiments": n_exps,
276
+ "budget_pct": pct,
277
+ "rationale": _phase_rationale(strategy_name, family_roi),
278
+ "experiments": experiments,
279
+ "expected_gain": round(typical_gain * n_exps, 4),
280
+ })
281
+
282
+ # Estimate expected outcome
283
+ total_expected_gain = sum(p["expected_gain"] for p in phases)
284
+ expected_metric = round(current_best + total_expected_gain, 4) if current_best else None
285
+
286
+ return {
287
+ "phases": phases,
288
+ "total_experiments": exp_counter - 1,
289
+ "expected_metric": expected_metric,
290
+ "expected_gain": round(total_expected_gain, 4),
291
+ "primary_metric": primary_metric,
292
+ }
293
+
294
+
295
+ def _phase_rationale(strategy_name: str, family_roi: dict) -> str:
296
+ """Generate rationale for a phase allocation."""
297
+ rationales = {
298
+ "feature_engineering": "Highest ROI direction — feature improvements compound across models",
299
+ "model_search": "Explore alternative architectures for potential step-change improvement",
300
+ "ensemble": "Combine existing models for 1-3% improvement at zero additional training cost",
301
+ "calibration": "Required for production deployment — probability calibration and model compression",
302
+ "verification": "Final validation — reproduce results, audit methodology, generate model card",
303
+ }
304
+ return rationales.get(strategy_name, "Strategic allocation")
305
+
306
+
307
+ # --- Full Pipeline ---
308
+
309
+
310
def create_research_plan(
    budget: int = DEFAULT_BUDGET,
    goal: str | None = None,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
) -> dict:
    """Create a strategic research plan.

    Args:
        budget: Total experiment budget.
        goal: Optional goal description.
        config_path: Path to config.yaml.
        log_path: Path to experiment log.

    Returns:
        Research plan with phases, allocation, and expected outcome.
    """
    cfg = load_config(config_path)
    evaluation = cfg.get("evaluation", {})
    metric_name = evaluation.get("primary_metric", "accuracy")
    minimize = evaluation.get("lower_is_better", False)

    history = load_experiments(log_path)

    # Cold start: no history yet, so hand the whole budget to model search.
    if not history:
        return {
            "budget": budget,
            "goal": goal,
            "message": "No experiment history — start with /turing:train first",
            "plan": generate_plan(
                {"model_search": budget},
                STRATEGIES, {},
                primary_metric=metric_name,
            ),
            "generated_at": datetime.now(timezone.utc).isoformat(),
        }

    # Per-family ROI from the experiment log.
    roi_by_family = compute_family_roi(history, metric_name, minimize)

    # Best observed value of the primary metric, respecting its direction.
    observed = [
        exp.get("metrics", {}).get(metric_name)
        for exp in history
        if exp.get("metrics", {}).get(metric_name) is not None
    ]
    if not observed:
        best_so_far = None
    elif minimize:
        best_so_far = min(observed)
    else:
        best_so_far = max(observed)

    # Priorities -> budget allocation -> structured plan.
    weights = adjust_priorities(STRATEGIES, roi_by_family, history, metric_name, goal)
    per_strategy = allocate_budget(weights, budget)
    plan = generate_plan(per_strategy, STRATEGIES, roi_by_family, best_so_far, metric_name)

    return {
        "budget": budget,
        "goal": goal,
        "current_best": best_so_far,
        "primary_metric": metric_name,
        "n_experiments_so_far": len(history),
        "family_roi": roi_by_family,
        "priorities": weights,
        "allocation": per_strategy,
        "plan": plan,
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
376
+
377
+
378
+ # --- Report Formatting ---
379
+
380
+
381
def save_plan_report(report: dict, output_dir: str = "experiments/plans") -> Path:
    """Save the research plan report to a timestamped YAML file.

    Args:
        report: Plan report as produced by create_research_plan().
        output_dir: Directory to write into (created if missing).

    Returns:
        Path of the written YAML file.
    """
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    filepath = out_path / f"plan-{ts}.yaml"
    # Fix: pin the encoding — the report text contains non-ASCII characters
    # (em dashes, arrows) and the platform default encoding may not be UTF-8.
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
    return filepath
390
+
391
+
392
def format_plan_report(report: dict) -> str:
    """Format a research plan report as readable markdown.

    Args:
        report: Plan report as produced by create_research_plan().

    Returns:
        Markdown string (or the bare message for history-less reports).
    """
    if "message" in report and "plan" not in report:
        return report["message"]

    plan = report.get("plan", {})
    budget = report.get("budget", 0)
    goal = report.get("goal", "maximize primary metric")

    lines = [
        f"# Research Plan ({budget} experiments, goal: {goal or 'maximize metric'})",
        "",
    ]

    # Fix: compare against None — a current best of exactly 0.0 must still show.
    if report.get("current_best") is not None:
        lines.append(f"**Current best:** {report['primary_metric']}={report['current_best']}")
        lines.append("")

    phases = plan.get("phases", [])
    phase_label = "A"

    for phase in phases:
        lines.append(f"## Phase {phase_label}: {phase['label']} ({phase['n_experiments']} experiments, {phase['budget_pct']}% of budget)")
        lines.append(f"*Rationale: {phase['rationale']}*")
        lines.append("")

        for exp in phase.get("experiments", []):
            lines.append(f" {exp['number']}. {exp['description']}")

        lines.append("")
        # Phases are labeled A, B, C, ... in allocation order.
        phase_label = chr(ord(phase_label) + 1)

    expected = plan.get("expected_metric")
    gain = plan.get("expected_gain", 0)
    # Fix: likewise, an expected metric of exactly 0.0 is a real value.
    if expected is not None:
        lines.append(f"**Expected outcome:** {report.get('primary_metric', 'metric')} {report.get('current_best', '?')} → {expected} (+{gain})")
    else:
        lines.append(f"**Expected gain:** +{gain}")

    lines.extend(["", f"*Generated: {report.get('generated_at', 'N/A')}*"])
    return "\n".join(lines)
433
+
434
+
435
+ # --- CLI ---
436
+
437
+
438
def main():
    """CLI entry point: parse args, build the plan, print and save it."""
    parser = argparse.ArgumentParser(
        description="Research planning assistant — strategic experiment campaign design"
    )
    parser.add_argument("--budget", type=int, default=DEFAULT_BUDGET,
                        help="Total experiment budget")
    parser.add_argument("--goal", help="Goal description (e.g., 'maximize F1 for production')")
    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    report = create_research_plan(
        budget=args.budget,
        goal=args.goal,
        config_path=args.config,
        log_path=args.log,
    )

    # Raw JSON for machine consumers, markdown for humans.
    rendered = json.dumps(report, indent=2) if args.json else format_plan_report(report)
    print(rendered)

    # Persist the plan unless report generation flagged an error.
    if "error" not in report:
        saved = save_plan_report(report)
        if not args.json:
            print(f"\nSaved: {saved}")


if __name__ == "__main__":
    main()
@@ -34,6 +34,8 @@ PLACEHOLDER_MAP = {
34
34
  "ML_DIR": "ml_dir",
35
35
  "DATA_SOURCE": "data_source",
36
36
  "METRIC_DIRECTION": "metric_direction",
37
+ "LOWER_IS_BETTER": "lower_is_better",
38
+ "MEMORY_DIR_NAME": "memory_dir_name",
37
39
  }
38
40
 
39
41
  # Files to copy from templates/ to the ML directory
@@ -148,6 +150,9 @@ TEMPLATE_DIRS = {
148
150
  "experiment_simulator.py",
149
151
  "incremental_update.py",
150
152
  "model_lifecycle.py",
153
+ "failure_postmortem.py",
154
+ "harness_doctor.py",
155
+ "research_planner.py",
151
156
  ],
152
157
  "tests": ["__init__.py", "conftest.py"],
153
158
  }
@@ -203,6 +208,9 @@ DIRECTORIES_TO_CREATE = [
203
208
  "experiments/counterfactuals",
204
209
  "experiments/simulations",
205
210
  "experiments/updates",
211
+ "experiments/postmortems",
212
+ "experiments/doctor",
213
+ "experiments/plans",
206
214
  "exports/model-cards",
207
215
  "experiments/logs",
208
216
  "models/best",
@@ -217,32 +225,49 @@ SHELL_SCRIPTS = [
217
225
 
218
226
def find_templates_dir() -> Path | None:
    """Locate the templates directory relative to this script or plugin root."""
    # An explicit override always wins, if it actually holds the templates.
    override = os.environ.get("TURING_TEMPLATES_DIR")
    if override:
        override_path = Path(override).expanduser()
        if (override_path / "prepare.py").exists():
            return override_path

    here = Path(__file__).parent
    cwd = Path.cwd()

    # Project-local command templates, searched from the cwd upward.
    nearby_command_dirs = [
        ancestor / ".claude" / "commands" / "turing" / "templates"
        for ancestor in [cwd, *cwd.parents]
    ]

    # Fixed candidate locations, in priority order; prepare.py marks a
    # valid templates directory.
    candidates = [
        here.parent,                               # running from templates/scripts/
        here.parent.parent / "templates",          # plugin-root layout
        *nearby_command_dirs,
        Path.home() / ".claude" / "commands" / "turing" / "templates",
        cwd / "node_modules" / "claude-turing" / "templates",
    ]
    for location in candidates:
        if (location / "prepare.py").exists():
            return location

    # Last resort: scan every installed plugin for a templates directory.
    plugins_root = Path.home() / ".claude" / "plugins"
    for plugin_templates in sorted(plugins_root.glob("*/templates")):
        if (plugin_templates / "prepare.py").exists():
            return plugin_templates

    return None
244
257
 
245
258
 
259
def derive_values(values: dict[str, str]) -> dict[str, str]:
    """Add scaffold values derived from user-provided fields."""
    out = {**values}
    # String boolean for templates: metric direction "lower" means minimize.
    direction = out.get("metric_direction", "")
    out["lower_is_better"] = "true" if direction.lower() == "lower" else "false"
    # Filesystem-safe directory name: collapse runs of unsafe chars to "-".
    safe_name = re.sub(r"[^a-zA-Z0-9_.-]+", "-", out["project_name"])
    out["memory_dir_name"] = safe_name.strip("-")
    return out
269
+
270
+
246
271
  def replace_placeholders(text: str, values: dict[str, str]) -> str:
247
272
  """Replace all {{PLACEHOLDER}} markers in text with values."""
248
273
  for placeholder, arg_name in PLACEHOLDER_MAP.items():
@@ -270,6 +295,7 @@ def scaffold_project(
270
295
  Returns:
271
296
  Dict with counts: files_copied, placeholders_replaced, dirs_created.
272
297
  """
298
+ values = derive_values(values)
273
299
  target = Path(ml_dir)
274
300
  target.mkdir(parents=True, exist_ok=True)
275
301
 
@@ -323,7 +349,7 @@ def scaffold_project(
323
349
  continue
324
350
 
325
351
  # Setup agent memory
326
- memory_dir = Path(".claude") / "agent-memory" / "ml-researcher"
352
+ memory_dir = Path(".claude") / "agent-memory" / f"ml-researcher-{values['memory_dir_name']}"
327
353
  memory_dir.mkdir(parents=True, exist_ok=True)
328
354
  memory_src = templates_dir / "MEMORY.md"
329
355
  if memory_src.exists():
@@ -342,6 +368,14 @@ def scaffold_project(
342
368
  return stats
343
369
 
344
370
 
371
+ def make_command_hook_group(command: str, matcher: str = "") -> dict:
372
+ """Build a Claude Code command hook group."""
373
+ return {
374
+ "matcher": matcher,
375
+ "hooks": [{"type": "command", "command": command}],
376
+ }
377
+
378
+
345
379
  def _setup_hooks(ml_dir: str) -> None:
346
380
  """Configure Claude Code hooks in .claude/settings.local.json."""
347
381
  settings_path = Path(".claude") / "settings.local.json"
@@ -360,20 +394,14 @@ def _setup_hooks(ml_dir: str) -> None:
360
394
  post_hooks = hooks.get("PostToolUse", [])
361
395
  post_hook_cmd = f"bash {ml_dir}/scripts/post-train-hook.sh"
362
396
  if not any(post_hook_cmd in str(h) for h in post_hooks):
363
- post_hooks.append({
364
- "matcher": "Bash",
365
- "hooks": [{"type": "command", "command": post_hook_cmd}],
366
- })
397
+ post_hooks.append(make_command_hook_group(post_hook_cmd, matcher="Bash"))
367
398
  hooks["PostToolUse"] = post_hooks
368
399
 
369
400
  # Stop hook for convergence
370
401
  stop_hooks = hooks.get("Stop", [])
371
402
  stop_hook_cmd = f"bash {ml_dir}/scripts/stop-hook.sh"
372
403
  if not any(stop_hook_cmd in str(h) for h in stop_hooks):
373
- stop_hooks.append({
374
- "type": "command",
375
- "command": stop_hook_cmd,
376
- })
404
+ stop_hooks.append(make_command_hook_group(stop_hook_cmd))
377
405
  hooks["Stop"] = stop_hooks
378
406
 
379
407
  settings["hooks"] = hooks