claude-turing 4.3.0 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +5 -5
- package/LICENSE +1 -1
- package/README.md +78 -552
- package/bin/cli.js +23 -4
- package/commands/doctor.md +31 -0
- package/commands/init.md +21 -3
- package/commands/plan.md +27 -0
- package/commands/postmortem.md +28 -0
- package/commands/turing.md +6 -0
- package/config/defaults.yaml +2 -0
- package/package.json +5 -5
- package/src/install.js +18 -2
- package/src/verify.js +45 -2
- package/templates/README.md +1 -1
- package/templates/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-312.pyc +0 -0
- package/templates/config.yaml +1 -1
- package/templates/features/__pycache__/__init__.cpython-312.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-312.pyc +0 -0
- package/templates/program.md +1 -1
- package/templates/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/ablation_study.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/architecture_surgery.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/budget_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/build_ensemble.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/calibration.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/checkpoint_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/citation_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/counterfactual_explanation.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/curriculum_optimizer.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/diagnose_errors.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/draft_paper_sections.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_annotations.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_archive.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_diff.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_queue.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_replay.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_search.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_simulator.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_templates.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/failure_postmortem.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/failure_postmortem.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/feature_intelligence.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/fork_experiment.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_baselines.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_changelog.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_figures.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/generate_onboarding.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/harness_doctor.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/harness_doctor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/incremental_update.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/knowledge_transfer.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/leakage_detector.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/literature_search.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/methodology_audit.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_distiller.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_lifecycle.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_merger.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_pruning.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_quantization.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/model_xray.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/package_experiments.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/pareto_frontier.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/pipeline_manager.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/profile_training.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/regression_gate.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/reproduce_experiment.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/research_planner.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/research_planner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sanity_checks.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaling_estimator.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/seed_runner.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/sensitivity_analysis.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/session_flashback.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/simulate_review.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/smart_retry.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/training_monitor.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/trend_analysis.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/warm_start.cpython-312.pyc +0 -0
- package/templates/scripts/__pycache__/whatif_engine.cpython-312.pyc +0 -0
- package/templates/scripts/failure_postmortem.py +510 -0
- package/templates/scripts/generate_brief.py +61 -0
- package/templates/scripts/harness_doctor.py +610 -0
- package/templates/scripts/research_planner.py +470 -0
- package/templates/scripts/scaffold.py +56 -28
|
@@ -0,0 +1,470 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Research planning assistant for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Given the current project state, generates a strategic research plan
|
|
5
|
+
that allocates experiments across strategies by expected ROI. Operates
|
|
6
|
+
one level above individual hypotheses — designs campaigns.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/research_planner.py --budget 20
|
|
10
|
+
python scripts/research_planner.py --budget 20 --goal "maximize F1"
|
|
11
|
+
python scripts/research_planner.py --json
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import yaml
|
|
24
|
+
|
|
25
|
+
from scripts.turing_io import load_config, load_experiments
|
|
26
|
+
|
|
27
|
+
# Default location of the append-only experiment log (JSONL, one record per run).
DEFAULT_LOG_PATH = "experiments/log.jsonl"
# Default number of experiments a generated campaign plans for.
DEFAULT_BUDGET = 20

# Strategy definitions with base ROI and experiment templates.
# Each strategy carries:
#   label         - human-readable phase name used in generated reports
#   base_priority - starting weight before adjust_priorities() rescales it
#   typical_gain  - assumed metric improvement per experiment, used for the
#                   plan's expected-outcome estimate in generate_plan()
#   templates     - experiment descriptions cycled through when a phase is
#                   allocated more experiments than it has templates
STRATEGIES = {
    "feature_engineering": {
        "label": "Feature Engineering",
        "base_priority": 0.4,
        "typical_gain": 0.005,
        "templates": [
            "Automated feature selection (top consensus features)",
            "Interaction feature generation",
            "Domain-specific feature engineering",
            "Feature ablation to prune dead weight",
        ],
    },
    "model_search": {
        "label": "Model Architecture Search",
        "base_priority": 0.25,
        "typical_gain": 0.003,
        "templates": [
            "Try alternative model family",
            "Hyperparameter optimization",
            "Architecture modification",
        ],
    },
    "ensemble": {
        "label": "Ensemble & Composition",
        "base_priority": 0.15,
        "typical_gain": 0.008,
        "templates": [
            "Build stacking ensemble from top diverse models",
            "Model soup from top checkpoints",
            "Pipeline stitch: swap preprocessing into ensemble",
        ],
    },
    "calibration": {
        "label": "Production Readiness",
        "base_priority": 0.1,
        "typical_gain": 0.001,
        "templates": [
            "Probability calibration (Platt/isotonic)",
            "Post-training quantization (INT8)",
            "Weight pruning (find sparsity knee point)",
            "Full seed study on final model",
        ],
    },
    "verification": {
        "label": "Verification & Documentation",
        "base_priority": 0.1,
        # typical_gain of 0.0: verification work never moves the metric.
        "typical_gain": 0.0,
        "templates": [
            "Reproduce final model",
            "Run methodology audit",
            "Generate model card",
        ],
    },
}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# --- ROI Analysis ---
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def compute_family_roi(
    experiments: list[dict],
    primary_metric: str,
    lower_is_better: bool = False,
) -> dict[str, dict]:
    """Compute ROI (improvement per experiment) for each experiment family.

    Args:
        experiments: Experiment records; the family is read from
            ``exp["family"]`` or ``exp["config"]["family"]`` and the metric
            from ``exp["metrics"][primary_metric]``.
        primary_metric: Name of the metric used to measure improvement.
        lower_is_better: If True, improvement means the metric decreased
            relative to the family's first logged value.

    Returns:
        Dict of {family: {n_experiments, total_improvement,
        roi_per_experiment, exhausted, best_metric}}.
    """
    families: dict[str, dict] = {}

    for exp in experiments:
        family = exp.get("family", exp.get("config", {}).get("family", "unknown"))
        if family not in families:
            families[family] = {"experiments": [], "metrics": []}
        families[family]["experiments"].append(exp)

        val = exp.get("metrics", {}).get(primary_metric)
        if val is not None:
            families[family]["metrics"].append(float(val))

    result = {}
    for family, data in families.items():
        metrics = data["metrics"]
        n_exps = len(data["experiments"])

        if len(metrics) < 2:
            roi = 0.0
            exhausted = False
        else:
            # Improvement is measured against the family's first logged metric.
            if lower_is_better:
                improvement = metrics[0] - min(metrics)
            else:
                improvement = max(metrics) - metrics[0]
            roi = improvement / n_exps if n_exps > 0 else 0.0

            # A family is "exhausted" when its last 3 experiments moved < 0.002.
            recent = metrics[-3:]
            exhausted = len(recent) >= 3 and (max(recent) - min(recent)) < 0.002

        # Bug fix: best_metric must respect metric direction — for
        # lower-is-better metrics the best value is the minimum, not the max.
        if metrics:
            best = min(metrics) if lower_is_better else max(metrics)
        else:
            best = None

        result[family] = {
            "n_experiments": n_exps,
            "total_improvement": round(float(max(metrics) - min(metrics)) if metrics else 0, 6),
            "roi_per_experiment": round(float(roi), 6),
            "exhausted": exhausted,
            "best_metric": round(float(best), 6) if best is not None else None,
        }

    return result
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def adjust_priorities(
|
|
143
|
+
base_strategies: dict,
|
|
144
|
+
family_roi: dict[str, dict],
|
|
145
|
+
experiments: list[dict],
|
|
146
|
+
primary_metric: str,
|
|
147
|
+
goal: str | None = None,
|
|
148
|
+
) -> dict[str, float]:
|
|
149
|
+
"""Adjust strategy priorities based on project state.
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
Dict of {strategy_name: adjusted_priority}.
|
|
153
|
+
"""
|
|
154
|
+
priorities = {name: s["base_priority"] for name, s in base_strategies.items()}
|
|
155
|
+
|
|
156
|
+
n_total = len(experiments)
|
|
157
|
+
|
|
158
|
+
# Boost feature engineering if it has high ROI
|
|
159
|
+
fe_families = [f for f, data in family_roi.items()
|
|
160
|
+
if "feature" in f.lower() and not data["exhausted"]]
|
|
161
|
+
if fe_families:
|
|
162
|
+
priorities["feature_engineering"] *= 1.3
|
|
163
|
+
|
|
164
|
+
# Reduce model search if exhausted
|
|
165
|
+
model_families = [f for f, data in family_roi.items()
|
|
166
|
+
if ("tuning" in f.lower() or "architecture" in f.lower()) and data["exhausted"]]
|
|
167
|
+
if model_families:
|
|
168
|
+
priorities["model_search"] *= 0.5
|
|
169
|
+
|
|
170
|
+
# Boost ensemble if enough diverse models exist
|
|
171
|
+
n_kept = sum(1 for e in experiments if e.get("status") == "kept")
|
|
172
|
+
if n_kept >= 5:
|
|
173
|
+
priorities["ensemble"] *= 1.2
|
|
174
|
+
|
|
175
|
+
# Boost verification if many experiments done
|
|
176
|
+
if n_total >= 20:
|
|
177
|
+
priorities["verification"] *= 1.5
|
|
178
|
+
|
|
179
|
+
# Goal-based adjustments
|
|
180
|
+
if goal:
|
|
181
|
+
goal_lower = goal.lower()
|
|
182
|
+
if "production" in goal_lower or "deploy" in goal_lower:
|
|
183
|
+
priorities["calibration"] *= 2.0
|
|
184
|
+
priorities["verification"] *= 1.5
|
|
185
|
+
if "f1" in goal_lower or "accuracy" in goal_lower:
|
|
186
|
+
priorities["feature_engineering"] *= 1.2
|
|
187
|
+
priorities["ensemble"] *= 1.2
|
|
188
|
+
|
|
189
|
+
# Normalize
|
|
190
|
+
total = sum(priorities.values())
|
|
191
|
+
if total > 0:
|
|
192
|
+
priorities = {k: round(v / total, 3) for k, v in priorities.items()}
|
|
193
|
+
|
|
194
|
+
return priorities
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# --- Plan Generation ---
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def allocate_budget(
    priorities: dict[str, float],
    budget: int,
    min_per_strategy: int = 1,
) -> dict[str, int]:
    """Allocate experiment budget across strategies.

    Strategies are visited in descending priority order; each receives at
    least ``min_per_strategy`` experiments (capped by what remains), and any
    leftover budget from rounding goes to the top-priority strategy.

    Args:
        priorities: Strategy priorities (sum to ~1.0).
        budget: Total experiment budget.
        min_per_strategy: Minimum experiments per active strategy.

    Returns:
        Dict of {strategy: n_experiments}.
    """
    if budget <= 0:
        return {k: 0 for k in priorities}

    # Bug fix: with no strategies there is nothing to allocate to; previously
    # max() on the empty dict below raised ValueError when budget > 0.
    if not priorities:
        return {}

    allocation: dict[str, int] = {}
    remaining = budget

    # Highest-priority strategies are allocated first so they are never
    # starved by the min_per_strategy floor of lower-priority ones.
    for strategy, priority in sorted(priorities.items(), key=lambda x: -x[1]):
        n = max(min_per_strategy, round(budget * priority))
        n = min(n, remaining)
        allocation[strategy] = n
        remaining -= n
        if remaining <= 0:
            break

    # Distribute any remaining budget to the top strategy.
    if remaining > 0:
        top_strategy = max(priorities, key=priorities.get)
        allocation[top_strategy] = allocation.get(top_strategy, 0) + remaining

    return allocation
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def generate_plan(
|
|
239
|
+
allocation: dict[str, int],
|
|
240
|
+
strategies: dict,
|
|
241
|
+
family_roi: dict[str, dict],
|
|
242
|
+
current_best: float | None = None,
|
|
243
|
+
primary_metric: str = "accuracy",
|
|
244
|
+
) -> dict:
|
|
245
|
+
"""Generate a structured research plan from budget allocation.
|
|
246
|
+
|
|
247
|
+
Returns:
|
|
248
|
+
Plan with phases, experiment descriptions, and expected outcome.
|
|
249
|
+
"""
|
|
250
|
+
phases = []
|
|
251
|
+
exp_counter = 1
|
|
252
|
+
|
|
253
|
+
for strategy_name, n_exps in allocation.items():
|
|
254
|
+
if n_exps <= 0:
|
|
255
|
+
continue
|
|
256
|
+
|
|
257
|
+
strategy = strategies.get(strategy_name, {})
|
|
258
|
+
templates = strategy.get("templates", [])
|
|
259
|
+
typical_gain = strategy.get("typical_gain", 0)
|
|
260
|
+
|
|
261
|
+
experiments = []
|
|
262
|
+
for i in range(n_exps):
|
|
263
|
+
template = templates[i % len(templates)] if templates else f"Experiment {exp_counter}"
|
|
264
|
+
experiments.append({
|
|
265
|
+
"number": exp_counter,
|
|
266
|
+
"description": template,
|
|
267
|
+
})
|
|
268
|
+
exp_counter += 1
|
|
269
|
+
|
|
270
|
+
pct = round(n_exps / sum(allocation.values()) * 100) if sum(allocation.values()) > 0 else 0
|
|
271
|
+
|
|
272
|
+
phases.append({
|
|
273
|
+
"name": strategy_name,
|
|
274
|
+
"label": strategy.get("label", strategy_name),
|
|
275
|
+
"n_experiments": n_exps,
|
|
276
|
+
"budget_pct": pct,
|
|
277
|
+
"rationale": _phase_rationale(strategy_name, family_roi),
|
|
278
|
+
"experiments": experiments,
|
|
279
|
+
"expected_gain": round(typical_gain * n_exps, 4),
|
|
280
|
+
})
|
|
281
|
+
|
|
282
|
+
# Estimate expected outcome
|
|
283
|
+
total_expected_gain = sum(p["expected_gain"] for p in phases)
|
|
284
|
+
expected_metric = round(current_best + total_expected_gain, 4) if current_best else None
|
|
285
|
+
|
|
286
|
+
return {
|
|
287
|
+
"phases": phases,
|
|
288
|
+
"total_experiments": exp_counter - 1,
|
|
289
|
+
"expected_metric": expected_metric,
|
|
290
|
+
"expected_gain": round(total_expected_gain, 4),
|
|
291
|
+
"primary_metric": primary_metric,
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _phase_rationale(strategy_name: str, family_roi: dict) -> str:
|
|
296
|
+
"""Generate rationale for a phase allocation."""
|
|
297
|
+
rationales = {
|
|
298
|
+
"feature_engineering": "Highest ROI direction — feature improvements compound across models",
|
|
299
|
+
"model_search": "Explore alternative architectures for potential step-change improvement",
|
|
300
|
+
"ensemble": "Combine existing models for 1-3% improvement at zero additional training cost",
|
|
301
|
+
"calibration": "Required for production deployment — probability calibration and model compression",
|
|
302
|
+
"verification": "Final validation — reproduce results, audit methodology, generate model card",
|
|
303
|
+
}
|
|
304
|
+
return rationales.get(strategy_name, "Strategic allocation")
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
# --- Full Pipeline ---
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def create_research_plan(
    budget: int = DEFAULT_BUDGET,
    goal: str | None = None,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
) -> dict:
    """Create a strategic research plan.

    Loads project config and experiment history, scores each experiment
    family's ROI, adjusts strategy priorities, allocates the budget, and
    generates a phased plan.

    Args:
        budget: Total experiment budget.
        goal: Optional goal description.
        config_path: Path to config.yaml.
        log_path: Path to experiment log.

    Returns:
        Research plan with phases, allocation, and expected outcome.
    """
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(log_path)

    # With no history, fall back to a pure model-search campaign.
    if not experiments:
        return {
            "budget": budget,
            "goal": goal,
            "message": "No experiment history — start with /turing:train first",
            "plan": generate_plan(
                {"model_search": budget},
                STRATEGIES, {},
                primary_metric=primary_metric,
            ),
            "generated_at": datetime.now(timezone.utc).isoformat(),
        }

    family_roi = compute_family_roi(experiments, primary_metric, lower_is_better)

    # Current best value of the primary metric, respecting metric direction.
    metric_values = [
        exp.get("metrics", {}).get(primary_metric)
        for exp in experiments
        if exp.get("metrics", {}).get(primary_metric) is not None
    ]
    if not metric_values:
        current_best = None
    elif lower_is_better:
        current_best = min(metric_values)
    else:
        current_best = max(metric_values)

    priorities = adjust_priorities(STRATEGIES, family_roi, experiments, primary_metric, goal)
    allocation = allocate_budget(priorities, budget)
    plan = generate_plan(allocation, STRATEGIES, family_roi, current_best, primary_metric)

    return {
        "budget": budget,
        "goal": goal,
        "current_best": current_best,
        "primary_metric": primary_metric,
        "n_experiments_so_far": len(experiments),
        "family_roi": family_roi,
        "priorities": priorities,
        "allocation": allocation,
        "plan": plan,
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# --- Report Formatting ---
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def save_plan_report(report: dict, output_dir: str = "experiments/plans") -> Path:
    """Write the research plan to a timestamped YAML file and return its path."""
    plans_dir = Path(output_dir)
    plans_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    destination = plans_dir / f"plan-{stamp}.yaml"
    with open(destination, "w") as fh:
        yaml.dump(report, fh, default_flow_style=False, sort_keys=False)
    return destination
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def format_plan_report(report: dict) -> str:
    """Format research plan as readable markdown.

    Args:
        report: Output of create_research_plan() (or a bare {"message": ...}).

    Returns:
        Markdown string with one section per phase plus the expected outcome.
    """
    # Message-only reports (e.g. errors) are returned verbatim.
    if "message" in report and "plan" not in report:
        return report["message"]

    plan = report.get("plan", {})
    budget = report.get("budget", 0)
    goal = report.get("goal", "maximize primary metric")

    lines = [
        f"# Research Plan ({budget} experiments, goal: {goal or 'maximize metric'})",
        "",
    ]

    # Bug fix: a current best of 0.0 is a valid value — compare against None
    # instead of relying on truthiness.
    if report.get("current_best") is not None:
        lines.append(f"**Current best:** {report['primary_metric']}={report['current_best']}")
        lines.append("")

    # Phases are labelled A, B, C, ... in order.
    for index, phase in enumerate(plan.get("phases", [])):
        phase_label = chr(ord("A") + index)
        lines.append(f"## Phase {phase_label}: {phase['label']} ({phase['n_experiments']} experiments, {phase['budget_pct']}% of budget)")
        lines.append(f"*Rationale: {phase['rationale']}*")
        lines.append("")

        for exp in phase.get("experiments", []):
            lines.append(f"  {exp['number']}. {exp['description']}")

        lines.append("")

    expected = plan.get("expected_metric")
    gain = plan.get("expected_gain", 0)
    # Same None-vs-falsy fix for an expected metric of 0.0.
    if expected is not None:
        lines.append(f"**Expected outcome:** {report.get('primary_metric', 'metric')} {report.get('current_best', '?')} → {expected} (+{gain})")
    else:
        lines.append(f"**Expected gain:** +{gain}")

    lines.extend(["", f"*Generated: {report.get('generated_at', 'N/A')}*"])
    return "\n".join(lines)
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
# --- CLI ---
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def main():
    """CLI entry point: build a research plan, print it, and persist it."""
    arg_parser = argparse.ArgumentParser(
        description="Research planning assistant — strategic experiment campaign design"
    )
    arg_parser.add_argument("--budget", type=int, default=DEFAULT_BUDGET,
                            help="Total experiment budget")
    arg_parser.add_argument("--goal", help="Goal description (e.g., 'maximize F1 for production')")
    arg_parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    arg_parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    arg_parser.add_argument("--json", action="store_true", help="Output raw JSON")
    opts = arg_parser.parse_args()

    plan_report = create_research_plan(
        budget=opts.budget,
        goal=opts.goal,
        config_path=opts.config,
        log_path=opts.log,
    )

    # JSON mode emits the raw report; otherwise a markdown summary.
    if opts.json:
        print(json.dumps(plan_report, indent=2))
    else:
        print(format_plan_report(plan_report))

    if "error" not in plan_report:
        saved_path = save_plan_report(plan_report)
        if not opts.json:
            print(f"\nSaved: {saved_path}")


if __name__ == "__main__":
    main()
|
|
@@ -34,6 +34,8 @@ PLACEHOLDER_MAP = {
|
|
|
34
34
|
"ML_DIR": "ml_dir",
|
|
35
35
|
"DATA_SOURCE": "data_source",
|
|
36
36
|
"METRIC_DIRECTION": "metric_direction",
|
|
37
|
+
"LOWER_IS_BETTER": "lower_is_better",
|
|
38
|
+
"MEMORY_DIR_NAME": "memory_dir_name",
|
|
37
39
|
}
|
|
38
40
|
|
|
39
41
|
# Files to copy from templates/ to the ML directory
|
|
@@ -148,6 +150,9 @@ TEMPLATE_DIRS = {
|
|
|
148
150
|
"experiment_simulator.py",
|
|
149
151
|
"incremental_update.py",
|
|
150
152
|
"model_lifecycle.py",
|
|
153
|
+
"failure_postmortem.py",
|
|
154
|
+
"harness_doctor.py",
|
|
155
|
+
"research_planner.py",
|
|
151
156
|
],
|
|
152
157
|
"tests": ["__init__.py", "conftest.py"],
|
|
153
158
|
}
|
|
@@ -203,6 +208,9 @@ DIRECTORIES_TO_CREATE = [
|
|
|
203
208
|
"experiments/counterfactuals",
|
|
204
209
|
"experiments/simulations",
|
|
205
210
|
"experiments/updates",
|
|
211
|
+
"experiments/postmortems",
|
|
212
|
+
"experiments/doctor",
|
|
213
|
+
"experiments/plans",
|
|
206
214
|
"exports/model-cards",
|
|
207
215
|
"experiments/logs",
|
|
208
216
|
"models/best",
|
|
@@ -217,32 +225,49 @@ SHELL_SCRIPTS = [
|
|
|
217
225
|
|
|
218
226
|
def find_templates_dir() -> Path | None:
|
|
219
227
|
"""Locate the templates directory relative to this script or plugin root."""
|
|
220
|
-
|
|
228
|
+
env_templates_dir = os.environ.get("TURING_TEMPLATES_DIR")
|
|
229
|
+
if env_templates_dir:
|
|
230
|
+
candidate = Path(env_templates_dir).expanduser()
|
|
231
|
+
if (candidate / "prepare.py").exists():
|
|
232
|
+
return candidate
|
|
233
|
+
|
|
221
234
|
script_dir = Path(__file__).parent
|
|
222
235
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
# Search common plugin locations
|
|
235
|
-
home = Path.home()
|
|
236
|
-
for pattern in [
|
|
237
|
-
home / ".claude" / "plugins" / "*" / "templates",
|
|
236
|
+
project_command_templates = [
|
|
237
|
+
path / ".claude" / "commands" / "turing" / "templates"
|
|
238
|
+
for path in [Path.cwd(), *Path.cwd().parents]
|
|
239
|
+
]
|
|
240
|
+
|
|
241
|
+
for candidate in [
|
|
242
|
+
script_dir.parent,
|
|
243
|
+
script_dir.parent.parent / "templates",
|
|
244
|
+
*project_command_templates,
|
|
245
|
+
Path.home() / ".claude" / "commands" / "turing" / "templates",
|
|
246
|
+
Path.cwd() / "node_modules" / "claude-turing" / "templates",
|
|
238
247
|
]:
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
248
|
+
if (candidate / "prepare.py").exists():
|
|
249
|
+
return candidate
|
|
250
|
+
|
|
251
|
+
plugins_dir = Path.home() / ".claude" / "plugins"
|
|
252
|
+
for match in sorted(plugins_dir.glob("*/templates")):
|
|
253
|
+
if (match / "prepare.py").exists():
|
|
254
|
+
return match
|
|
242
255
|
|
|
243
256
|
return None
|
|
244
257
|
|
|
245
258
|
|
|
259
|
+
def derive_values(values: dict[str, str]) -> dict[str, str]:
    """Add scaffold values derived from user-provided fields.

    Computes:
      - lower_is_better: "true"/"false" string from metric_direction.
      - memory_dir_name: project_name with unsafe characters collapsed to "-".
    """
    derived = dict(values)
    metric_direction = derived.get("metric_direction", "")
    derived["lower_is_better"] = "true" if metric_direction.lower() == "lower" else "false"
    sanitized = re.sub(r"[^a-zA-Z0-9_.-]+", "-", derived["project_name"])
    derived["memory_dir_name"] = sanitized.strip("-")
    return derived
|
|
269
|
+
|
|
270
|
+
|
|
246
271
|
def replace_placeholders(text: str, values: dict[str, str]) -> str:
|
|
247
272
|
"""Replace all {{PLACEHOLDER}} markers in text with values."""
|
|
248
273
|
for placeholder, arg_name in PLACEHOLDER_MAP.items():
|
|
@@ -270,6 +295,7 @@ def scaffold_project(
|
|
|
270
295
|
Returns:
|
|
271
296
|
Dict with counts: files_copied, placeholders_replaced, dirs_created.
|
|
272
297
|
"""
|
|
298
|
+
values = derive_values(values)
|
|
273
299
|
target = Path(ml_dir)
|
|
274
300
|
target.mkdir(parents=True, exist_ok=True)
|
|
275
301
|
|
|
@@ -323,7 +349,7 @@ def scaffold_project(
|
|
|
323
349
|
continue
|
|
324
350
|
|
|
325
351
|
# Setup agent memory
|
|
326
|
-
memory_dir = Path(".claude") / "agent-memory" / "ml-researcher"
|
|
352
|
+
memory_dir = Path(".claude") / "agent-memory" / f"ml-researcher-{values['memory_dir_name']}"
|
|
327
353
|
memory_dir.mkdir(parents=True, exist_ok=True)
|
|
328
354
|
memory_src = templates_dir / "MEMORY.md"
|
|
329
355
|
if memory_src.exists():
|
|
@@ -342,6 +368,14 @@ def scaffold_project(
|
|
|
342
368
|
return stats
|
|
343
369
|
|
|
344
370
|
|
|
371
|
+
def make_command_hook_group(command: str, matcher: str = "") -> dict:
    """Build a Claude Code command hook group.

    Args:
        command: Shell command the hook runs.
        matcher: Tool-name matcher ("" matches everything).

    Returns:
        A hook-group dict in Claude Code settings format.
    """
    hook_entry = {"type": "command", "command": command}
    return {"matcher": matcher, "hooks": [hook_entry]}
|
|
377
|
+
|
|
378
|
+
|
|
345
379
|
def _setup_hooks(ml_dir: str) -> None:
|
|
346
380
|
"""Configure Claude Code hooks in .claude/settings.local.json."""
|
|
347
381
|
settings_path = Path(".claude") / "settings.local.json"
|
|
@@ -360,20 +394,14 @@ def _setup_hooks(ml_dir: str) -> None:
|
|
|
360
394
|
post_hooks = hooks.get("PostToolUse", [])
|
|
361
395
|
post_hook_cmd = f"bash {ml_dir}/scripts/post-train-hook.sh"
|
|
362
396
|
if not any(post_hook_cmd in str(h) for h in post_hooks):
|
|
363
|
-
post_hooks.append(
|
|
364
|
-
"matcher": "Bash",
|
|
365
|
-
"hooks": [{"type": "command", "command": post_hook_cmd}],
|
|
366
|
-
})
|
|
397
|
+
post_hooks.append(make_command_hook_group(post_hook_cmd, matcher="Bash"))
|
|
367
398
|
hooks["PostToolUse"] = post_hooks
|
|
368
399
|
|
|
369
400
|
# Stop hook for convergence
|
|
370
401
|
stop_hooks = hooks.get("Stop", [])
|
|
371
402
|
stop_hook_cmd = f"bash {ml_dir}/scripts/stop-hook.sh"
|
|
372
403
|
if not any(stop_hook_cmd in str(h) for h in stop_hooks):
|
|
373
|
-
stop_hooks.append(
|
|
374
|
-
"type": "command",
|
|
375
|
-
"command": stop_hook_cmd,
|
|
376
|
-
})
|
|
404
|
+
stop_hooks.append(make_command_hook_group(stop_hook_cmd))
|
|
377
405
|
hooks["Stop"] = stop_hooks
|
|
378
406
|
|
|
379
407
|
settings["hooks"] = hooks
|