claude-turing 4.3.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,470 @@
1
+ #!/usr/bin/env python3
2
+ """Research planning assistant for the autoresearch pipeline.
3
+
4
+ Given the current project state, generates a strategic research plan
5
+ that allocates experiments across strategies by expected ROI. Operates
6
+ one level above individual hypotheses — designs campaigns.
7
+
8
+ Usage:
9
+ python scripts/research_planner.py --budget 20
10
+ python scripts/research_planner.py --budget 20 --goal "maximize F1"
11
+ python scripts/research_planner.py --json
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import sys
19
+ from datetime import datetime, timezone
20
+ from pathlib import Path
21
+
22
+ import numpy as np
23
+ import yaml
24
+
25
+ from scripts.turing_io import load_config, load_experiments
26
+
27
# Default location of the JSONL experiment log consumed by the planner.
DEFAULT_LOG_PATH = "experiments/log.jsonl"
# Default number of experiments to plan when --budget is not supplied.
DEFAULT_BUDGET = 20

# Strategy definitions with base ROI and experiment templates.
# - base_priority: prior weight before state-based adjustment (the weights
#   are normalized later, so only the ratios between strategies matter).
# - typical_gain: assumed metric improvement per experiment of this kind,
#   used to estimate a plan's expected outcome.
# - templates: experiment descriptions cycled through when a phase holds
#   more experiments than templates.
STRATEGIES = {
    "feature_engineering": {
        "label": "Feature Engineering",
        "base_priority": 0.4,
        "typical_gain": 0.005,
        "templates": [
            "Automated feature selection (top consensus features)",
            "Interaction feature generation",
            "Domain-specific feature engineering",
            "Feature ablation to prune dead weight",
        ],
    },
    "model_search": {
        "label": "Model Architecture Search",
        "base_priority": 0.25,
        "typical_gain": 0.003,
        "templates": [
            "Try alternative model family",
            "Hyperparameter optimization",
            "Architecture modification",
        ],
    },
    "ensemble": {
        "label": "Ensemble & Composition",
        "base_priority": 0.15,
        "typical_gain": 0.008,
        "templates": [
            "Build stacking ensemble from top diverse models",
            "Model soup from top checkpoints",
            "Pipeline stitch: swap preprocessing into ensemble",
        ],
    },
    "calibration": {
        "label": "Production Readiness",
        "base_priority": 0.1,
        "typical_gain": 0.001,
        "templates": [
            "Probability calibration (Platt/isotonic)",
            "Post-training quantization (INT8)",
            "Weight pruning (find sparsity knee point)",
            "Full seed study on final model",
        ],
    },
    "verification": {
        "label": "Verification & Documentation",
        "base_priority": 0.1,
        # Verification is expected to confirm, not improve, the metric.
        "typical_gain": 0.0,
        "templates": [
            "Reproduce final model",
            "Run methodology audit",
            "Generate model card",
        ],
    },
}
85
+
86
+
87
+ # --- ROI Analysis ---
88
+
89
+
90
def compute_family_roi(
    experiments: list[dict],
    primary_metric: str,
    lower_is_better: bool = False,
) -> dict[str, dict]:
    """Compute ROI (improvement per experiment) for each experiment family.

    Args:
        experiments: Experiment records; the family name is read from
            ``exp["family"]`` or ``exp["config"]["family"]``.
        primary_metric: Metric key to read from each experiment's metrics.
        lower_is_better: If True, improvement and best are measured downward.

    Returns:
        Dict of {family: {n_experiments, total_improvement,
        roi_per_experiment, exhausted, best_metric}}.
    """
    families: dict[str, dict] = {}

    for exp in experiments:
        family = exp.get("family", exp.get("config", {}).get("family", "unknown"))
        bucket = families.setdefault(family, {"experiments": [], "metrics": []})
        bucket["experiments"].append(exp)

        val = exp.get("metrics", {}).get(primary_metric)
        if val is not None:
            bucket["metrics"].append(float(val))

    result = {}
    for family, data in families.items():
        metrics = data["metrics"]
        n_exps = len(data["experiments"])

        if len(metrics) < 2:
            # Not enough signal to estimate ROI or detect saturation.
            roi = 0.0
            exhausted = False
        else:
            # Improvement is measured from the first logged value to the best.
            if lower_is_better:
                improvement = metrics[0] - min(metrics)
            else:
                improvement = max(metrics) - metrics[0]
            roi = improvement / n_exps if n_exps > 0 else 0.0

            # Exhausted if the last 3 experiments sit within a 0.002 band.
            recent = metrics[-3:]
            exhausted = len(recent) >= 3 and (max(recent) - min(recent)) < 0.002

        # BUG FIX: best_metric previously always used max(metrics), which
        # reported the WORST value when lower_is_better=True.
        if metrics:
            best = min(metrics) if lower_is_better else max(metrics)
        else:
            best = None

        result[family] = {
            "n_experiments": n_exps,
            # Spread of observed values (direction-agnostic magnitude).
            "total_improvement": round(float(max(metrics) - min(metrics)) if metrics else 0, 6),
            "roi_per_experiment": round(float(roi), 6),
            "exhausted": exhausted,
            "best_metric": round(float(best), 6) if best is not None else None,
        }

    return result
140
+
141
+
142
+ def adjust_priorities(
143
+ base_strategies: dict,
144
+ family_roi: dict[str, dict],
145
+ experiments: list[dict],
146
+ primary_metric: str,
147
+ goal: str | None = None,
148
+ ) -> dict[str, float]:
149
+ """Adjust strategy priorities based on project state.
150
+
151
+ Returns:
152
+ Dict of {strategy_name: adjusted_priority}.
153
+ """
154
+ priorities = {name: s["base_priority"] for name, s in base_strategies.items()}
155
+
156
+ n_total = len(experiments)
157
+
158
+ # Boost feature engineering if it has high ROI
159
+ fe_families = [f for f, data in family_roi.items()
160
+ if "feature" in f.lower() and not data["exhausted"]]
161
+ if fe_families:
162
+ priorities["feature_engineering"] *= 1.3
163
+
164
+ # Reduce model search if exhausted
165
+ model_families = [f for f, data in family_roi.items()
166
+ if ("tuning" in f.lower() or "architecture" in f.lower()) and data["exhausted"]]
167
+ if model_families:
168
+ priorities["model_search"] *= 0.5
169
+
170
+ # Boost ensemble if enough diverse models exist
171
+ n_kept = sum(1 for e in experiments if e.get("status") == "kept")
172
+ if n_kept >= 5:
173
+ priorities["ensemble"] *= 1.2
174
+
175
+ # Boost verification if many experiments done
176
+ if n_total >= 20:
177
+ priorities["verification"] *= 1.5
178
+
179
+ # Goal-based adjustments
180
+ if goal:
181
+ goal_lower = goal.lower()
182
+ if "production" in goal_lower or "deploy" in goal_lower:
183
+ priorities["calibration"] *= 2.0
184
+ priorities["verification"] *= 1.5
185
+ if "f1" in goal_lower or "accuracy" in goal_lower:
186
+ priorities["feature_engineering"] *= 1.2
187
+ priorities["ensemble"] *= 1.2
188
+
189
+ # Normalize
190
+ total = sum(priorities.values())
191
+ if total > 0:
192
+ priorities = {k: round(v / total, 3) for k, v in priorities.items()}
193
+
194
+ return priorities
195
+
196
+
197
+ # --- Plan Generation ---
198
+
199
+
200
def allocate_budget(
    priorities: dict[str, float],
    budget: int,
    min_per_strategy: int = 1,
) -> dict[str, int]:
    """Allocate experiment budget across strategies.

    Args:
        priorities: Strategy priorities (sum to ~1.0).
        budget: Total experiment budget.
        min_per_strategy: Minimum experiments per active strategy.

    Returns:
        Dict of {strategy: n_experiments}.
    """
    if budget <= 0:
        return dict.fromkeys(priorities, 0)

    allocation: dict[str, int] = {}
    remaining = budget

    # Greedy pass in descending-priority order: each strategy receives its
    # proportional share (at least min_per_strategy), capped by what's left.
    ranked = sorted(priorities.items(), key=lambda item: item[1], reverse=True)
    for strategy, priority in ranked:
        share = min(max(min_per_strategy, round(budget * priority)), remaining)
        allocation[strategy] = share
        remaining -= share
        if remaining <= 0:
            break

    # Rounding may leave budget unspent; hand it to the top strategy.
    if remaining > 0:
        top = max(priorities, key=priorities.get)
        allocation[top] = allocation.get(top, 0) + remaining

    return allocation
236
+
237
+
238
def generate_plan(
    allocation: dict[str, int],
    strategies: dict,
    family_roi: dict[str, dict],
    current_best: float | None = None,
    primary_metric: str = "accuracy",
) -> dict:
    """Generate a structured research plan from budget allocation.

    Args:
        allocation: {strategy: n_experiments} from allocate_budget().
        strategies: Strategy definitions (see STRATEGIES) supplying labels,
            templates, and typical per-experiment gains.
        family_roi: Per-family ROI stats, passed through to the rationale.
        current_best: Current best value of the primary metric, if known.
            A value of 0.0 is treated as a valid baseline.
        primary_metric: Name of the metric being optimized.

    Returns:
        Plan with phases, experiment descriptions, and expected outcome.
    """
    phases = []
    exp_counter = 1
    # Hoisted: total allocation is loop-invariant (was recomputed per phase).
    total_allocated = sum(allocation.values())

    for strategy_name, n_exps in allocation.items():
        if n_exps <= 0:
            continue

        strategy = strategies.get(strategy_name, {})
        templates = strategy.get("templates", [])
        typical_gain = strategy.get("typical_gain", 0)

        experiments = []
        for i in range(n_exps):
            # Cycle the templates when a phase holds more experiments
            # than there are template descriptions.
            template = templates[i % len(templates)] if templates else f"Experiment {exp_counter}"
            experiments.append({
                "number": exp_counter,
                "description": template,
            })
            exp_counter += 1

        pct = round(n_exps / total_allocated * 100) if total_allocated > 0 else 0

        phases.append({
            "name": strategy_name,
            "label": strategy.get("label", strategy_name),
            "n_experiments": n_exps,
            "budget_pct": pct,
            "rationale": _phase_rationale(strategy_name, family_roi),
            "experiments": experiments,
            "expected_gain": round(typical_gain * n_exps, 4),
        })

    # Expected outcome: current best plus the sum of per-phase gains.
    total_expected_gain = sum(p["expected_gain"] for p in phases)
    # BUG FIX: `if current_best` silently dropped a legitimate 0.0 baseline;
    # compare against None explicitly.
    expected_metric = (
        round(current_best + total_expected_gain, 4)
        if current_best is not None
        else None
    )

    return {
        "phases": phases,
        "total_experiments": exp_counter - 1,
        "expected_metric": expected_metric,
        "expected_gain": round(total_expected_gain, 4),
        "primary_metric": primary_metric,
    }
293
+
294
+
295
+ def _phase_rationale(strategy_name: str, family_roi: dict) -> str:
296
+ """Generate rationale for a phase allocation."""
297
+ rationales = {
298
+ "feature_engineering": "Highest ROI direction — feature improvements compound across models",
299
+ "model_search": "Explore alternative architectures for potential step-change improvement",
300
+ "ensemble": "Combine existing models for 1-3% improvement at zero additional training cost",
301
+ "calibration": "Required for production deployment — probability calibration and model compression",
302
+ "verification": "Final validation — reproduce results, audit methodology, generate model card",
303
+ }
304
+ return rationales.get(strategy_name, "Strategic allocation")
305
+
306
+
307
+ # --- Full Pipeline ---
308
+
309
+
310
def create_research_plan(
    budget: int = DEFAULT_BUDGET,
    goal: str | None = None,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
) -> dict:
    """Create a strategic research plan.

    Args:
        budget: Total experiment budget.
        goal: Optional goal description.
        config_path: Path to config.yaml.
        log_path: Path to experiment log.

    Returns:
        Research plan with phases, allocation, and expected outcome.
    """
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(log_path)

    if not experiments:
        # Cold start: no history to learn from, so spend the whole budget
        # on model search and tell the user how to begin.
        fallback_plan = generate_plan(
            {"model_search": budget},
            STRATEGIES, {},
            primary_metric=primary_metric,
        )
        return {
            "budget": budget,
            "goal": goal,
            "message": "No experiment history — start with /turing:train first",
            "plan": fallback_plan,
            "generated_at": datetime.now(timezone.utc).isoformat(),
        }

    # Per-family improvement-per-experiment statistics.
    family_roi = compute_family_roi(experiments, primary_metric, lower_is_better)

    # Best observed value of the primary metric, respecting direction.
    observed = [
        exp.get("metrics", {}).get(primary_metric)
        for exp in experiments
        if exp.get("metrics", {}).get(primary_metric) is not None
    ]
    if not observed:
        current_best = None
    elif lower_is_better:
        current_best = min(observed)
    else:
        current_best = max(observed)

    priorities = adjust_priorities(STRATEGIES, family_roi, experiments, primary_metric, goal)
    allocation = allocate_budget(priorities, budget)
    plan = generate_plan(allocation, STRATEGIES, family_roi, current_best, primary_metric)

    return {
        "budget": budget,
        "goal": goal,
        "current_best": current_best,
        "primary_metric": primary_metric,
        "n_experiments_so_far": len(experiments),
        "family_roi": family_roi,
        "priorities": priorities,
        "allocation": allocation,
        "plan": plan,
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
376
+
377
+
378
+ # --- Report Formatting ---
379
+
380
+
381
def save_plan_report(report: dict, output_dir: str = "experiments/plans") -> Path:
    """Save research plan to YAML.

    Writes ``plan-<UTC timestamp>.yaml`` under *output_dir* (created if
    missing) and returns the written path.
    """
    directory = Path(output_dir)
    directory.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    target = directory / f"plan-{stamp}.yaml"
    with target.open("w") as handle:
        yaml.dump(report, handle, default_flow_style=False, sort_keys=False)
    return target
390
+
391
+
392
def format_plan_report(report: dict) -> str:
    """Format research plan as readable markdown.

    Args:
        report: Output of create_research_plan() (or a bare message dict).

    Returns:
        Markdown string with phase sections and the expected outcome.
    """
    # A bare status message with no plan payload is returned verbatim.
    if "message" in report and "plan" not in report:
        return report["message"]

    plan = report.get("plan", {})
    budget = report.get("budget", 0)
    goal = report.get("goal", "maximize primary metric")

    lines = [
        f"# Research Plan ({budget} experiments, goal: {goal or 'maximize metric'})",
        "",
    ]

    # BUG FIX: truthiness check dropped a valid current best of 0.0;
    # compare against None explicitly.
    if report.get("current_best") is not None:
        lines.append(f"**Current best:** {report['primary_metric']}={report['current_best']}")
        lines.append("")

    phases = plan.get("phases", [])
    phase_label = "A"

    for phase in phases:
        lines.append(f"## Phase {phase_label}: {phase['label']} ({phase['n_experiments']} experiments, {phase['budget_pct']}% of budget)")
        lines.append(f"*Rationale: {phase['rationale']}*")
        lines.append("")

        for exp in phase.get("experiments", []):
            lines.append(f" {exp['number']}. {exp['description']}")

        lines.append("")
        # Phases are lettered A, B, C, ...
        phase_label = chr(ord(phase_label) + 1)

    expected = plan.get("expected_metric")
    gain = plan.get("expected_gain", 0)
    # BUG FIX: explicit None check — an expected metric of 0.0 is valid.
    if expected is not None:
        lines.append(f"**Expected outcome:** {report.get('primary_metric', 'metric')} {report.get('current_best', '?')} → {expected} (+{gain})")
    else:
        lines.append(f"**Expected gain:** +{gain}")

    lines.extend(["", f"*Generated: {report.get('generated_at', 'N/A')}*"])
    return "\n".join(lines)
433
+
434
+
435
+ # --- CLI ---
436
+
437
+
438
def main():
    """CLI entry point: parse arguments, generate a plan, print and persist it."""
    parser = argparse.ArgumentParser(
        description="Research planning assistant — strategic experiment campaign design"
    )
    parser.add_argument(
        "--budget",
        type=int,
        default=DEFAULT_BUDGET,
        help="Total experiment budget",
    )
    parser.add_argument("--goal", help="Goal description (e.g., 'maximize F1 for production')")
    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    report = create_research_plan(
        budget=args.budget,
        goal=args.goal,
        config_path=args.config,
        log_path=args.log,
    )

    # Raw JSON for machine consumers, markdown for humans.
    rendered = json.dumps(report, indent=2) if args.json else format_plan_report(report)
    print(rendered)

    # Persist the plan unless the pipeline reported an error.
    if "error" not in report:
        saved_path = save_plan_report(report)
        if not args.json:
            print(f"\nSaved: {saved_path}")


if __name__ == "__main__":
    main()
@@ -148,6 +148,9 @@ TEMPLATE_DIRS = {
148
148
  "experiment_simulator.py",
149
149
  "incremental_update.py",
150
150
  "model_lifecycle.py",
151
+ "failure_postmortem.py",
152
+ "harness_doctor.py",
153
+ "research_planner.py",
151
154
  ],
152
155
  "tests": ["__init__.py", "conftest.py"],
153
156
  }
@@ -203,6 +206,9 @@ DIRECTORIES_TO_CREATE = [
203
206
  "experiments/counterfactuals",
204
207
  "experiments/simulations",
205
208
  "experiments/updates",
209
+ "experiments/postmortems",
210
+ "experiments/doctor",
211
+ "experiments/plans",
206
212
  "exports/model-cards",
207
213
  "experiments/logs",
208
214
  "models/best",