claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
@@ -0,0 +1,389 @@
1
+ #!/usr/bin/env python3
2
+ """Research briefing generator for the autoresearch pipeline.
3
+
4
+ Produces a structured intelligence report from experiment history:
5
+ what's been learned, what's promising, what's exhausted, and what
6
+ the human should consider next.
7
+
8
+ This closes the taste-leverage loop: the agent reports intelligence,
9
+ the human applies taste, the human injects hypotheses, the agent
10
+ executes them.
11
+
12
+ Usage:
13
+ python scripts/generate_brief.py [--config config.yaml] [--log experiments/log.jsonl]
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import json
20
+ import sys
21
+ from datetime import datetime, timezone
22
+ from pathlib import Path
23
+
24
+ import yaml
25
+
26
+ from scripts.turing_io import load_config, load_experiments, load_hypotheses
27
+
28
+
29
def compute_campaign_summary(experiments: list[dict]) -> dict:
    """Summarize the campaign: totals, keep rate, and time span.

    Timestamps are compared lexicographically, which is correct for
    ISO-8601 formatted strings.
    """
    statuses = [e.get("status") for e in experiments]
    total = len(experiments)
    kept = statuses.count("kept")
    discarded = statuses.count("discarded")

    # Only experiments carrying a non-empty timestamp contribute to the span.
    timestamps = [ts for e in experiments if (ts := e.get("timestamp", ""))]

    return {
        "total": total,
        "kept": kept,
        "discarded": discarded,
        "keep_rate": round(kept / total, 2) if total > 0 else 0,
        "first_experiment": min(timestamps) if timestamps else None,
        "last_experiment": max(timestamps) if timestamps else None,
    }
52
+
53
+
54
+ def find_best(experiments: list[dict], metric: str, lower_is_better: bool) -> dict | None:
55
+ """Find best kept experiment by primary metric."""
56
+ best = None
57
+ best_val = float("inf") if lower_is_better else float("-inf")
58
+ for e in experiments:
59
+ if e.get("status") != "kept":
60
+ continue
61
+ val = e.get("metrics", {}).get(metric)
62
+ if val is None:
63
+ continue
64
+ if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
65
+ best_val = val
66
+ best = e
67
+ return best
68
+
69
+
70
def compute_trajectory(experiments: list[dict], metric: str, lower_is_better: bool) -> list[dict]:
    """Running best-so-far series over kept experiments.

    Every kept experiment that reports *metric* contributes one entry
    (its own value plus the best value seen up to and including it),
    so plateaus are visible in the series.
    """
    series: list[dict] = []
    running_best = float("inf") if lower_is_better else float("-inf")
    better = (lambda a, b: a < b) if lower_is_better else (lambda a, b: a > b)

    for exp in experiments:
        if exp.get("status") != "kept":
            continue
        value = exp.get("metrics", {}).get(metric)
        if value is None:
            continue
        if better(value, running_best):
            running_best = value
        series.append({
            "experiment_id": exp.get("experiment_id", "?"),
            "value": value,
            "best_so_far": running_best,
        })
    return series
88
+
89
+
90
def identify_model_types(experiments: list[dict]) -> dict[str, dict]:
    """Group experiments by config.model_type and count outcomes.

    Anything that is not "kept" counts as discarded here; experiments
    without a model_type fall under "unknown".
    """
    stats: dict[str, dict] = {}
    for exp in experiments:
        model = exp.get("config", {}).get("model_type", "unknown")
        entry = stats.setdefault(model, {"total": 0, "kept": 0, "discarded": 0})
        entry["total"] += 1
        if exp.get("status") == "kept":
            entry["kept"] += 1
        else:
            entry["discarded"] += 1
    return stats
106
+
107
+
108
def cluster_failures(experiments: list[dict]) -> list[dict]:
    """Identify patterns shared across failed (discarded) experiments.

    Groups discarded experiments by model type, family, and exact numeric
    hyperparameter values, then reports every trait shared by two or more
    failures.

    Fix: integer hyperparameters were previously labeled "param>=value",
    implying a threshold comparison that was never performed — grouping
    has always been by exact value. All numeric hyperparameters now use
    the honest "param=value" trait key.

    Returns:
        Up to five cluster dicts, largest first, each with keys:
        trait, count, experiments, description.
    """
    discarded = [e for e in experiments if e.get("status") == "discarded"]
    if len(discarded) < 2:
        # A pattern needs at least two members.
        return []

    clusters: dict[str, list[str]] = {}

    def tag(trait: str, exp_id: str) -> None:
        clusters.setdefault(trait, []).append(exp_id)

    for exp in discarded:
        exp_id = exp.get("experiment_id", "?")
        config = exp.get("config", {})

        # Cluster by model type.
        tag(f"model_type={config.get('model_type', 'unknown')}", exp_id)

        # Cluster by exact hyperparameter value (numeric params only).
        for param, value in config.get("hyperparams", {}).items():
            if isinstance(value, (int, float)):
                tag(f"{param}={value}", exp_id)

        # Cluster by experiment family, when recorded.
        family = exp.get("family")
        if family:
            tag(f"family={family}", exp_id)

    # Only traits shared by 2+ discarded experiments are patterns.
    result = [
        {
            "trait": trait,
            "count": len(ids),
            "experiments": ids,
            "description": f"{len(ids)} discarded experiments share: {trait}",
        }
        for trait, ids in clusters.items()
        if len(ids) >= 2
    ]
    # Largest clusters first; negative key keeps the sort stable on ties.
    result.sort(key=lambda c: -c["count"])
    return result[:5]  # Top 5 clusters — keep the brief readable.
159
+
160
+
161
def detect_environment_drift(experiments: list[dict]) -> list[str]:
    """Detect environment changes across experiments.

    Compares the environment of the most recent experiment against that
    of the most recent *kept* experiment, flagging Python-version changes,
    package-version changes, and a config-hash mismatch — any of which
    could affect reproducibility. Returns an empty list when there is
    nothing to compare.
    """
    if len(experiments) < 2:
        return []

    # Walk newest-to-oldest: first experiment with environment data is
    # "latest"; first kept one with environment data is the reference.
    latest = None
    reference = None
    for exp in reversed(experiments):
        if exp.get("environment"):
            if latest is None:
                latest = exp
            if reference is None and exp.get("status") == "kept":
                reference = exp
        if latest is not None and reference is not None:
            break

    if latest is None or reference is None or latest == reference:
        return []

    env_new = latest["environment"]
    env_old = reference["environment"]
    drift: list[str] = []

    # Python interpreter version.
    old_py = env_old.get("python_version")
    new_py = env_new.get("python_version")
    if new_py != old_py:
        drift.append(f"Python version changed: {old_py} -> {new_py}")

    # Package versions present in both snapshots.
    pkgs_new = env_new.get("packages", {})
    pkgs_old = env_old.get("packages", {})
    for pkg in set(pkgs_new) | set(pkgs_old):
        v_new = pkgs_new.get(pkg)
        v_old = pkgs_old.get(pkg)
        if v_new and v_old and v_new != v_old:
            drift.append(f"{pkg}: {v_old} -> {v_new}")

    # Configuration drift, when both snapshots recorded a hash.
    hash_new = env_new.get("config_hash")
    hash_old = env_old.get("config_hash")
    if hash_new and hash_old and hash_new != hash_old:
        drift.append("config.yaml has changed since best experiment")

    return drift
211
+
212
+
213
+ def format_brief(
214
+ campaign: dict,
215
+ best: dict | None,
216
+ trajectory: list[dict],
217
+ model_types: dict[str, dict],
218
+ hypotheses: list[dict],
219
+ metric: str,
220
+ lower_is_better: bool,
221
+ failure_clusters: list[dict] | None = None,
222
+ env_warnings: list[str] | None = None,
223
+ ) -> str:
224
+ """Format the research briefing as markdown."""
225
+ direction = "lower" if lower_is_better else "higher"
226
+ lines = [
227
+ "# Research Briefing",
228
+ "",
229
+ f"*Generated {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
230
+ "",
231
+ "---",
232
+ "",
233
+ "## Campaign Summary",
234
+ "",
235
+ f"| Metric | Value |",
236
+ f"|--------|-------|",
237
+ f"| Total experiments | {campaign['total']} |",
238
+ f"| Kept | {campaign['kept']} ({campaign['keep_rate']:.0%} keep rate) |",
239
+ f"| Discarded | {campaign['discarded']} |",
240
+ f"| Primary metric | {metric} ({direction} is better) |",
241
+ ]
242
+ if campaign["first_experiment"]:
243
+ lines.append(f"| First experiment | {campaign['first_experiment'][:19]} |")
244
+ if campaign["last_experiment"]:
245
+ lines.append(f"| Last experiment | {campaign['last_experiment'][:19]} |")
246
+
247
+ lines.extend(["", "## Current Best", ""])
248
+ if best:
249
+ metrics_str = ", ".join(f"{k}={v}" for k, v in best.get("metrics", {}).items())
250
+ lines.extend([
251
+ f"- **Experiment:** {best.get('experiment_id', '?')}",
252
+ f"- **Model:** {best.get('config', {}).get('model_type', '?')}",
253
+ f"- **Metrics:** {metrics_str}",
254
+ f"- **Description:** {best.get('description', 'N/A')}",
255
+ ])
256
+ else:
257
+ lines.append("No kept experiments yet.")
258
+
259
+ lines.extend(["", "## Improvement Trajectory", ""])
260
+ if trajectory:
261
+ lines.append(f"| Experiment | {metric} | Best So Far |")
262
+ lines.append(f"|------------|{'---' * len(metric)}--|-------------|")
263
+ for t in trajectory[-10:]: # Last 10
264
+ lines.append(f"| {t['experiment_id']} | {t['value']:.4f} | {t['best_so_far']:.4f} |")
265
+ if len(trajectory) > 10:
266
+ lines.insert(-10, f"*...showing last 10 of {len(trajectory)} kept experiments*")
267
+ else:
268
+ lines.append("No trajectory data yet.")
269
+
270
+ lines.extend(["", "## Model Types Explored", ""])
271
+ if model_types:
272
+ lines.append("| Model | Experiments | Kept | Discarded |")
273
+ lines.append("|-------|-------------|------|-----------|")
274
+ for mt, stats in sorted(model_types.items()):
275
+ lines.append(f"| {mt} | {stats['total']} | {stats['kept']} | {stats['discarded']} |")
276
+ else:
277
+ lines.append("No experiments yet.")
278
+
279
+ lines.extend(["", "## Hypothesis Queue", ""])
280
+ queued = [h for h in hypotheses if h.get("status") == "queued"]
281
+ tested = [h for h in hypotheses if h.get("status") in ("tested", "promising", "dead-end")]
282
+ if queued:
283
+ lines.append(f"**{len(queued)} queued:**")
284
+ for h in queued:
285
+ priority_marker = " (HIGH)" if h.get("priority") == "high" else ""
286
+ source_marker = " [human]" if h.get("source") == "human" else ""
287
+ lines.append(f"- {h['id']}: {h.get('description', '?')}{priority_marker}{source_marker}")
288
+ else:
289
+ lines.append("No queued hypotheses. Use `/turing:try` to inject ideas.")
290
+
291
+ if tested:
292
+ lines.extend(["", f"**{len(tested)} tested:**"])
293
+ for h in tested:
294
+ result = f" -> {h['result_experiment']}" if h.get("result_experiment") else ""
295
+ lines.append(f"- {h['id']}: {h.get('description', '?')} [{h.get('status')}]{result}")
296
+
297
+ # Failure patterns
298
+ if failure_clusters:
299
+ lines.extend(["", "## Failure Patterns", ""])
300
+ for cluster in failure_clusters:
301
+ exps = ", ".join(cluster["experiments"][:5])
302
+ lines.append(f"- **{cluster['trait']}** — {cluster['count']} discarded experiments ({exps})")
303
+ lines.append("")
304
+ lines.append("*Consider avoiding these traits in future experiments.*")
305
+
306
+ # Environment drift warnings
307
+ if env_warnings:
308
+ lines.extend(["", "## Environment Drift", ""])
309
+ lines.append("The runtime environment has changed since the best experiment:")
310
+ for w in env_warnings:
311
+ lines.append(f"- {w}")
312
+ lines.append("")
313
+ lines.append("*Results may not be directly comparable. Consider re-running the best experiment in the current environment.*")
314
+
315
+ lines.extend([
316
+ "",
317
+ "## Recommendations",
318
+ "",
319
+ "Based on experiment history:",
320
+ "",
321
+ ])
322
+
323
+ if not trajectory:
324
+ lines.append("- Run `/turing:train` to begin the experiment loop")
325
+ elif len(trajectory) < 3:
326
+ lines.append("- Too few experiments for meaningful recommendations. Continue training.")
327
+ else:
328
+ # Check if recent experiments are improving
329
+ recent = trajectory[-3:]
330
+ improving = recent[-1]["best_so_far"] != recent[0]["best_so_far"]
331
+ if improving:
332
+ lines.append("- Current direction is productive — continue exploring this model type")
333
+ else:
334
+ lines.append("- Improvement has plateaued — consider switching model type or feature approach")
335
+
336
+ # Check model diversity
337
+ if len(model_types) == 1:
338
+ lines.append("- Only one model type explored — try alternatives (LightGBM, RandomForest, MLP)")
339
+
340
+ # Check if hypotheses are exhausted
341
+ if not queued:
342
+ lines.append("- No hypotheses queued — inject ideas with `/turing:try`")
343
+
344
+ lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses. Use `/turing:train` to execute.*"])
345
+
346
+ return "\n".join(lines)
347
+
348
+
349
def generate_brief(
    config_path: str = "config.yaml",
    log_path: str = "experiments/log.jsonl",
    hypotheses_path: str = "hypotheses.yaml",
) -> str:
    """Load experiment history and render the markdown research briefing.

    Reads the metric name and direction from the config's evaluation
    section, then delegates all analysis to the compute_* helpers and
    formatting to format_brief().
    """
    eval_cfg = load_config(config_path).get("evaluation", {})
    metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(log_path)
    hypotheses = load_hypotheses(hypotheses_path)

    return format_brief(
        compute_campaign_summary(experiments),
        find_best(experiments, metric, lower_is_better),
        compute_trajectory(experiments, metric, lower_is_better),
        identify_model_types(experiments),
        hypotheses,
        metric,
        lower_is_better,
        cluster_failures(experiments),
        detect_environment_drift(experiments),
    )
374
+
375
+
376
def main() -> None:
    """CLI entry point: parse flags and print the briefing to stdout."""
    parser = argparse.ArgumentParser(description="Generate research briefing")
    for flag, default in (
        ("--config", "config.yaml"),
        ("--log", "experiments/log.jsonl"),
        ("--hypotheses", "hypotheses.yaml"),
    ):
        parser.add_argument(flag, default=default)
    args = parser.parse_args()

    print(generate_brief(args.config, args.log, args.hypotheses))
386
+
387
+
388
+ if __name__ == "__main__":
389
+ main()