claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
@@ -0,0 +1,423 @@
1
+ #!/usr/bin/env python3
2
+ """Research logbook generator for the autoresearch pipeline.
3
+
4
+ Generates a self-contained HTML logbook showing the full research
5
+ narrative: hypotheses proposed, experiments run, decisions made,
6
+ and progress over time. Designed to be shared with collaborators
7
+ or archived as a research artifact.
8
+
9
+ Usage:
10
+ python scripts/generate_logbook.py
11
+ python scripts/generate_logbook.py --output logbook.html
12
+ python scripts/generate_logbook.py --since 2026-03-01
13
+ python scripts/generate_logbook.py --format markdown --output logbook.md
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import json
20
+ import sys
21
+ from datetime import datetime, timezone
22
+ from pathlib import Path
23
+
24
+ import yaml
25
+
26
+ from scripts.turing_io import load_config, load_experiments, load_hypotheses
27
+
28
+
29
def load_decisions(decisions_dir: str) -> list[dict]:
    """Read every decision packet (``*.json``) under *decisions_dir*.

    A nonexistent directory yields an empty list. Files that cannot be
    read or do not parse as JSON are skipped silently. Packets are
    returned in sorted filename order.
    """
    root = Path(decisions_dir)
    if not root.exists():
        return []

    packets: list[dict] = []
    for packet_file in sorted(root.glob("*.json")):
        try:
            packets.append(json.loads(packet_file.read_text()))
        except (json.JSONDecodeError, OSError):
            # Best-effort load: a corrupt packet should not sink the logbook.
            continue
    return packets
42
+
43
+
44
def build_timeline(experiments: list[dict], hypotheses: list[dict]) -> list[dict]:
    """Merge experiments and hypotheses into one chronologically sorted event list.

    Each event carries a ``type`` of "experiment" or "hypothesis" plus the
    fields relevant to that kind; missing fields fall back to defaults.
    """
    timeline: list[dict] = [
        {
            "timestamp": exp.get("timestamp", ""),
            "type": "experiment",
            "id": exp.get("experiment_id", "?"),
            "description": exp.get("description", ""),
            "status": exp.get("status", "unknown"),
            "metrics": exp.get("metrics", {}),
            "config": exp.get("config", {}),
        }
        for exp in experiments
    ]
    timeline.extend(
        {
            "timestamp": hyp.get("created_at", ""),
            "type": "hypothesis",
            "id": hyp.get("id", "?"),
            "description": hyp.get("description", ""),
            "status": hyp.get("status", "queued"),
            "source": hyp.get("source", "unknown"),
            "priority": hyp.get("priority", "medium"),
        }
        for hyp in hypotheses
    )
    # Stable sort on the ISO timestamp string keeps insertion order for ties.
    timeline.sort(key=lambda event: event.get("timestamp", ""))
    return timeline
74
+
75
+
76
def compute_trajectory(experiments: list[dict], metric_name: str,
                       lower_is_better: bool = False) -> list[dict]:
    """Trace *metric_name* across experiments, tracking the running best.

    Experiments that lack a numeric value for the metric are skipped.
    Each point records the value, the best seen so far, and whether this
    experiment set a new best.
    """
    points: list[dict] = []
    running_best = None

    for exp in experiments:
        value = exp.get("metrics", {}).get(metric_name)
        if value is None or not isinstance(value, (int, float)):
            continue

        if running_best is None:
            improved = True
        elif lower_is_better:
            improved = value < running_best
        else:
            improved = value > running_best
        if improved:
            running_best = value

        points.append({
            "experiment_id": exp.get("experiment_id", "?"),
            "timestamp": exp.get("timestamp", ""),
            "value": value,
            "best_so_far": running_best,
            "is_new_best": improved,
            "status": exp.get("status", "unknown"),
        })

    return points
108
+
109
+
110
+ def generate_markdown(config: dict, experiments: list[dict],
111
+ hypotheses: list[dict], trajectory: list[dict],
112
+ since: str | None = None) -> str:
113
+ """Generate a markdown logbook."""
114
+ metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
115
+ task_desc = config.get("task", {}).get("description", "ML experiment campaign")
116
+ lower_is_better = config.get("evaluation", {}).get("lower_is_better", False)
117
+
118
+ lines = []
119
+ lines.append(f"# Research Logbook: {task_desc}")
120
+ lines.append(f"")
121
+ lines.append(f"Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
122
+ if since:
123
+ lines.append(f"Period: {since} to present")
124
+ lines.append("")
125
+
126
+ # Campaign summary
127
+ kept = [e for e in experiments if e.get("status") == "kept"]
128
+ discarded = [e for e in experiments if e.get("status") == "discarded"]
129
+ lines.append("## Campaign Summary")
130
+ lines.append("")
131
+ lines.append(f"- **Total experiments:** {len(experiments)}")
132
+ lines.append(f"- **Kept:** {len(kept)} ({len(kept)/len(experiments)*100:.0f}%)" if experiments else "- **Kept:** 0")
133
+ lines.append(f"- **Discarded:** {len(discarded)}")
134
+ lines.append(f"- **Hypotheses proposed:** {len(hypotheses)}")
135
+ human_hyps = [h for h in hypotheses if h.get("source") == "human"]
136
+ lines.append(f"- **Human-injected:** {len(human_hyps)}")
137
+ lines.append("")
138
+
139
+ # Best result
140
+ if trajectory:
141
+ best = max(trajectory, key=lambda t: -t["best_so_far"] if lower_is_better else t["best_so_far"])
142
+ lines.append("## Best Result")
143
+ lines.append("")
144
+ lines.append(f"- **{metric_name}:** {best['best_so_far']}")
145
+ lines.append(f"- **Experiment:** {best['experiment_id']}")
146
+ lines.append("")
147
+
148
+ # Improvement trajectory
149
+ if trajectory:
150
+ lines.append("## Improvement Trajectory")
151
+ lines.append("")
152
+ lines.append(f"| # | Experiment | {metric_name} | Best | New Best? |")
153
+ lines.append("|---|-----------|--------|------|-----------|")
154
+ for i, t in enumerate(trajectory, 1):
155
+ marker = "**yes**" if t["is_new_best"] else ""
156
+ lines.append(f"| {i} | {t['experiment_id']} | {t['value']:.4f} | {t['best_so_far']:.4f} | {marker} |")
157
+ lines.append("")
158
+
159
+ # Experiment log
160
+ lines.append("## Experiment Log")
161
+ lines.append("")
162
+ for exp in experiments:
163
+ status_icon = "kept" if exp.get("status") == "kept" else "discarded"
164
+ exp_id = exp.get("experiment_id", "?")
165
+ desc = exp.get("description", "No description")
166
+ metrics = exp.get("metrics", {})
167
+ metric_str = ", ".join(f"{k}={v:.4f}" for k, v in metrics.items()
168
+ if isinstance(v, (int, float)))
169
+
170
+ lines.append(f"### {exp_id} [{status_icon}]")
171
+ lines.append("")
172
+ lines.append(f"**Description:** {desc}")
173
+ lines.append(f"**Metrics:** {metric_str}")
174
+ ts = exp.get("timestamp", "")
175
+ if ts:
176
+ lines.append(f"**Time:** {ts[:19]}")
177
+ lines.append("")
178
+
179
+ # Hypothesis log
180
+ if hypotheses:
181
+ lines.append("## Hypothesis Queue")
182
+ lines.append("")
183
+ lines.append("| ID | Description | Source | Status | Priority |")
184
+ lines.append("|---|-----------|--------|--------|----------|")
185
+ for h in hypotheses:
186
+ lines.append(
187
+ f"| {h.get('id', '?')} "
188
+ f"| {h.get('description', '')[:60]} "
189
+ f"| {h.get('source', '?')} "
190
+ f"| {h.get('status', '?')} "
191
+ f"| {h.get('priority', '?')} |"
192
+ )
193
+ lines.append("")
194
+
195
+ return "\n".join(lines)
196
+
197
+
198
+ def generate_html(config: dict, experiments: list[dict],
199
+ hypotheses: list[dict], trajectory: list[dict],
200
+ since: str | None = None) -> str:
201
+ """Generate a self-contained HTML logbook."""
202
+ metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
203
+ task_desc = config.get("task", {}).get("description", "ML experiment campaign")
204
+ lower_is_better = config.get("evaluation", {}).get("lower_is_better", False)
205
+
206
+ kept = [e for e in experiments if e.get("status") == "kept"]
207
+ discarded = [e for e in experiments if e.get("status") == "discarded"]
208
+ keep_rate = f"{len(kept)/len(experiments)*100:.0f}" if experiments else "0"
209
+
210
+ best_metric = ""
211
+ best_exp = ""
212
+ if trajectory:
213
+ best = max(trajectory, key=lambda t: -t["best_so_far"] if lower_is_better else t["best_so_far"])
214
+ best_metric = f"{best['best_so_far']:.4f}"
215
+ best_exp = best["experiment_id"]
216
+
217
+ # Build trajectory data for the chart
218
+ traj_labels = json.dumps([t["experiment_id"] for t in trajectory])
219
+ traj_values = json.dumps([round(t["value"], 4) for t in trajectory])
220
+ traj_best = json.dumps([round(t["best_so_far"], 4) for t in trajectory])
221
+
222
+ # Build experiment rows
223
+ exp_rows = ""
224
+ for exp in experiments:
225
+ status = exp.get("status", "unknown")
226
+ cls = "kept" if status == "kept" else "discarded"
227
+ metrics = exp.get("metrics", {})
228
+ metric_val = metrics.get(metric_name, "—")
229
+ if isinstance(metric_val, float):
230
+ metric_val = f"{metric_val:.4f}"
231
+ exp_rows += f"""
232
+ <tr class="{cls}">
233
+ <td><code>{exp.get('experiment_id', '?')}</code></td>
234
+ <td>{exp.get('description', '')}</td>
235
+ <td>{metric_val}</td>
236
+ <td><span class="badge {cls}">{status}</span></td>
237
+ <td>{exp.get('timestamp', '')[:10]}</td>
238
+ </tr>"""
239
+
240
+ # Build hypothesis rows
241
+ hyp_rows = ""
242
+ for h in hypotheses:
243
+ status = h.get("status", "queued")
244
+ hyp_rows += f"""
245
+ <tr>
246
+ <td><code>{h.get('id', '?')}</code></td>
247
+ <td>{h.get('description', '')}</td>
248
+ <td><span class="badge source-{h.get('source', 'unknown')}">{h.get('source', '?')}</span></td>
249
+ <td><span class="badge status-{status}">{status}</span></td>
250
+ <td>{h.get('priority', '?')}</td>
251
+ </tr>"""
252
+
253
+ html = f"""<!DOCTYPE html>
254
+ <html lang="en">
255
+ <head>
256
+ <meta charset="UTF-8">
257
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
258
+ <title>Research Logbook — {task_desc}</title>
259
+ <style>
260
+ :root {{ --blue: #2563eb; --green: #16a34a; --red: #dc2626; --gray: #6b7280;
261
+ --bg: #f8fafc; --card: #fff; --border: #e2e8f0; }}
262
+ * {{ margin: 0; padding: 0; box-sizing: border-box; }}
263
+ body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
264
+ background: var(--bg); color: #1e293b; line-height: 1.6; }}
265
+ .container {{ max-width: 1100px; margin: 0 auto; padding: 2rem; }}
266
+ header {{ background: linear-gradient(135deg, #1e40af, #3b82f6); color: #fff;
267
+ padding: 2rem; border-radius: 12px; margin-bottom: 2rem; }}
268
+ header h1 {{ font-size: 1.8rem; margin-bottom: 0.5rem; }}
269
+ header p {{ opacity: 0.85; font-size: 0.95rem; }}
270
+ .stats {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
271
+ gap: 1rem; margin-bottom: 2rem; }}
272
+ .stat {{ background: var(--card); border: 1px solid var(--border); border-radius: 8px;
273
+ padding: 1.2rem; text-align: center; }}
274
+ .stat .value {{ font-size: 1.8rem; font-weight: 700; color: var(--blue); }}
275
+ .stat .label {{ font-size: 0.85rem; color: var(--gray); margin-top: 0.25rem; }}
276
+ .section {{ background: var(--card); border: 1px solid var(--border); border-radius: 8px;
277
+ padding: 1.5rem; margin-bottom: 1.5rem; }}
278
+ .section h2 {{ font-size: 1.2rem; margin-bottom: 1rem; color: #1e293b;
279
+ border-bottom: 2px solid var(--blue); padding-bottom: 0.5rem; }}
280
+ table {{ width: 100%; border-collapse: collapse; font-size: 0.9rem; }}
281
+ th {{ text-align: left; padding: 0.6rem; background: #f1f5f9; border-bottom: 2px solid var(--border); }}
282
+ td {{ padding: 0.6rem; border-bottom: 1px solid var(--border); }}
283
+ tr.kept td:first-child {{ border-left: 3px solid var(--green); }}
284
+ tr.discarded td:first-child {{ border-left: 3px solid var(--red); }}
285
+ .badge {{ display: inline-block; padding: 0.15rem 0.5rem; border-radius: 4px;
286
+ font-size: 0.75rem; font-weight: 600; }}
287
+ .badge.kept {{ background: #dcfce7; color: #166534; }}
288
+ .badge.discarded {{ background: #fee2e2; color: #991b1b; }}
289
+ .badge.source-human {{ background: #dbeafe; color: #1e40af; }}
290
+ .badge.source-agent {{ background: #f3e8ff; color: #6b21a8; }}
291
+ .badge.source-literature {{ background: #fef3c7; color: #92400e; }}
292
+ .badge.status-queued {{ background: #e0e7ff; color: #3730a3; }}
293
+ .badge.status-tested {{ background: #d1fae5; color: #065f46; }}
294
+ .badge.status-dead-end {{ background: #fee2e2; color: #991b1b; }}
295
+ .badge.status-promising {{ background: #dcfce7; color: #166534; }}
296
+ .badge.status-in-progress {{ background: #fef3c7; color: #92400e; }}
297
+ .chart {{ width: 100%; height: 250px; position: relative; }}
298
+ canvas {{ width: 100% !important; height: 100% !important; }}
299
+ code {{ background: #f1f5f9; padding: 0.1rem 0.3rem; border-radius: 3px; font-size: 0.85em; }}
300
+ .footer {{ text-align: center; color: var(--gray); font-size: 0.8rem; margin-top: 2rem; }}
301
+ @media print {{
302
+ body {{ background: #fff; }}
303
+ .container {{ max-width: none; padding: 1rem; }}
304
+ header {{ break-inside: avoid; }}
305
+ .section {{ break-inside: avoid; }}
306
+ }}
307
+ </style>
308
+ </head>
309
+ <body>
310
+ <div class="container">
311
+ <header>
312
+ <h1>{task_desc}</h1>
313
+ <p>Research Logbook &mdash; Generated {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}
314
+ {f' &mdash; Since {since}' if since else ''}</p>
315
+ </header>
316
+
317
+ <div class="stats">
318
+ <div class="stat"><div class="value">{len(experiments)}</div><div class="label">Experiments</div></div>
319
+ <div class="stat"><div class="value">{keep_rate}%</div><div class="label">Keep Rate</div></div>
320
+ <div class="stat"><div class="value">{best_metric}</div><div class="label">Best {metric_name}</div></div>
321
+ <div class="stat"><div class="value">{best_exp}</div><div class="label">Best Experiment</div></div>
322
+ <div class="stat"><div class="value">{len(hypotheses)}</div><div class="label">Hypotheses</div></div>
323
+ </div>
324
+
325
+ <div class="section">
326
+ <h2>Improvement Trajectory</h2>
327
+ <div class="chart"><canvas id="trajChart"></canvas></div>
328
+ </div>
329
+
330
+ <div class="section">
331
+ <h2>Experiment Log</h2>
332
+ <table>
333
+ <thead><tr><th>ID</th><th>Description</th><th>{metric_name}</th><th>Status</th><th>Date</th></tr></thead>
334
+ <tbody>{exp_rows}</tbody>
335
+ </table>
336
+ </div>
337
+
338
+ <div class="section">
339
+ <h2>Hypothesis Queue</h2>
340
+ <table>
341
+ <thead><tr><th>ID</th><th>Description</th><th>Source</th><th>Status</th><th>Priority</th></tr></thead>
342
+ <tbody>{hyp_rows if hyp_rows else '<tr><td colspan="5" style="text-align:center;color:var(--gray)">No hypotheses yet</td></tr>'}</tbody>
343
+ </table>
344
+ </div>
345
+
346
+ <div class="footer">
347
+ <p>Generated by Turing &mdash; Autonomous ML Research Harness</p>
348
+ </div>
349
+ </div>
350
+
351
+ <script src="https://cdn.jsdelivr.net/npm/chart.js@4/dist/chart.umd.min.js"></script>
352
+ <script>
353
+ const labels = {traj_labels};
354
+ const values = {traj_values};
355
+ const best = {traj_best};
356
+ if (labels.length > 0) {{
357
+ new Chart(document.getElementById('trajChart'), {{
358
+ type: 'line',
359
+ data: {{
360
+ labels,
361
+ datasets: [
362
+ {{ label: '{metric_name}', data: values, borderColor: '#3b82f6', backgroundColor: 'rgba(59,130,246,0.1)',
363
+ fill: true, tension: 0.3, pointRadius: 4 }},
364
+ {{ label: 'Best so far', data: best, borderColor: '#16a34a', borderDash: [5,5],
365
+ fill: false, tension: 0, pointRadius: 0 }}
366
+ ]
367
+ }},
368
+ options: {{
369
+ responsive: true, maintainAspectRatio: false,
370
+ plugins: {{ legend: {{ position: 'top' }} }},
371
+ scales: {{ y: {{ title: {{ display: true, text: '{metric_name}' }} }} }}
372
+ }}
373
+ }});
374
+ }}
375
+ </script>
376
+ </body>
377
+ </html>"""
378
+
379
+ return html
380
+
381
+
382
def main() -> None:
    """CLI entry point: parse args, load data, render, and write/print the logbook.

    Fix over the original: the `default_name` locals ("logbook.md"/
    "logbook.html") were assigned but never used — dead code removed.
    """
    parser = argparse.ArgumentParser(description="Generate research logbook")
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--hypotheses", default="hypotheses.yaml")
    parser.add_argument("--output", default=None, help="Output file path")
    parser.add_argument("--format", choices=["html", "markdown"], default="html")
    parser.add_argument("--since", default=None, help="Filter events after this date (YYYY-MM-DD)")
    args = parser.parse_args()

    config = load_config(args.config)
    experiments = load_experiments(args.log)
    hypotheses = load_hypotheses(args.hypotheses)

    # ISO-8601 timestamp strings compare correctly lexicographically.
    if args.since:
        experiments = [e for e in experiments
                       if e.get("timestamp", "") >= args.since]
        hypotheses = [h for h in hypotheses
                      if h.get("created_at", "") >= args.since]

    eval_cfg = config.get("evaluation", {})
    trajectory = compute_trajectory(
        experiments,
        eval_cfg.get("primary_metric", "accuracy"),
        eval_cfg.get("lower_is_better", False),
    )

    renderer = generate_markdown if args.format == "markdown" else generate_html
    output = renderer(config, experiments, hypotheses, trajectory, args.since)

    if args.output:
        target = Path(args.output)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(output)
        print(f"Logbook written to {args.output}")
    else:
        print(output)


if __name__ == "__main__":
    main()
@@ -0,0 +1,243 @@
1
+ """Experiment logging utility for the autoresearch pipeline.
2
+
3
+ The ground truth record of all experiments — kept and discarded alike.
4
+ Appends structured JSONL entries to experiments/log.jsonl with full
5
+ metadata: experiment_id, timestamp, git_commit, status, config, metrics,
6
+ model_path, and description.
7
+
8
+ Also maintains a TSV summary at experiments/results.tsv for quick reference.
9
+ Every experiment is logged. No information is lost.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import sys
17
+ from datetime import datetime, timezone
18
+ from pathlib import Path
19
+
20
+
21
+ def get_next_experiment_id(log_path: str) -> str:
22
+ """Get the next sequential experiment ID from the log.
23
+
24
+ Returns "exp-001" for empty/nonexistent log, "exp-NNN" otherwise.
25
+ """
26
+ path = Path(log_path)
27
+ if not path.exists() or path.stat().st_size == 0:
28
+ return "exp-001"
29
+
30
+ max_id = 0
31
+ with open(path) as f:
32
+ for line in f:
33
+ line = line.strip()
34
+ if not line:
35
+ continue
36
+ try:
37
+ entry = json.loads(line)
38
+ exp_id = entry.get("experiment_id", "")
39
+ if exp_id.startswith("exp-"):
40
+ num = int(exp_id.split("-")[1])
41
+ max_id = max(max_id, num)
42
+ except (json.JSONDecodeError, ValueError, IndexError):
43
+ continue
44
+
45
+ return f"exp-{max_id + 1:03d}"
46
+
47
+
48
+ def log_tsv_row(
49
+ tsv_path: str,
50
+ experiment_id: str,
51
+ status: str,
52
+ config: dict,
53
+ metrics: dict,
54
+ description: str,
55
+ ) -> None:
56
+ """Append a TSV row to the results summary file.
57
+
58
+ Creates header row if file doesn't exist or is empty.
59
+ Columns: experiment_id, status, model_type, primary_metric, description, timestamp
60
+ """
61
+ path = Path(tsv_path)
62
+ path.parent.mkdir(parents=True, exist_ok=True)
63
+
64
+ write_header = not path.exists() or path.stat().st_size == 0
65
+
66
+ # Build metric columns dynamically
67
+ metric_cols = "\t".join(str(v) for v in metrics.values())
68
+ metric_headers = "\t".join(metrics.keys())
69
+
70
+ row_data = [
71
+ experiment_id,
72
+ status,
73
+ config.get("model_type", "unknown"),
74
+ metric_cols,
75
+ description.replace("\t", " "),
76
+ datetime.now(timezone.utc).isoformat(),
77
+ ]
78
+
79
+ with open(path, "a") as f:
80
+ if write_header:
81
+ f.write(f"experiment_id\tstatus\tmodel_type\t{metric_headers}\tdescription\ttimestamp\n")
82
+ f.write("\t".join(row_data) + "\n")
83
+
84
+
85
+ def log_experiment(
86
+ log_path: str,
87
+ experiment_id: str,
88
+ config: dict,
89
+ metrics: dict,
90
+ model_path: str,
91
+ description: str,
92
+ status: str = "kept",
93
+ git_commit: str | None = None,
94
+ parent_experiment: str | None = None,
95
+ hypothesis_id: str | None = None,
96
+ family: str | None = None,
97
+ tags: list[str] | None = None,
98
+ ) -> None:
99
+ """Append one experiment entry to the JSONL log.
100
+
101
+ Args:
102
+ log_path: Path to experiments/log.jsonl.
103
+ experiment_id: e.g. "exp-001".
104
+ config: Dict with model_type, hyperparams, features.
105
+ metrics: Dict with metric values.
106
+ model_path: Path to saved model artifact.
107
+ description: Human-readable experiment description.
108
+ status: "kept" or "discarded".
109
+ git_commit: Optional git commit hash.
110
+ parent_experiment: Optional parent experiment ID (for dependency tree).
111
+ hypothesis_id: Optional hypothesis ID (links to hypotheses.yaml).
112
+ family: Optional experiment family for strategic grouping.
113
+ tags: Optional list of tags for categorization.
114
+ """
115
+ path = Path(log_path)
116
+ path.parent.mkdir(parents=True, exist_ok=True)
117
+
118
+ # Load environment snapshot from train_metadata.json if available
119
+ environment = None
120
+ metadata_path = Path("train_metadata.json")
121
+ if metadata_path.exists():
122
+ try:
123
+ with open(metadata_path) as mf:
124
+ meta = json.load(mf)
125
+ environment = meta.get("environment")
126
+ except (json.JSONDecodeError, OSError):
127
+ pass
128
+
129
+ entry = {
130
+ "experiment_id": experiment_id,
131
+ "timestamp": datetime.now(timezone.utc).isoformat(),
132
+ "git_commit": git_commit,
133
+ "status": status,
134
+ "parent_experiment": parent_experiment,
135
+ "hypothesis_id": hypothesis_id,
136
+ "family": family,
137
+ "tags": tags or [],
138
+ "config": config,
139
+ "metrics": metrics,
140
+ "model_path": model_path,
141
+ "description": description,
142
+ "environment": environment,
143
+ }
144
+
145
+ with open(path, "a") as f:
146
+ f.write(json.dumps(entry) + "\n")
147
+
148
+ # Also append TSV summary row
149
+ tsv_path = str(Path(log_path).parent / "results.tsv")
150
+ log_tsv_row(tsv_path, experiment_id, status, config, metrics, description)
151
+
152
+
153
+ def get_best_experiment(log_path: str, primary_metric: str = "accuracy", lower_is_better: bool = False) -> dict | None:
154
+ """Return the experiment with the best primary metric among 'keep' entries.
155
+
156
+ Args:
157
+ log_path: Path to experiments/log.jsonl.
158
+ primary_metric: Metric name to optimize.
159
+ lower_is_better: True for metrics like MAE/MSE, False for accuracy/F1.
160
+
161
+ Returns:
162
+ Best experiment dict, or None if no 'keep' entries exist.
163
+ """
164
+ path = Path(log_path)
165
+ if not path.exists():
166
+ return None
167
+
168
+ best = None
169
+ best_value = float("inf") if lower_is_better else float("-inf")
170
+
171
+ with open(path) as f:
172
+ for line in f:
173
+ line = line.strip()
174
+ if not line:
175
+ continue
176
+ try:
177
+ entry = json.loads(line)
178
+ except json.JSONDecodeError:
179
+ continue
180
+
181
+ if entry.get("status") != "kept":
182
+ continue
183
+
184
+ value = entry.get("metrics", {}).get(primary_metric)
185
+ if value is None:
186
+ continue
187
+
188
+ if lower_is_better and value < best_value:
189
+ best_value = value
190
+ best = entry
191
+ elif not lower_is_better and value > best_value:
192
+ best_value = value
193
+ best = entry
194
+
195
+ return best
196
+
197
+
198
+ def main() -> None:
199
+ """CLI entry point for logging experiments."""
200
+ parser = argparse.ArgumentParser(
201
+ description="Log an experiment to the append-only JSONL experiment log.",
202
+ )
203
+ parser.add_argument("log_path", help="Path to experiments/log.jsonl")
204
+ parser.add_argument("experiment_id", help='Experiment ID, e.g. "exp-001"')
205
+ parser.add_argument("status", help='"kept" or "discarded"')
206
+ parser.add_argument("metrics_json", help="Metrics as a JSON string")
207
+ parser.add_argument("config_json", help="Config as a JSON string")
208
+ parser.add_argument("model_path", help="Path to saved model artifact")
209
+ parser.add_argument(
210
+ "description",
211
+ nargs="*",
212
+ help="Human-readable experiment description",
213
+ )
214
+ parser.add_argument("--parent", default=None, help="Parent experiment ID")
215
+ parser.add_argument("--hypothesis", default=None, help="Hypothesis ID")
216
+ parser.add_argument("--family", default=None, help="Experiment family name")
217
+ parser.add_argument("--tags", default=None, help="Comma-separated tags")
218
+
219
+ args = parser.parse_args()
220
+
221
+ metrics = json.loads(args.metrics_json)
222
+ config = json.loads(args.config_json)
223
+ description = " ".join(args.description) if args.description else ""
224
+ tags = [t.strip() for t in args.tags.split(",")] if args.tags else None
225
+
226
+ log_experiment(
227
+ log_path=args.log_path,
228
+ experiment_id=args.experiment_id,
229
+ config=config,
230
+ metrics=metrics,
231
+ model_path=args.model_path,
232
+ description=description,
233
+ status=args.status,
234
+ parent_experiment=args.parent,
235
+ hypothesis_id=args.hypothesis,
236
+ family=args.family,
237
+ tags=tags,
238
+ )
239
+ print(f"Logged {args.experiment_id} ({args.status})")
240
+
241
+
242
+ if __name__ == "__main__":
243
+ main()