claude-turing 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +34 -0
- package/LICENSE +21 -0
- package/README.md +457 -0
- package/agents/ml-evaluator.md +43 -0
- package/agents/ml-researcher.md +74 -0
- package/bin/cli.js +46 -0
- package/bin/turing-init.sh +57 -0
- package/commands/brief.md +83 -0
- package/commands/compare.md +24 -0
- package/commands/design.md +97 -0
- package/commands/init.md +123 -0
- package/commands/logbook.md +51 -0
- package/commands/mode.md +43 -0
- package/commands/poster.md +89 -0
- package/commands/preflight.md +75 -0
- package/commands/report.md +97 -0
- package/commands/rules/loop-protocol.md +91 -0
- package/commands/status.md +24 -0
- package/commands/suggest.md +95 -0
- package/commands/sweep.md +45 -0
- package/commands/train.md +66 -0
- package/commands/try.md +63 -0
- package/commands/turing.md +54 -0
- package/commands/validate.md +34 -0
- package/config/defaults.yaml +45 -0
- package/config/experiment_archetypes.yaml +127 -0
- package/config/lifecycle.toml +31 -0
- package/config/novelty_aliases.yaml +107 -0
- package/config/relationships.toml +125 -0
- package/config/state.toml +24 -0
- package/config/task_taxonomy.yaml +110 -0
- package/config/taxonomy.toml +37 -0
- package/package.json +54 -0
- package/src/claude-md.js +55 -0
- package/src/install.js +107 -0
- package/src/paths.js +20 -0
- package/src/postinstall.js +22 -0
- package/src/verify.js +109 -0
- package/templates/MEMORY.md +36 -0
- package/templates/README.md +93 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/config.yaml +48 -0
- package/templates/evaluate.py +237 -0
- package/templates/features/__init__.py +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/features/featurizers.py +138 -0
- package/templates/prepare.py +171 -0
- package/templates/program.md +216 -0
- package/templates/pyproject.toml +8 -0
- package/templates/requirements.txt +8 -0
- package/templates/scripts/__init__.py +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/check_convergence.py +230 -0
- package/templates/scripts/compare_runs.py +124 -0
- package/templates/scripts/critique_hypothesis.py +350 -0
- package/templates/scripts/experiment_index.py +288 -0
- package/templates/scripts/generate_brief.py +389 -0
- package/templates/scripts/generate_logbook.py +423 -0
- package/templates/scripts/log_experiment.py +243 -0
- package/templates/scripts/manage_hypotheses.py +543 -0
- package/templates/scripts/novelty_guard.py +343 -0
- package/templates/scripts/parse_metrics.py +139 -0
- package/templates/scripts/post-train-hook.sh +74 -0
- package/templates/scripts/preflight.py +549 -0
- package/templates/scripts/scaffold.py +409 -0
- package/templates/scripts/show_environment.py +92 -0
- package/templates/scripts/show_experiment_tree.py +144 -0
- package/templates/scripts/show_families.py +133 -0
- package/templates/scripts/show_metrics.py +157 -0
- package/templates/scripts/statistical_compare.py +259 -0
- package/templates/scripts/stop-hook.sh +34 -0
- package/templates/scripts/suggest_next.py +301 -0
- package/templates/scripts/sweep.py +276 -0
- package/templates/scripts/synthesize_decision.py +300 -0
- package/templates/scripts/turing_io.py +76 -0
- package/templates/scripts/update_state.py +296 -0
- package/templates/scripts/validate_stability.py +167 -0
- package/templates/scripts/verify_placeholders.py +119 -0
- package/templates/sweep_config.yaml +14 -0
- package/templates/tests/__init__.py +0 -0
- package/templates/tests/conftest.py +91 -0
- package/templates/train.py +240 -0
package/templates/scripts/generate_logbook.py
@@ -0,0 +1,423 @@

```python
#!/usr/bin/env python3
"""Research logbook generator for the autoresearch pipeline.

Generates a self-contained HTML logbook showing the full research
narrative: hypotheses proposed, experiments run, decisions made,
and progress over time. Designed to be shared with collaborators
or archived as a research artifact.

Usage:
    python scripts/generate_logbook.py
    python scripts/generate_logbook.py --output logbook.html
    python scripts/generate_logbook.py --since 2026-03-01
    python scripts/generate_logbook.py --format markdown --output logbook.md
"""

from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml

from scripts.turing_io import load_config, load_experiments, load_hypotheses


def load_decisions(decisions_dir: str) -> list[dict]:
    """Load decision packets."""
    path = Path(decisions_dir)
    if not path.exists():
        return []
    decisions = []
    for f in sorted(path.glob("*.json")):
        try:
            with open(f) as fh:
                decisions.append(json.load(fh))
        except (json.JSONDecodeError, OSError):
            continue
    return decisions


def build_timeline(experiments: list[dict], hypotheses: list[dict]) -> list[dict]:
    """Build a unified timeline of events from experiments and hypotheses."""
    events = []

    for exp in experiments:
        ts = exp.get("timestamp", "")
        events.append({
            "timestamp": ts,
            "type": "experiment",
            "id": exp.get("experiment_id", "?"),
            "description": exp.get("description", ""),
            "status": exp.get("status", "unknown"),
            "metrics": exp.get("metrics", {}),
            "config": exp.get("config", {}),
        })

    for hyp in hypotheses:
        ts = hyp.get("created_at", "")
        events.append({
            "timestamp": ts,
            "type": "hypothesis",
            "id": hyp.get("id", "?"),
            "description": hyp.get("description", ""),
            "status": hyp.get("status", "queued"),
            "source": hyp.get("source", "unknown"),
            "priority": hyp.get("priority", "medium"),
        })

    events.sort(key=lambda e: e.get("timestamp", ""))
    return events


def compute_trajectory(experiments: list[dict], metric_name: str,
                       lower_is_better: bool = False) -> list[dict]:
    """Compute the improvement trajectory over time."""
    trajectory = []
    best_val = None

    for exp in experiments:
        val = exp.get("metrics", {}).get(metric_name)
        if val is None or not isinstance(val, (int, float)):
            continue

        is_new_best = False
        if best_val is None:
            best_val = val
            is_new_best = True
        elif lower_is_better and val < best_val:
            best_val = val
            is_new_best = True
        elif not lower_is_better and val > best_val:
            best_val = val
            is_new_best = True

        trajectory.append({
            "experiment_id": exp.get("experiment_id", "?"),
            "timestamp": exp.get("timestamp", ""),
            "value": val,
            "best_so_far": best_val,
            "is_new_best": is_new_best,
            "status": exp.get("status", "unknown"),
        })

    return trajectory


def generate_markdown(config: dict, experiments: list[dict],
                      hypotheses: list[dict], trajectory: list[dict],
                      since: str | None = None) -> str:
    """Generate a markdown logbook."""
    metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
    task_desc = config.get("task", {}).get("description", "ML experiment campaign")
    lower_is_better = config.get("evaluation", {}).get("lower_is_better", False)

    lines = []
    lines.append(f"# Research Logbook: {task_desc}")
    lines.append("")
    lines.append(f"Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    if since:
        lines.append(f"Period: {since} to present")
    lines.append("")

    # Campaign summary
    kept = [e for e in experiments if e.get("status") == "kept"]
    discarded = [e for e in experiments if e.get("status") == "discarded"]
    lines.append("## Campaign Summary")
    lines.append("")
    lines.append(f"- **Total experiments:** {len(experiments)}")
    lines.append(f"- **Kept:** {len(kept)} ({len(kept)/len(experiments)*100:.0f}%)" if experiments else "- **Kept:** 0")
    lines.append(f"- **Discarded:** {len(discarded)}")
    lines.append(f"- **Hypotheses proposed:** {len(hypotheses)}")
    human_hyps = [h for h in hypotheses if h.get("source") == "human"]
    lines.append(f"- **Human-injected:** {len(human_hyps)}")
    lines.append("")

    # Best result
    if trajectory:
        best = max(trajectory, key=lambda t: -t["best_so_far"] if lower_is_better else t["best_so_far"])
        lines.append("## Best Result")
        lines.append("")
        lines.append(f"- **{metric_name}:** {best['best_so_far']}")
        lines.append(f"- **Experiment:** {best['experiment_id']}")
        lines.append("")

    # Improvement trajectory
    if trajectory:
        lines.append("## Improvement Trajectory")
        lines.append("")
        lines.append(f"| # | Experiment | {metric_name} | Best | New Best? |")
        lines.append("|---|-----------|--------|------|-----------|")
        for i, t in enumerate(trajectory, 1):
            marker = "**yes**" if t["is_new_best"] else ""
            lines.append(f"| {i} | {t['experiment_id']} | {t['value']:.4f} | {t['best_so_far']:.4f} | {marker} |")
        lines.append("")

    # Experiment log
    lines.append("## Experiment Log")
    lines.append("")
    for exp in experiments:
        status_icon = "kept" if exp.get("status") == "kept" else "discarded"
        exp_id = exp.get("experiment_id", "?")
        desc = exp.get("description", "No description")
        metrics = exp.get("metrics", {})
        metric_str = ", ".join(f"{k}={v:.4f}" for k, v in metrics.items()
                               if isinstance(v, (int, float)))

        lines.append(f"### {exp_id} [{status_icon}]")
        lines.append("")
        lines.append(f"**Description:** {desc}")
        lines.append(f"**Metrics:** {metric_str}")
        ts = exp.get("timestamp", "")
        if ts:
            lines.append(f"**Time:** {ts[:19]}")
        lines.append("")

    # Hypothesis log
    if hypotheses:
        lines.append("## Hypothesis Queue")
        lines.append("")
        lines.append("| ID | Description | Source | Status | Priority |")
        lines.append("|---|-----------|--------|--------|----------|")
        for h in hypotheses:
            lines.append(
                f"| {h.get('id', '?')} "
                f"| {h.get('description', '')[:60]} "
                f"| {h.get('source', '?')} "
                f"| {h.get('status', '?')} "
                f"| {h.get('priority', '?')} |"
            )
        lines.append("")

    return "\n".join(lines)


def generate_html(config: dict, experiments: list[dict],
                  hypotheses: list[dict], trajectory: list[dict],
                  since: str | None = None) -> str:
    """Generate a self-contained HTML logbook."""
    metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
    task_desc = config.get("task", {}).get("description", "ML experiment campaign")
    lower_is_better = config.get("evaluation", {}).get("lower_is_better", False)

    kept = [e for e in experiments if e.get("status") == "kept"]
    discarded = [e for e in experiments if e.get("status") == "discarded"]
    keep_rate = f"{len(kept)/len(experiments)*100:.0f}" if experiments else "0"

    best_metric = ""
    best_exp = ""
    if trajectory:
        best = max(trajectory, key=lambda t: -t["best_so_far"] if lower_is_better else t["best_so_far"])
        best_metric = f"{best['best_so_far']:.4f}"
        best_exp = best["experiment_id"]

    # Build trajectory data for the chart
    traj_labels = json.dumps([t["experiment_id"] for t in trajectory])
    traj_values = json.dumps([round(t["value"], 4) for t in trajectory])
    traj_best = json.dumps([round(t["best_so_far"], 4) for t in trajectory])

    # Build experiment rows
    exp_rows = ""
    for exp in experiments:
        status = exp.get("status", "unknown")
        cls = "kept" if status == "kept" else "discarded"
        metrics = exp.get("metrics", {})
        metric_val = metrics.get(metric_name, "—")
        if isinstance(metric_val, float):
            metric_val = f"{metric_val:.4f}"
        exp_rows += f"""
        <tr class="{cls}">
            <td><code>{exp.get('experiment_id', '?')}</code></td>
            <td>{exp.get('description', '')}</td>
            <td>{metric_val}</td>
            <td><span class="badge {cls}">{status}</span></td>
            <td>{exp.get('timestamp', '')[:10]}</td>
        </tr>"""

    # Build hypothesis rows
    hyp_rows = ""
    for h in hypotheses:
        status = h.get("status", "queued")
        hyp_rows += f"""
        <tr>
            <td><code>{h.get('id', '?')}</code></td>
            <td>{h.get('description', '')}</td>
            <td><span class="badge source-{h.get('source', 'unknown')}">{h.get('source', '?')}</span></td>
            <td><span class="badge status-{status}">{status}</span></td>
            <td>{h.get('priority', '?')}</td>
        </tr>"""

    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Research Logbook — {task_desc}</title>
<style>
:root {{ --blue: #2563eb; --green: #16a34a; --red: #dc2626; --gray: #6b7280;
         --bg: #f8fafc; --card: #fff; --border: #e2e8f0; }}
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
        background: var(--bg); color: #1e293b; line-height: 1.6; }}
.container {{ max-width: 1100px; margin: 0 auto; padding: 2rem; }}
header {{ background: linear-gradient(135deg, #1e40af, #3b82f6); color: #fff;
          padding: 2rem; border-radius: 12px; margin-bottom: 2rem; }}
header h1 {{ font-size: 1.8rem; margin-bottom: 0.5rem; }}
header p {{ opacity: 0.85; font-size: 0.95rem; }}
.stats {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
          gap: 1rem; margin-bottom: 2rem; }}
.stat {{ background: var(--card); border: 1px solid var(--border); border-radius: 8px;
         padding: 1.2rem; text-align: center; }}
.stat .value {{ font-size: 1.8rem; font-weight: 700; color: var(--blue); }}
.stat .label {{ font-size: 0.85rem; color: var(--gray); margin-top: 0.25rem; }}
.section {{ background: var(--card); border: 1px solid var(--border); border-radius: 8px;
            padding: 1.5rem; margin-bottom: 1.5rem; }}
.section h2 {{ font-size: 1.2rem; margin-bottom: 1rem; color: #1e293b;
               border-bottom: 2px solid var(--blue); padding-bottom: 0.5rem; }}
table {{ width: 100%; border-collapse: collapse; font-size: 0.9rem; }}
th {{ text-align: left; padding: 0.6rem; background: #f1f5f9; border-bottom: 2px solid var(--border); }}
td {{ padding: 0.6rem; border-bottom: 1px solid var(--border); }}
tr.kept td:first-child {{ border-left: 3px solid var(--green); }}
tr.discarded td:first-child {{ border-left: 3px solid var(--red); }}
.badge {{ display: inline-block; padding: 0.15rem 0.5rem; border-radius: 4px;
          font-size: 0.75rem; font-weight: 600; }}
.badge.kept {{ background: #dcfce7; color: #166534; }}
.badge.discarded {{ background: #fee2e2; color: #991b1b; }}
.badge.source-human {{ background: #dbeafe; color: #1e40af; }}
.badge.source-agent {{ background: #f3e8ff; color: #6b21a8; }}
.badge.source-literature {{ background: #fef3c7; color: #92400e; }}
.badge.status-queued {{ background: #e0e7ff; color: #3730a3; }}
.badge.status-tested {{ background: #d1fae5; color: #065f46; }}
.badge.status-dead-end {{ background: #fee2e2; color: #991b1b; }}
.badge.status-promising {{ background: #dcfce7; color: #166534; }}
.badge.status-in-progress {{ background: #fef3c7; color: #92400e; }}
.chart {{ width: 100%; height: 250px; position: relative; }}
canvas {{ width: 100% !important; height: 100% !important; }}
code {{ background: #f1f5f9; padding: 0.1rem 0.3rem; border-radius: 3px; font-size: 0.85em; }}
.footer {{ text-align: center; color: var(--gray); font-size: 0.8rem; margin-top: 2rem; }}
@media print {{
  body {{ background: #fff; }}
  .container {{ max-width: none; padding: 1rem; }}
  header {{ break-inside: avoid; }}
  .section {{ break-inside: avoid; }}
}}
</style>
</head>
<body>
<div class="container">
<header>
<h1>{task_desc}</h1>
<p>Research Logbook — Generated {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}
{f' — Since {since}' if since else ''}</p>
</header>

<div class="stats">
<div class="stat"><div class="value">{len(experiments)}</div><div class="label">Experiments</div></div>
<div class="stat"><div class="value">{keep_rate}%</div><div class="label">Keep Rate</div></div>
<div class="stat"><div class="value">{best_metric}</div><div class="label">Best {metric_name}</div></div>
<div class="stat"><div class="value">{best_exp}</div><div class="label">Best Experiment</div></div>
<div class="stat"><div class="value">{len(hypotheses)}</div><div class="label">Hypotheses</div></div>
</div>

<div class="section">
<h2>Improvement Trajectory</h2>
<div class="chart"><canvas id="trajChart"></canvas></div>
</div>

<div class="section">
<h2>Experiment Log</h2>
<table>
<thead><tr><th>ID</th><th>Description</th><th>{metric_name}</th><th>Status</th><th>Date</th></tr></thead>
<tbody>{exp_rows}</tbody>
</table>
</div>

<div class="section">
<h2>Hypothesis Queue</h2>
<table>
<thead><tr><th>ID</th><th>Description</th><th>Source</th><th>Status</th><th>Priority</th></tr></thead>
<tbody>{hyp_rows if hyp_rows else '<tr><td colspan="5" style="text-align:center;color:var(--gray)">No hypotheses yet</td></tr>'}</tbody>
</table>
</div>

<div class="footer">
<p>Generated by Turing — Autonomous ML Research Harness</p>
</div>
</div>

<script src="https://cdn.jsdelivr.net/npm/chart.js@4/dist/chart.umd.min.js"></script>
<script>
const labels = {traj_labels};
const values = {traj_values};
const best = {traj_best};
if (labels.length > 0) {{
  new Chart(document.getElementById('trajChart'), {{
    type: 'line',
    data: {{
      labels,
      datasets: [
        {{ label: '{metric_name}', data: values, borderColor: '#3b82f6', backgroundColor: 'rgba(59,130,246,0.1)',
           fill: true, tension: 0.3, pointRadius: 4 }},
        {{ label: 'Best so far', data: best, borderColor: '#16a34a', borderDash: [5,5],
           fill: false, tension: 0, pointRadius: 0 }}
      ]
    }},
    options: {{
      responsive: true, maintainAspectRatio: false,
      plugins: {{ legend: {{ position: 'top' }} }},
      scales: {{ y: {{ title: {{ display: true, text: '{metric_name}' }} }} }}
    }}
  }});
}}
</script>
</body>
</html>"""

    return html


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate research logbook")
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--hypotheses", default="hypotheses.yaml")
    parser.add_argument("--output", default=None, help="Output file path")
    parser.add_argument("--format", choices=["html", "markdown"], default="html")
    parser.add_argument("--since", default=None, help="Filter events after this date (YYYY-MM-DD)")
    args = parser.parse_args()

    config = load_config(args.config)
    experiments = load_experiments(args.log)
    hypotheses = load_hypotheses(args.hypotheses)

    if args.since:
        experiments = [e for e in experiments
                       if e.get("timestamp", "") >= args.since]
        hypotheses = [h for h in hypotheses
                      if h.get("created_at", "") >= args.since]

    metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
    lower_is_better = config.get("evaluation", {}).get("lower_is_better", False)
    trajectory = compute_trajectory(experiments, metric_name, lower_is_better)

    if args.format == "markdown":
        output = generate_markdown(config, experiments, hypotheses, trajectory, args.since)
        default_name = "logbook.md"
    else:
        output = generate_html(config, experiments, hypotheses, trajectory, args.since)
        default_name = "logbook.html"

    if args.output:
        Path(args.output).parent.mkdir(parents=True, exist_ok=True)
        with open(args.output, "w") as f:
            f.write(output)
        print(f"Logbook written to {args.output}")
    else:
        print(output)


if __name__ == "__main__":
    main()
```
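For orientation, here is a minimal sketch of driving the two core helpers above directly from Python. The experiment records and config values are illustrative, not taken from the package, and the import assumes the `scripts` package is importable from the project root (the templates ship a `scripts/__init__.py`).

```python
# Illustrative only: hand-built records using the field names the code above reads.
from scripts.generate_logbook import compute_trajectory, generate_markdown

experiments = [
    {"experiment_id": "exp-001", "timestamp": "2026-03-01T10:00:00+00:00",
     "status": "kept", "description": "baseline", "metrics": {"accuracy": 0.81}},
    {"experiment_id": "exp-002", "timestamp": "2026-03-02T10:00:00+00:00",
     "status": "discarded", "description": "wider model", "metrics": {"accuracy": 0.79}},
]
config = {"task": {"description": "demo task"},
          "evaluation": {"primary_metric": "accuracy", "lower_is_better": False}}

# compute_trajectory tracks the running best; exp-002 is not a new best here.
trajectory = compute_trajectory(experiments, "accuracy", lower_is_better=False)
print(generate_markdown(config, experiments, [], trajectory))
```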
package/templates/scripts/log_experiment.py
@@ -0,0 +1,243 @@

```python
"""Experiment logging utility for the autoresearch pipeline.

The ground truth record of all experiments — kept and discarded alike.
Appends structured JSONL entries to experiments/log.jsonl with full
metadata: experiment_id, timestamp, git_commit, status, config, metrics,
model_path, and description.

Also maintains a TSV summary at experiments/results.tsv for quick reference.
Every experiment is logged. No information is lost.
"""

from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path


def get_next_experiment_id(log_path: str) -> str:
    """Get the next sequential experiment ID from the log.

    Returns "exp-001" for empty/nonexistent log, "exp-NNN" otherwise.
    """
    path = Path(log_path)
    if not path.exists() or path.stat().st_size == 0:
        return "exp-001"

    max_id = 0
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                exp_id = entry.get("experiment_id", "")
                if exp_id.startswith("exp-"):
                    num = int(exp_id.split("-")[1])
                    max_id = max(max_id, num)
            except (json.JSONDecodeError, ValueError, IndexError):
                continue

    return f"exp-{max_id + 1:03d}"


def log_tsv_row(
    tsv_path: str,
    experiment_id: str,
    status: str,
    config: dict,
    metrics: dict,
    description: str,
) -> None:
    """Append a TSV row to the results summary file.

    Creates header row if file doesn't exist or is empty.
    Columns: experiment_id, status, model_type, primary_metric, description, timestamp
    """
    path = Path(tsv_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    write_header = not path.exists() or path.stat().st_size == 0

    # Build metric columns dynamically
    metric_cols = "\t".join(str(v) for v in metrics.values())
    metric_headers = "\t".join(metrics.keys())

    row_data = [
        experiment_id,
        status,
        config.get("model_type", "unknown"),
        metric_cols,
        description.replace("\t", " "),
        datetime.now(timezone.utc).isoformat(),
    ]

    with open(path, "a") as f:
        if write_header:
            f.write(f"experiment_id\tstatus\tmodel_type\t{metric_headers}\tdescription\ttimestamp\n")
        f.write("\t".join(row_data) + "\n")


def log_experiment(
    log_path: str,
    experiment_id: str,
    config: dict,
    metrics: dict,
    model_path: str,
    description: str,
    status: str = "kept",
    git_commit: str | None = None,
    parent_experiment: str | None = None,
    hypothesis_id: str | None = None,
    family: str | None = None,
    tags: list[str] | None = None,
) -> None:
    """Append one experiment entry to the JSONL log.

    Args:
        log_path: Path to experiments/log.jsonl.
        experiment_id: e.g. "exp-001".
        config: Dict with model_type, hyperparams, features.
        metrics: Dict with metric values.
        model_path: Path to saved model artifact.
        description: Human-readable experiment description.
        status: "kept" or "discarded".
        git_commit: Optional git commit hash.
        parent_experiment: Optional parent experiment ID (for dependency tree).
        hypothesis_id: Optional hypothesis ID (links to hypotheses.yaml).
        family: Optional experiment family for strategic grouping.
        tags: Optional list of tags for categorization.
    """
    path = Path(log_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Load environment snapshot from train_metadata.json if available
    environment = None
    metadata_path = Path("train_metadata.json")
    if metadata_path.exists():
        try:
            with open(metadata_path) as mf:
                meta = json.load(mf)
            environment = meta.get("environment")
        except (json.JSONDecodeError, OSError):
            pass

    entry = {
        "experiment_id": experiment_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "git_commit": git_commit,
        "status": status,
        "parent_experiment": parent_experiment,
        "hypothesis_id": hypothesis_id,
        "family": family,
        "tags": tags or [],
        "config": config,
        "metrics": metrics,
        "model_path": model_path,
        "description": description,
        "environment": environment,
    }

    with open(path, "a") as f:
        f.write(json.dumps(entry) + "\n")

    # Also append TSV summary row
    tsv_path = str(Path(log_path).parent / "results.tsv")
    log_tsv_row(tsv_path, experiment_id, status, config, metrics, description)


def get_best_experiment(log_path: str, primary_metric: str = "accuracy", lower_is_better: bool = False) -> dict | None:
    """Return the experiment with the best primary metric among 'kept' entries.

    Args:
        log_path: Path to experiments/log.jsonl.
        primary_metric: Metric name to optimize.
        lower_is_better: True for metrics like MAE/MSE, False for accuracy/F1.

    Returns:
        Best experiment dict, or None if no 'kept' entries exist.
    """
    path = Path(log_path)
    if not path.exists():
        return None

    best = None
    best_value = float("inf") if lower_is_better else float("-inf")

    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue

            if entry.get("status") != "kept":
                continue

            value = entry.get("metrics", {}).get(primary_metric)
            if value is None:
                continue

            if lower_is_better and value < best_value:
                best_value = value
                best = entry
            elif not lower_is_better and value > best_value:
                best_value = value
                best = entry

    return best


def main() -> None:
    """CLI entry point for logging experiments."""
    parser = argparse.ArgumentParser(
        description="Log an experiment to the append-only JSONL experiment log.",
    )
    parser.add_argument("log_path", help="Path to experiments/log.jsonl")
    parser.add_argument("experiment_id", help='Experiment ID, e.g. "exp-001"')
    parser.add_argument("status", help='"kept" or "discarded"')
    parser.add_argument("metrics_json", help="Metrics as a JSON string")
    parser.add_argument("config_json", help="Config as a JSON string")
    parser.add_argument("model_path", help="Path to saved model artifact")
    parser.add_argument(
        "description",
        nargs="*",
        help="Human-readable experiment description",
    )
    parser.add_argument("--parent", default=None, help="Parent experiment ID")
    parser.add_argument("--hypothesis", default=None, help="Hypothesis ID")
    parser.add_argument("--family", default=None, help="Experiment family name")
    parser.add_argument("--tags", default=None, help="Comma-separated tags")

    args = parser.parse_args()

    metrics = json.loads(args.metrics_json)
    config = json.loads(args.config_json)
    description = " ".join(args.description) if args.description else ""
    tags = [t.strip() for t in args.tags.split(",")] if args.tags else None

    log_experiment(
        log_path=args.log_path,
        experiment_id=args.experiment_id,
        config=config,
        metrics=metrics,
        model_path=args.model_path,
        description=description,
        status=args.status,
        parent_experiment=args.parent,
        hypothesis_id=args.hypothesis,
        family=args.family,
        tags=tags,
    )
    print(f"Logged {args.experiment_id} ({args.status})")


if __name__ == "__main__":
    main()
```
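Likewise, a minimal sketch of the library API defined above; the paths and metric values are illustrative, under the same import assumption as before.

```python
# Illustrative only: log one run and query the best kept result back.
from scripts.log_experiment import (
    get_best_experiment, get_next_experiment_id, log_experiment,
)

exp_id = get_next_experiment_id("experiments/log.jsonl")  # "exp-001" on a fresh log
log_experiment(
    log_path="experiments/log.jsonl",
    experiment_id=exp_id,
    config={"model_type": "ridge", "alpha": 1.0},
    metrics={"accuracy": 0.84},
    model_path="models/exp-001.joblib",
    description="ridge baseline",
    status="kept",
    tags=["baseline"],
)

best = get_best_experiment("experiments/log.jsonl", primary_metric="accuracy")
print(best["experiment_id"], best["metrics"]["accuracy"])
```

The CLI in `main()` covers the same path from the shell, with metrics and config passed as JSON strings in the positional order log_path, experiment_id, status, metrics_json, config_json, model_path, description.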