leanlab 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,553 @@
1
+ """leanlab — generic live dashboard for a lab.
2
+
3
+ Schema-driven: it reads the lab's lab.json (the objective) and results.jsonl
4
+ (flexible metrics), so it works for any lab. Shows:
5
+ - PROGRESS chart: the objective metric across experiments + the best-so-far line.
6
+ - RESULTS table: every metric column, best row highlighted.
7
+ - LIVE STREAM: the running agent's messages, tool calls, timings, and cost.
8
+ - SESSIONS: each agent run (worker / director / critic) with its cost.
9
+
10
+ Run:
11
+ uv run python core/monitor.py --lab labs/house-prices
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import re
19
+ import sys
20
+ import time
21
+ import webbrowser
22
+ from datetime import datetime
23
+ from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
24
+ from pathlib import Path
25
+ from urllib.parse import urlparse, parse_qs
26
+
27
# A transcript modified less than this many seconds ago counts as "running".
RUNNING_WINDOW = 15

# Model-name substring -> (input, output, cache-read) rates.  _turn_cost()
# multiplies token counts by these and divides by 1e6, i.e. they are prices
# per million tokens.
PRICES = {
    "opus": (15.0, 75.0, 1.50),
    "sonnet": (3.0, 15.0, 0.30),
    "haiku": (0.80, 4.0, 0.08),
    "fable": (15.0, 75.0, 1.50),
}
# Rates used when a model name matches none of the PRICES keys.
_DEFAULT_PRICE = (15.0, 75.0, 1.50)

LAB = None  # set in main()
33
+
34
+
35
def cfg():
    """Read the lab's ``lab.json`` from disk and return it parsed.

    The file is re-read on every call, so edits made while the monitor is
    running are picked up.
    """
    config_path = LAB / "lab.json"
    raw = config_path.read_text()
    return json.loads(raw)
37
+
38
+
39
def objective():
    """Return ``(metric, direction)`` from the ``objective`` section of lab.json.

    Missing entries fall back to ``"score"`` and ``"max"`` respectively.
    """
    spec = cfg().get("objective", {})
    metric = spec.get("metric", "score")
    direction = spec.get("direction", "max")
    return metric, direction
42
+
43
+
44
+ # --- pricing / timestamps ---------------------------------------------------
45
def _price_for(model):
    """Return the rate tuple of the first PRICES key contained in *model*.

    Matching is a case-insensitive substring test; a missing model or one that
    matches no key gets ``_DEFAULT_PRICE``.
    """
    lowered = (model or "").lower()
    return next(
        (rates for key, rates in PRICES.items() if key in lowered),
        _DEFAULT_PRICE,
    )
51
+
52
+
53
def _turn_cost(model, u):
    """Estimate the cost of one assistant turn from its usage mapping *u*.

    Token counts that are missing or ``None`` count as zero.  Cache-creation
    tokens are charged at 1.25x the input rate.
    """
    in_rate, out_rate, cache_read_rate = _price_for(model)
    fresh_input = (u.get("input_tokens") or 0) * in_rate
    generated = (u.get("output_tokens") or 0) * out_rate
    cache_reads = (u.get("cache_read_input_tokens") or 0) * cache_read_rate
    cache_writes = (u.get("cache_creation_input_tokens") or 0) * in_rate * 1.25
    return (fresh_input + generated + cache_reads + cache_writes) / 1e6
58
+
59
+
60
+ def _parse_ts(ts):
61
+ if not ts:
62
+ return None
63
+ try:
64
+ return datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp()
65
+ except ValueError:
66
+ return None
67
+
68
+
69
+ def _clip(t, limit=200000):
70
+ if t is None:
71
+ return ""
72
+ t = str(t)
73
+ return t if len(t) <= limit else t[:limit] + " …(clipped)"
74
+
75
+
76
+ def _stringify(v):
77
+ if isinstance(v, str):
78
+ return v
79
+ try:
80
+ return json.dumps(v, ensure_ascii=False)
81
+ except (TypeError, ValueError):
82
+ return str(v)
83
+
84
+
85
+ # --- transcripts ------------------------------------------------------------
86
def transcript_dir():
    """Locate the directory under ``~/.claude/projects`` holding this lab's transcripts.

    The preferred candidate is the lab path with every ``/`` replaced by ``-``.
    If that directory does not exist, the alphabetically last entry whose name
    contains the lab's folder name is used; if nothing matches, the preferred
    (non-existent) path is returned anyway.
    """
    projects_root = Path.home() / ".claude" / "projects"
    preferred = projects_root / str(LAB).replace("/", "-")
    if not preferred.is_dir():
        candidates = sorted(projects_root.glob(f"*{LAB.name}*"))
        if candidates:
            return candidates[-1]
    return preferred
93
+
94
+
95
# Path of an experiment file as it appears inside a tool call's input.
_EXP_RE = re.compile(r"experiments/[A-Za-z0-9_]+\.py")


def _classify_role(first_user):
    """Map a session's first user prompt to "worker", "director", "critic" or "unknown"."""
    if not isinstance(first_user, str):
        return "unknown"
    prompt_lines = first_user.strip().splitlines()
    if not prompt_lines:
        # Empty or whitespace-only prompt: there is no first line to inspect.
        return "unknown"
    head = prompt_lines[0].lower()
    # New prompts start "You are the WORKER/DIRECTOR/CRITIC ..."; the older markers
    # ("read director.md", etc.) are kept so historical transcripts still classify.
    if head.startswith(("you are the director", "read director.md")):
        return "director"
    if head.startswith(("you are the critic", "read critic.md")):
        return "critic"
    if (head.startswith(("you are the worker", "read claude.md"))
            or "exactly one experiment" in head):
        return "worker"
    return "unknown"


def parse_session(path):
    """Parse one transcript (JSON-lines) file into session metadata and events.

    Args:
        path: ``pathlib.Path`` of the ``.jsonl`` transcript.

    Returns:
        ``(meta, events)``.  ``meta`` is ``{"role": ..., "artifact": ...}``:
        the role is derived from the first user prompt and the artifact is
        the last ``experiments/*.py`` path seen in a tool call (paths
        containing ``sample.py`` are ignored).  ``events`` is a list of dicts
        with ``kind`` in ``user`` / ``result`` / ``text`` / ``tool``.  The
        first event of each assistant record also carries that record's
        model, token counts and cost.  An unreadable file yields
        ``({"role": "unknown", "artifact": None}, [])``.

    Malformed input is skipped rather than raised: undecodable lines,
    records that are not JSON objects, and blocks with missing or null text.
    """
    events, first_user, artifact = [], None, None
    pending = {}  # tool_use id -> start time (s), used to time tool results
    try:
        raw = path.read_text(errors="replace")
    except OSError:
        return {"role": "unknown", "artifact": None}, []
    # Split on "\n" only: str.splitlines() also breaks on U+2028/U+0085 and
    # friends, which may appear unescaped inside JSON strings and would cut a
    # record in half.
    for line in raw.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(obj, dict):
            continue  # valid JSON but not a record (e.g. a bare number)
        kind = obj.get("type")
        msg = obj.get("message")
        if not isinstance(msg, dict):
            msg = {}
        ts = obj.get("timestamp")
        ts_s = _parse_ts(ts)
        if kind == "user":
            content = msg.get("content")
            if isinstance(content, str):
                if first_user is None:
                    first_user = content
                events.append({"kind": "user", "text": _clip(content), "ts": ts})
            elif isinstance(content, list):
                for b in content:
                    if not isinstance(b, dict):
                        continue
                    btype = b.get("type")
                    if btype == "tool_result":
                        dur = None
                        start = pending.pop(b.get("tool_use_id"), None)
                        if start is not None and ts_s is not None:
                            dur = round(ts_s - start, 2)
                        events.append({"kind": "result", "ts": ts, "dur": dur,
                                       "text": _clip(_stringify(b.get("content")))})
                    elif btype == "text":
                        text = b.get("text", "")
                        if first_user is None:
                            first_user = text
                        events.append({"kind": "user", "text": _clip(text), "ts": ts})
        elif kind == "assistant":
            u = msg.get("usage")
            if not isinstance(u, dict):
                u = {}
            turn = {"model": msg.get("model"), "in_tok": u.get("input_tokens"),
                    "out_tok": u.get("output_tokens"),
                    "cache_tok": u.get("cache_read_input_tokens"),
                    "cost": round(_turn_cost(msg.get("model"), u), 6) if u else None}
            firstb = True  # usage is attached to the record's first event only
            for b in msg.get("content") or []:
                if not isinstance(b, dict):
                    continue
                btype = b.get("type")
                if btype == "text":
                    text = b.get("text")
                    # Null, non-string or blank text produces no event.
                    if not (isinstance(text, str) and text.strip()):
                        continue
                    ev = {"kind": "text", "text": _clip(text), "ts": ts}
                elif btype == "tool_use":
                    raw_input = _stringify(b.get("input"))
                    m = _EXP_RE.search(raw_input)
                    if m and "sample.py" not in m.group(0):
                        artifact = m.group(0)
                    if ts_s is not None and b.get("id"):
                        pending[b["id"]] = ts_s
                    ev = {"kind": "tool", "name": b.get("name", "tool"),
                          "text": _clip(raw_input), "ts": ts}
                else:
                    continue
                if firstb:
                    ev.update(turn)
                    firstb = False
                events.append(ev)
    return {"role": _classify_role(first_user), "artifact": artifact}, events
175
+
176
+
177
# str(path) -> (mtime, summary) cache so unchanged transcripts are not re-parsed.
_META = {}


def session_meta(path):
    """Return a summary (role, artifact, event count, cost) for one transcript.

    The summary is cached per path and recomputed only when the file's
    modification time differs from the cached one.
    """
    stamp = path.stat().st_mtime
    key = str(path)
    cached = _META.get(key)
    if cached and cached[0] == stamp:
        return cached[1]
    meta, events = parse_session(path)
    spent = sum(ev.get("cost") or 0 for ev in events)
    summary = {
        "role": meta["role"],
        "artifact": meta["artifact"],
        "events": len(events),
        "cost": round(spent, 4),
    }
    _META[key] = (stamp, summary)
    return summary
190
+
191
+
192
def list_sessions():
    """List every transcript in the lab's transcript directory, newest first.

    Each entry is the ``session_meta`` summary plus ``id`` (file stem),
    ``mtime`` and ``running`` (modified within ``RUNNING_WINDOW`` seconds).
    """
    now = time.time()
    transcripts = list(transcript_dir().glob("*.jsonl"))
    transcripts.sort(key=lambda item: item.stat().st_mtime, reverse=True)
    sessions = []
    for transcript in transcripts:
        info = session_meta(transcript)
        modified = transcript.stat().st_mtime
        sessions.append({
            "id": transcript.stem,
            **info,
            "mtime": modified,
            "running": (now - modified) < RUNNING_WINDOW,
        })
    return sessions
199
+
200
+
201
+ # --- results ----------------------------------------------------------------
202
def read_results():
    """Load the lab's results file (JSON lines) as a list of dict rows.

    The file name comes from ``results_file`` in lab.json and defaults to
    ``results.jsonl``.  A missing file gives ``[]``.  Blank lines, lines
    that are not valid JSON, and lines whose JSON value is not an object are
    skipped, so every returned row supports ``.get``.
    """
    path = LAB / cfg().get("results_file", "results.jsonl")
    try:
        # EAFP instead of exists()+read: the file can vanish in between.
        # errors="replace" matches parse_session, so one bad byte cannot take
        # the whole dashboard down.
        text = path.read_text(errors="replace")
    except FileNotFoundError:
        return []
    rows = []
    # Split on "\n" only; str.splitlines() would also break on U+2028/U+0085,
    # which can occur unescaped inside JSON strings.
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        if isinstance(row, dict):
            rows.append(row)
    return rows
215
+
216
+
217
def best_value(rows):
    """Return the best objective-metric value found in *rows*, or ``None``.

    "Best" is the minimum when the objective direction is ``"min"`` and the
    maximum otherwise.  Rows whose metric is missing, not numeric, or not
    finite are ignored, as are rows that are not mappings.
    """
    import math

    metric, d = objective()
    vals = []
    for r in rows:
        try:
            v = float(r.get(metric))
        except (TypeError, ValueError, OverflowError, AttributeError):
            continue
        # NaN makes min()/max() order-dependent, and json.dumps would emit the
        # non-standard tokens NaN/Infinity, which the browser's JSON.parse
        # rejects, so non-finite scores never qualify as "best".
        if math.isfinite(v):
            vals.append(v)
    if not vals:
        return None
    return min(vals) if d == "min" else max(vals)
228
+
229
+
230
def latest_value(rows, metric):
    """Return *metric* of the last row in *rows* as a float, or ``None``.

    ``None`` is returned when *rows* is empty, the last row is not a mapping,
    or its metric is missing, not numeric, or not finite.
    """
    import math

    if not rows:
        return None
    try:
        value = float(rows[-1].get(metric))
    except (TypeError, ValueError, OverflowError, AttributeError):
        return None
    # NaN/Infinity would be serialised as non-standard JSON tokens that the
    # browser's JSON.parse rejects; report "no value" instead.
    return value if math.isfinite(value) else None
238
+
239
+
240
def total_cost(sessions):
    """Add up the ``cost`` of every session, rounded to 4 decimals.

    Sessions without a cost (missing or ``None``) contribute zero.
    """
    running_total = 0
    for session in sessions:
        running_total += session.get("cost") or 0
    return round(running_total, 4)
243
+
244
+
245
def build_state():
    """Assemble the dashboard snapshot sent to the browser.

    The snapshot combines the session list, the results rows, the objective,
    the best/latest metric values, the total cost and the text of
    ``Director_Notes.md`` / ``Critic_Feedback.md`` (empty when absent).
    ``active`` is the first running session, else the newest one, else None.
    """
    def _optional_text(filename):
        # Contents of a lab file, or "" when it does not exist.
        target = LAB / filename
        return target.read_text() if target.exists() else ""

    sessions = list_sessions()
    active = next((entry["id"] for entry in sessions if entry["running"]), None)
    if not active:
        active = sessions[0]["id"] if sessions else None
    rows = read_results()
    metric, direction = objective()
    snapshot = {
        "lab": cfg().get("name", LAB.name),
        "metric": metric,
        "direction": direction,
        "results": rows,
        "best": best_value(rows),
        "latest": latest_value(rows, metric),
        "total_cost": total_cost(sessions),
        "sessions": sessions,
        "active": active,
        "directions": _optional_text("Director_Notes.md"),
        "critique": _optional_text("Critic_Feedback.md"),
        "now": time.time(),
    }
    return snapshot
260
+
261
+
262
def session_payload(sid):
    """Return the full event stream and usage totals for session *sid*.

    Args:
        sid: Session id, i.e. the transcript file name without ``.jsonl``.
            It arrives from the HTTP query string and is treated as untrusted.

    Returns:
        A dict with ``id``, ``meta``, ``events``, ``totals`` and ``running``.
        For an unknown or invalid id, ``meta``/``totals`` are empty dicts,
        ``events`` is empty and ``running`` is ``False``.
    """
    filename = f"{sid}.jsonl"
    not_found = {"id": sid, "meta": {}, "events": [], "totals": {}, "running": False}
    # Refuse anything that is not a bare file name ("../x", "a/b", "/abs"),
    # so a request can never read outside the transcript directory.
    if Path(filename).name != filename:
        return not_found
    path = transcript_dir() / filename
    if not path.exists():
        return not_found
    meta, events = parse_session(path)
    tot = {"in": 0, "out": 0, "cost": 0.0, "turns": 0, "model": None}
    for e in events:
        # Only the first event of an assistant record carries usage figures.
        if e.get("cost") is not None:
            tot["in"] += e.get("in_tok") or 0
            tot["out"] += e.get("out_tok") or 0
            tot["cost"] += e.get("cost") or 0
            tot["turns"] += 1
            if e.get("model"):
                tot["model"] = e["model"]
    tot["cost"] = round(tot["cost"], 4)
    return {"id": sid, "meta": meta, "events": events, "totals": tot,
            "running": (time.time() - path.stat().st_mtime) < RUNNING_WINDOW}
277
+
278
+
279
+ PAGE = r"""<!doctype html><html><head><meta charset="utf-8">
280
+ <title>leanlab · monitor</title>
281
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.1/chart.umd.min.js"></script>
282
+ <style>
283
+ :root{--bg:#0d1117;--panel:#161b22;--panel2:#1c2230;--border:#2a3240;--text:#e6edf3;
284
+ --muted:#8b949e;--accent:#58a6ff;--good:#3fb950;--bad:#f85149;--tool:#a371f7}
285
+ *{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--text);
286
+ font:14px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif}
287
+ header{display:flex;align-items:center;gap:14px;padding:10px 16px;background:var(--panel);
288
+ border-bottom:1px solid var(--border);position:sticky;top:0;z-index:5}
289
+ header h1{font-size:15px;margin:0}.badge{font-size:12px;padding:3px 10px;border-radius:20px;
290
+ background:var(--panel2);border:1px solid var(--border);white-space:nowrap}.badge b{color:var(--good)}
291
+ .board{max-width:1200px;margin:0 auto;padding:14px;display:flex;flex-direction:column;gap:14px}
292
+ .stats{display:flex;gap:12px}
293
+ .chip{flex:1;background:var(--panel);border:1px solid var(--border);border-radius:10px;padding:11px 14px;min-width:0}
294
+ .chip .k{font-size:10px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted)}
295
+ .chip .v{font-size:24px;font-weight:500;margin-top:2px;font-variant-numeric:tabular-nums}
296
+ .chip .v.good{color:var(--good)}
297
+ .card{background:var(--panel);border:1px solid var(--border);border-radius:10px;overflow:hidden}
298
+ .card-head{padding:10px 14px;font-size:11px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);
299
+ cursor:pointer;user-select:none;display:flex;align-items:center;gap:8px;border-bottom:1px solid var(--border)}
300
+ .card-head::before{content:'▾';font-size:10px;color:var(--muted)}
301
+ .card.collapsed .card-head::before{content:'▸'}
302
+ .card.collapsed .card-head{border-bottom:none}
303
+ .card.collapsed .card-body{display:none}
304
+ .card-head .ht{font-size:11px;color:#6b7686;margin-left:auto;text-transform:none;letter-spacing:0}
305
+ .card-body{padding:12px 14px}
306
+ .row2{display:flex;gap:14px}.row2 .card{flex:1;min-width:0}
307
+ .run{display:flex;height:max(460px,70vh)}
308
+ .sesslist{width:236px;flex:0 0 auto;overflow:auto;padding:8px;border-right:1px solid var(--border)}
309
+ .streampane{flex:1;min-width:0;display:flex;flex-direction:column;background:#0a0e14}
310
+ .streamhead{font-size:11px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);padding:9px 12px 6px}
311
+ .stream{flex:1;overflow:auto;padding:0 12px 12px}
312
+ .sess{padding:8px 10px;border-radius:8px;border:1px solid var(--border);cursor:pointer;background:var(--panel2);margin-bottom:8px}
313
+ .sess:last-child{margin-bottom:0}
314
+ .sess.sel{border-color:var(--accent)}.sess .role{font-weight:600;text-transform:capitalize}
315
+ .sess .role.worker{color:var(--accent)}.sess .role.director{color:var(--tool)}.sess .role.critic{color:var(--bad)}
316
+ .sess .meta{font-size:12px;color:var(--muted);margin-top:2px}
317
+ .ev{margin-bottom:10px;border-radius:8px;padding:8px 11px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow:auto}
318
+ .ev.user{background:#11233b;border:1px solid #1f3a5f}.ev.text{background:var(--panel2)}
319
+ .ev.tool{background:#1a1430;border:1px solid #3a2a5a;font-family:ui-monospace,Menlo,monospace;font-size:12.5px}
320
+ .ev.tool .tn{color:var(--tool);font-weight:600}.ev.result{background:#10161d;border:1px solid var(--border);color:var(--muted);font-family:ui-monospace,Menlo,monospace;font-size:12px}
321
+ .ev .lbl{font-size:10px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);display:block;margin-bottom:3px}
322
+ .ev .lbl .tm{text-transform:none;letter-spacing:0;color:#6b7686;font-weight:400;float:right}
323
+ .tablewrap{overflow-x:auto}
324
+ table{width:100%;border-collapse:collapse;font-size:12px}th,td{text-align:left;padding:5px 8px;border-bottom:1px solid var(--border);white-space:nowrap}
325
+ th{color:var(--muted);position:sticky;top:0;background:var(--panel)}td.num{text-align:right;font-variant-numeric:tabular-nums}
326
+ tr.best td{background:rgba(63,185,80,.13)}pre.dir{white-space:pre-wrap;font-size:12px;margin:0;
327
+ max-height:320px;overflow:auto}
328
+ .empty{color:var(--muted);font-style:italic;padding:16px;text-align:center}
329
+ .chartbox{position:relative;height:300px}
330
+ </style></head><body>
331
+ <header><h1>🧪 <span id="labName">leanlab</span></h1>
332
+ <span class="badge" id="objBadge">objective · —</span>
333
+ <span class="badge" id="usageBadge">session · —</span>
334
+ <span style="flex:1"></span><span class="badge" id="clock">—</span></header>
335
+ <div class="board">
336
+ <div class="stats" id="stats"></div>
337
+ <section class="card" data-panel="run"><div class="card-head" onclick="toggleFold('run')">Sessions &amp; live stream</div>
338
+ <div class="card-body" style="padding:0"><div class="run">
339
+ <div class="sesslist" id="sessions"></div>
340
+ <div class="streampane"><div class="streamhead" id="streamTitle">Live stream</div>
341
+ <div class="stream" id="stream"><div class="empty">Waiting…</div></div></div>
342
+ </div></div></section>
343
+ <section class="card" data-panel="progress"><div class="card-head" onclick="toggleFold('progress')">Progress</div>
344
+ <div class="card-body"><div class="chartbox" id="chart"></div></div></section>
345
+ <section class="card" data-panel="results"><div class="card-head" onclick="toggleFold('results')">Results</div>
346
+ <div class="card-body"><div class="tablewrap" id="results"></div></div></section>
347
+ <div class="row2">
348
+ <section class="card" data-panel="critics"><div class="card-head" onclick="toggleFold('critics')">🔴 Critics</div>
349
+ <div class="card-body"><pre class="dir" id="critique">—</pre></div></section>
350
+ <section class="card" data-panel="director"><div class="card-head" onclick="toggleFold('director')">🧭 Director</div>
351
+ <div class="card-body"><pre class="dir" id="directions">—</pre></div></section></div></div>
352
+ <script>
353
+ let selected=null,follow=true,es=null,lastState=null,wantJump=false;
354
+ const $=id=>document.getElementById(id);
355
+ function esc(s){const d=document.createElement('div');d.textContent=s==null?'':s;return d.innerHTML}
356
+ function fmtK(n){if(n==null)return'';return n>=1000?(n/1000).toFixed(n>=10000?0:1)+'k':''+n}
357
+ function shortModel(m){return m?m.replace(/^claude-/,'').replace(/-\d{8}$/,''):''}
358
+ function relTime(s){s=Math.round(s);if(s<10)return'just now';if(s<60)return'under a min ago';
359
+ if(s<3600)return Math.floor(s/60)+' min ago';if(s<86400)return Math.floor(s/3600)+' hr ago';return Math.floor(s/86400)+' day ago'}
360
+ function evMeta(ev){const t=ev.ts?new Date(ev.ts).toLocaleTimeString():'';const b=[];
361
+ if(ev.model)b.push(shortModel(ev.model));if(ev.in_tok!=null||ev.out_tok!=null)b.push(fmtK(ev.in_tok)+'→'+fmtK(ev.out_tok)+' tok');
362
+ if(ev.cost)b.push('$'+ev.cost.toFixed(4));if(ev.dur!=null)b.push('⏱ '+ev.dur+'s');
363
+ const r=(t||'')+(b.length?' · '+b.join(' · '):'');return r?`<span class="tm">${r}</span>`:''}
364
+
365
+ const DEFAULT_COLLAPSED={};
366
+ function foldState(){try{return JSON.parse(localStorage.getItem('leanlab.folds'))||{}}catch(e){return {}}}
367
+ function applyFolds(){const f=foldState();document.querySelectorAll('.card[data-panel]').forEach(c=>{
368
+ const k=c.dataset.panel;const col=(k in f)?f[k]:!!DEFAULT_COLLAPSED[k];c.classList.toggle('collapsed',col)})}
369
+ function toggleFold(k){const c=document.querySelector('.card[data-panel="'+k+'"]');if(!c)return;
370
+ const now=!c.classList.contains('collapsed');c.classList.toggle('collapsed',now);
371
+ const f=foldState();f[k]=now;localStorage.setItem('leanlab.folds',JSON.stringify(f))}
372
+
373
+ function connect(){if(es)es.close();es=new EventSource('/api/stream'+(selected?'?id='+encodeURIComponent(selected):''));
374
+ es.addEventListener('open',()=>$('clock').textContent='● streaming');
375
+ es.addEventListener('error',()=>$('clock').textContent='○ reconnecting…');
376
+ es.addEventListener('state',e=>onState(JSON.parse(e.data)));
377
+ es.addEventListener('session',e=>renderStream(JSON.parse(e.data)))}
378
+
379
+ function chip(k,v,good){return `<div class="chip"><div class="k">${k}</div><div class="v${good?' good':''}">${v}</div></div>`}
380
+ function renderStats(st){$('stats').innerHTML=
381
+ chip('best '+(st.direction==='min'?'↓':'↑'),st.best==null?'—':st.best,true)+
382
+ chip('latest',st.latest==null?'—':st.latest)+
383
+ chip('experiments',st.results.length)+
384
+ chip('total cost','$'+(st.total_cost||0).toFixed(2))}
385
+
386
+ function onState(st){lastState=st;
387
+ $('labName').textContent=st.lab;
388
+ $('objBadge').textContent='objective · '+st.direction+' '+st.metric;
389
+ if(follow&&st.active&&st.active!==selected){selected=st.active;wantJump=true;connect();return}
390
+ if(!selected&&st.active){selected=st.active;wantJump=true;connect();return}
391
+ renderStats(st);renderSessions();renderResults(st);renderChart(st);
392
+ $('critique').textContent=st.critique||'—';$('directions').textContent=st.directions||'—'}
393
+
394
+ function renderSessions(){if(!lastState)return;const now=Date.now()/1000;
395
+ $('sessions').innerHTML=lastState.sessions.map(s=>{const run=(now-s.mtime)<15;
396
+ const sel=s.id===selected?' sel':'';const art=s.artifact?' · '+s.artifact.replace('experiments/',''):'';
397
+ const cost=s.cost?' · $'+s.cost.toFixed(2):'';const when=run?'running':relTime(now-s.mtime);
398
+ return `<div class="sess${sel}" onclick="pick('${s.id}')"><div class="role ${s.role}">${s.role}${run?' ●':''}</div>
399
+ <div class="meta">${s.events} msgs${cost} · ${when}${art}</div></div>`}).join('')||'<div class="empty">No sessions</div>'}
400
+
401
+ function renderResults(st){const rows=st.results.slice();if(!rows.length){$('results').innerHTML='<div class="empty">No results yet</div>';return}
402
+ const skip=new Set(['tag','best_so_far','notes','ts','experiment_file']);
403
+ const cols=[];rows.forEach(r=>Object.keys(r).forEach(k=>{if(!skip.has(k)&&!cols.includes(k))cols.push(k)}));
404
+ const m=st.metric;rows.sort((a,b)=>{const x=parseFloat(a[m]),y=parseFloat(b[m]);
405
+ if(isNaN(x))return 1;if(isNaN(y))return -1;return st.direction==='min'?x-y:y-x});
406
+ let h='<table><tr><th>experiment</th>'+cols.map(c=>`<th>${esc(c)}</th>`).join('')+'</tr>';
407
+ for(const r of rows){const isB=parseFloat(r[m])===st.best;
408
+ h+=`<tr class="${isB?'best':''}"><td title="${esc(r.notes)}">${esc((r.experiment_file||'').replace('experiments/',''))}</td>`+
409
+ cols.map(c=>{let v=r[c];if(v!=null&&typeof v==='object')v=JSON.stringify(v);return `<td class="num">${esc(v)}</td>`}).join('')+'</tr>'}
410
+ $('results').innerHTML=h+'</table>'}
411
+
412
+ let chartObj=null;
413
+ function renderChart(st){const box=$('chart'),m=st.metric,rows=st.results||[];
414
+ const pts=rows.map(r=>parseFloat(r[m])),valid=pts.filter(v=>!isNaN(v));
415
+ if(typeof Chart==='undefined'){box.innerHTML='<div class="empty">chart library unavailable (offline?)</div>';chartObj=null;return}
416
+ if(!valid.length){box.innerHTML='<div class="empty">No data yet</div>';chartObj=null;return}
417
+ if(!box.querySelector('canvas')){box.innerHTML='<canvas></canvas>';chartObj=null}
418
+ const labels=rows.map((r,i)=>(r.experiment_file||('#'+(i+1))).replace('experiments/','').replace(/\.py$/,''));
419
+ let best=null;const bestLine=pts.map(v=>{if(!isNaN(v))best=best==null?v:(st.direction==='min'?Math.min(best,v):Math.max(best,v));return best});
420
+ const ptCol=pts.map(v=>v===st.best?'#3fb950':'#58a6ff'),ptR=pts.map(v=>v===st.best?6:4);
421
+ if(chartObj){const d=chartObj.data;d.labels=labels;d.datasets[0].data=pts;
422
+ d.datasets[0].pointBackgroundColor=ptCol;d.datasets[0].pointBorderColor=ptCol;d.datasets[0].pointRadius=ptR;
423
+ d.datasets[1].data=bestLine;chartObj.options.scales.y.title.text=st.direction+' '+m;chartObj.update();return}
424
+ chartObj=new Chart(box.querySelector('canvas').getContext('2d'),{type:'line',
425
+ data:{labels,datasets:[
426
+ {label:m,data:pts,borderColor:'#58a6ff',backgroundColor:'#58a6ff',pointBackgroundColor:ptCol,pointBorderColor:ptCol,pointRadius:ptR,pointHoverRadius:7,tension:.25,spanGaps:true},
427
+ {label:'best so far',data:bestLine,borderColor:'#3fb950',borderDash:[6,4],pointRadius:0,stepped:true}]},
428
+ options:{responsive:true,maintainAspectRatio:false,animation:{duration:300},interaction:{mode:'index',intersect:false},
429
+ plugins:{legend:{labels:{color:'#8b949e',boxWidth:12,usePointStyle:true}},
430
+ tooltip:{callbacks:{label:c=>c.dataset.label+': '+c.formattedValue}}},
431
+ scales:{x:{ticks:{color:'#8b949e',maxRotation:0,autoSkip:true,autoSkipPadding:12},grid:{color:'#222b36'}},
432
+ y:{ticks:{color:'#8b949e'},grid:{color:'#222b36'},title:{display:true,text:st.direction+' '+m,color:'#8b949e'}}}}})}
433
+
434
+ function renderStream(s){$('streamTitle').textContent=(s.meta.role||'session')+' · '+(s.id||'').slice(0,8)+(s.running?' · LIVE':'');
435
+ const box=$('stream');const atB=box.scrollHeight-box.scrollTop-box.clientHeight<80;
436
+ box.innerHTML=s.events.map(ev=>{const m=evMeta(ev);
437
+ if(ev.kind==='tool')return `<div class="ev tool"><span class="lbl">tool ${m}</span><span class="tn">${esc(ev.name)}</span> → ${esc(ev.text)}</div>`;
438
+ if(ev.kind==='result')return `<div class="ev result"><span class="lbl">result ${m}</span>${esc(ev.text)}</div>`;
439
+ if(ev.kind==='user')return `<div class="ev user"><span class="lbl">prompt ${m}</span>${esc(ev.text)}</div>`;
440
+ return `<div class="ev text"><span class="lbl">assistant ${m}</span>${esc(ev.text)}</div>`}).join('')||'<div class="empty">No messages yet…</div>';
441
+ const tt=s.totals;if(tt&&tt.turns)$('usageBadge').innerHTML='session · <b>'+fmtK(tt.in+tt.out)+'</b> tok · <b>$'+tt.cost.toFixed(3)+'</b>';
442
+ if(wantJump||atB){box.scrollTop=box.scrollHeight;wantJump=false}}
443
+
444
+ function pick(id){selected=id;follow=false;wantJump=true;connect()}
445
+ applyFolds();setInterval(renderSessions,20000);connect();
446
+ </script></body></html>"""
447
+
448
+
449
class Handler(BaseHTTPRequestHandler):
    """Serves the dashboard page, JSON snapshots and the server-sent-event feed.

    Routes:
        ``/``             the HTML page
        ``/api/stream``   SSE feed of ``state`` and ``session`` events
        ``/api/state``    one ``build_state()`` snapshot as JSON
        ``/api/session``  one ``session_payload(id)`` as JSON
    """

    protocol_version = "HTTP/1.1"

    def handle(self):
        """Handle the connection, ignoring clients that disconnect abruptly."""
        try:
            super().handle()
        except (ConnectionResetError, BrokenPipeError, TimeoutError):
            pass

    def _send(self, body, ctype="application/json"):
        """Send a complete 200 response; *body* may be ``str`` or ``bytes``."""
        data = body.encode() if isinstance(body, str) else body
        self.send_response(200)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def _sse(self, event, payload):
        """Write one server-sent event and flush it to the client."""
        self.wfile.write(f"event: {event}\ndata: {payload}\n\n".encode())
        self.wfile.flush()

    def stream(self, sid):
        """Push ``state`` / ``session`` events once per second until the client leaves.

        A ``state`` event is sent whenever the change signature differs from
        the last one sent; a ``session`` event whenever the selected
        transcript's mtime changes.  A comment line is sent as keep-alive
        when more than 15 seconds have passed since the previous one.
        """
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        # The body has neither Content-Length nor chunked encoding, so under
        # HTTP/1.1 its end can only be signalled by closing the connection.
        self.send_header("Connection", "close")
        self.end_headers()
        last_sig = last_mt = None
        last_ping = 0.0
        try:
            while True:
                st = build_state()
                sig = (tuple((s["id"], round(s["mtime"], 2), s["running"]) for s in st["sessions"]),
                       st["active"], st["best"], hash(st["directions"]), hash(st["critique"]),
                       len(st["results"]))
                if sig != last_sig:
                    self._sse("state", json.dumps(st))
                    last_sig = sig
                if sid:
                    p = transcript_dir() / f"{sid}.jsonl"
                    mt = p.stat().st_mtime if p.exists() else None
                    if mt != last_mt:
                        self._sse("session", json.dumps(session_payload(sid)))
                        last_mt = mt
                if time.time() - last_ping > 15:
                    self.wfile.write(b": ping\n\n")
                    self.wfile.flush()
                    last_ping = time.time()
                time.sleep(1)
        except (BrokenPipeError, ConnectionResetError, OSError):
            return
        except Exception as exc:  # noqa: BLE001
            # The 200 headers are already on the wire, so an HTTP 500 can no
            # longer be delivered.  Leave a note as an SSE comment (ignored by
            # EventSource) and end the stream; the browser reconnects itself.
            try:
                self.wfile.write(f": stream aborted: {exc!r}\n\n".encode("utf-8", "replace"))
                self.wfile.flush()
            except OSError:
                pass

    def do_GET(self):
        """Dispatch a GET request to the matching route (404 otherwise)."""
        route = urlparse(self.path)
        try:
            if route.path == "/":
                self._send(PAGE, "text/html; charset=utf-8")
            elif route.path == "/api/stream":
                self.stream(parse_qs(route.query).get("id", [""])[0])
            elif route.path == "/api/state":
                self._send(json.dumps(build_state()))
            elif route.path == "/api/session":
                self._send(json.dumps(session_payload(parse_qs(route.query).get("id", [""])[0])))
            else:
                self.send_error(404)
        except (BrokenPipeError, ConnectionResetError):
            pass
        except Exception as exc:  # noqa: BLE001
            try:
                # Pass the text as `explain` (HTML-escaped body), not as the
                # status-line message: exception text may contain newlines or
                # characters that cannot be encoded in a status line.
                self.send_error(500, explain=str(exc))
            except OSError:
                pass

    def log_message(self, *a):
        """Silence the default per-request logging."""
        pass
519
+
520
+
521
class QuietServer(ThreadingHTTPServer):
    """Threading HTTP server that stays silent when a client drops the connection."""

    daemon_threads = True

    def handle_error(self, request, client_address):
        """Defer to the default error report unless the error is a client disconnect."""
        _, current, _ = sys.exc_info()
        disconnects = (ConnectionResetError, BrokenPipeError, TimeoutError)
        if not isinstance(current, disconnects):
            super().handle_error(request, client_address)
529
+
530
+
531
def main():
    """Parse the command line, then serve the dashboard until interrupted.

    Exits with status 1 when the lab directory has no ``lab.json`` or when
    the requested port cannot be bound.
    """
    global LAB
    p = argparse.ArgumentParser(description="leanlab dashboard")
    p.add_argument("--lab", required=True)
    p.add_argument("--port", type=int, default=8765)
    p.add_argument("--no-open", action="store_true")
    args = p.parse_args()
    LAB = Path(args.lab).resolve()
    if not (LAB / "lab.json").exists():
        print(f"ERROR: no lab.json in {LAB}", file=sys.stderr)
        sys.exit(1)
    # Bind before announcing the URL or opening the browser: a busy port is
    # then reported cleanly, and the page never loads against a socket that
    # is not listening yet.
    try:
        server = QuietServer(("127.0.0.1", args.port), Handler)
    except OSError as exc:
        print(f"ERROR: cannot listen on 127.0.0.1:{args.port}: {exc}", file=sys.stderr)
        sys.exit(1)
    # Use the bound port so that `--port 0` (OS-assigned) shows a usable URL.
    url = f"http://127.0.0.1:{server.server_address[1]}"
    print(f"leanlab monitor: {url} (lab: {LAB.name})")
    if not args.no_open:
        webbrowser.open(url)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nstopped.")
    finally:
        server.server_close()


if __name__ == "__main__":
    main()
@@ -0,0 +1,52 @@
1
+ # Your job — experimenter
2
+
3
+ ## Who you are
4
+
5
+ You are a **proactive, true researcher** running experiments in this lab. Read
6
+ `task.md` first — it states the goal, the data, and the **experiment contract**
7
+ (exactly what your file must define) and how you are judged.
8
+
9
+ Work like a real scientist:
10
+ - **Research the web** for state-of-the-art methods for this task.
11
+ - **Use any technique** — statistics, machine learning, anything that helps.
12
+ - **Install any library** you need with `uv add`, then `import` and use it.
13
+ - **Use skills and subagents** to explore sub-problems.
14
+
15
+ You have FULL tools and full permission. Boring repeats of ideas already in
16
+ memory are a **failure** — push the frontier.
17
+
18
+ You do **not** score experiments. A separate loop scores them after you finish.
19
+ You never run or read `evaluation.py`. You only run the validate command.
20
+
21
+ Each time you are launched fresh, do **exactly ONE** experiment, then stop.
22
+
23
+ ---
24
+
25
+ ## One experiment = these 4 steps
26
+
27
+ 1. **Look at memory, the Director's notes, and the Critics' feedback** (all in
28
+ your prompt). Do not repeat an idea already tried; fix the flaws the Critics
29
+ named. Aim straight at the objective in `task.md`.
30
+ 2. **Write ONE new idea** as a NEW file in the experiments folder (named in
31
+ `task.md`), e.g. `experiments/<tag>_<NN>.py`. One idea per file, following the
32
+ contract. Put a one-line docstring at the top.
33
+ 3. **Validate it** until it passes — run the validate command shown in `task.md`
34
+ (it must print `VALID`). Fix and re-run until valid.
35
+ 4. **Report and stop.** Your FINAL message must be **only** this JSON object —
36
+ no markdown, no fence:
37
+ ```
38
+ {"experiment_file": "experiments/<your_file>.py", "valid": true, "notes": "one line"}
39
+ ```
40
+
41
+ ## If asked to FIX
42
+
43
+ You may be relaunched in the same session: "You were working on X. It failed: …".
44
+ Open that file, fix the cause, re-validate until `VALID`, reply with the same
45
+ JSON object, then stop.
46
+
47
+ ## Rules
48
+
49
+ - Create/edit files only inside the experiments folder.
50
+ - Never edit `results.jsonl`, `Director_Notes.md`, or `Critic_Feedback.md`.
51
+ - Never run or read `evaluation.py` — that is the loop's job.
52
+ - You may install libraries with `uv add` if your idea needs them.
@@ -0,0 +1,38 @@
1
+ # The Critics — Hypercritical red-team
2
+
3
+ ## Who you are
4
+
5
+ You are a **brutally skeptical team of reviewers**. Your one job is to **find what
6
+ is wrong** with the experiments. You assume every experiment is broken until
7
+ proven otherwise. You are never polite, never vague — precise and evidence-based.
8
+
9
+ You judge the *result and the code*, not the *method*. Machine learning, exotic
10
+ libraries, and web-researched techniques are all welcome — attack them on the
11
+ evidence, never dismiss them for being fancy.
12
+
13
+ ## What you hunt for
14
+
15
+ Read `task.md` to know the objective, then for the newest experiments check for:
16
+ - **Overfitting / leakage** — does it generalize, or memorize the training data?
17
+ Any peek at the held-out test set or target leakage?
18
+ - **Doesn't actually work** — fails the objective, or barely moves it.
19
+ - **Fragility** — one hyper-parameter nudge and it collapses.
20
+ - **Fake novelty** — basically a copy of an existing experiment with a new name.
21
+ - **Curve-fitting** — numbers hand-tuned to this exact dataset.
22
+
23
+ ## What to write
24
+
25
+ Rewrite `Critic_Feedback.md` fresh each time. Keep it short and savage:
26
+ 1. **Verdict on the latest experiment** — 2-4 lines, name the file and the exact
27
+ suspicious lines.
28
+ 2. **Flaws across the lab** — recurring weaknesses.
29
+ 3. **What the next experimenter must prove** — concrete guards/tests.
30
+ 4. **Do-not-trust list** — results that look good but smell like luck/overfit.
31
+
32
+ ## Rules
33
+
34
+ - Write **only** `Critic_Feedback.md`. Do not edit experiments, `results.jsonl`,
35
+ or any frozen file. Do not run `evaluation.py`.
36
+ - Every criticism names the file and the reason.
37
+ - Your feedback is injected into the next experimenter's prompt as "The team of
38
+ Critics said: …". Make it count.