leanlab 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- leanlab/__init__.py +1 -0
- leanlab/cli.py +315 -0
- leanlab/core/__init__.py +1 -0
- leanlab/core/agents/__init__.py +10 -0
- leanlab/core/agents/claude.py +38 -0
- leanlab/core/agents/port.py +49 -0
- leanlab/core/agents/protocol.py +64 -0
- leanlab/core/coding/__init__.py +1 -0
- leanlab/core/coding/board.py +335 -0
- leanlab/core/coding/board_dist/assets/index-BBCkNArL.css +1 -0
- leanlab/core/coding/board_dist/assets/index-CNGMDAuO.js +40 -0
- leanlab/core/coding/board_dist/index.html +13 -0
- leanlab/core/coding/engineer.py +304 -0
- leanlab/core/coding/gate.py +63 -0
- leanlab/core/coding/personas.py +23 -0
- leanlab/core/coding/playbook.py +47 -0
- leanlab/core/coding/spec.py +232 -0
- leanlab/core/doctor.py +220 -0
- leanlab/core/init.py +219 -0
- leanlab/core/loop.py +374 -0
- leanlab/core/monitor.py +553 -0
- leanlab/templates/agents/CLAUDE.md +52 -0
- leanlab/templates/agents/critic.md +38 -0
- leanlab/templates/agents/director.md +37 -0
- leanlab/templates/agents/engineer.md +12 -0
- leanlab/templates/agents/reviewer.md +34 -0
- leanlab/templates/agents/techlead.md +7 -0
- leanlab/templates/skill/SKILL.md +99 -0
- leanlab-0.2.1.dist-info/METADATA +273 -0
- leanlab-0.2.1.dist-info/RECORD +33 -0
- leanlab-0.2.1.dist-info/WHEEL +4 -0
- leanlab-0.2.1.dist-info/entry_points.txt +2 -0
- leanlab-0.2.1.dist-info/licenses/LICENSE +21 -0
leanlab/core/monitor.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
"""leanlab — generic live dashboard for a lab.
|
|
2
|
+
|
|
3
|
+
Schema-driven: it reads the lab's lab.json (the objective) and results.jsonl
|
|
4
|
+
(flexible metrics), so it works for any lab. Shows:
|
|
5
|
+
- PROGRESS chart: the objective metric across experiments + the best-so-far line.
|
|
6
|
+
- RESULTS table: every metric column, best row highlighted.
|
|
7
|
+
- LIVE STREAM: the running agent's messages, tool calls, timings, and cost.
|
|
8
|
+
- SESSIONS: each agent run (worker / director / critic) with its cost.
|
|
9
|
+
|
|
10
|
+
Run:
|
|
11
|
+
uv run python core/monitor.py --lab labs/house-prices
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
import time
|
|
21
|
+
import webbrowser
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from urllib.parse import urlparse, parse_qs
|
|
26
|
+
|
|
27
|
+
# A transcript whose mtime is less than this many seconds old is reported as "running".
RUNNING_WINDOW = 15
# Per-model rate triples (input, output, cache-read). _turn_cost() multiplies token
# counts by these and divides by 1e6, i.e. the rates are per million tokens.
# Keys are matched as substrings of the lower-cased model name (see _price_for).
PRICES = {"opus": (15.0, 75.0, 1.50), "sonnet": (3.0, 15.0, 0.30),
          "haiku": (0.80, 4.0, 0.08), "fable": (15.0, 75.0, 1.50)}
# Rate triple used when no PRICES key occurs in the model name.
_DEFAULT_PRICE = (15.0, 75.0, 1.50)

# Lab directory as a resolved pathlib.Path; None until main() assigns it.
LAB = None  # set in main()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def cfg():
    """Load and return the parsed contents of the lab's ``lab.json``.

    The file is re-read on every call, so edits made while the dashboard is
    running are picked up. It is decoded explicitly as UTF-8 instead of the
    platform's locale encoding, so non-ASCII lab names and metric names load
    the same way on every OS.

    Raises:
        OSError: the file cannot be read.
        json.JSONDecodeError: the file is not valid JSON.
    """
    return json.loads((LAB / "lab.json").read_text(encoding="utf-8"))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def objective():
    """Return ``(metric, direction)`` taken from the ``objective`` entry of lab.json.

    A missing ``objective`` entry, or a missing key inside it, falls back to
    metric ``"score"`` and direction ``"max"``.
    """
    spec = cfg().get("objective", {})
    metric = spec.get("metric", "score")
    direction = spec.get("direction", "max")
    return metric, direction
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# --- pricing / timestamps ---------------------------------------------------
|
|
45
|
+
def _price_for(model):
    """Return the rate triple for ``model``.

    The first PRICES key that occurs as a substring of the lower-cased model
    name wins; a falsy or unrecognised model gets ``_DEFAULT_PRICE``.
    """
    lowered = (model or "").lower()
    return next((rates for key, rates in PRICES.items() if key in lowered),
                _DEFAULT_PRICE)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _turn_cost(model, u):
    """Return the cost of one assistant turn from its ``usage`` mapping ``u``.

    Missing or null token counts count as zero. Cache-creation tokens are
    charged at 1.25x the input rate; the weighted sum is divided by 1e6.
    """
    in_rate, out_rate, cache_rate = _price_for(model)
    fresh_in = u.get("input_tokens") or 0
    produced = u.get("output_tokens") or 0
    cache_hits = u.get("cache_read_input_tokens") or 0
    cache_writes = u.get("cache_creation_input_tokens") or 0
    weighted = (fresh_in * in_rate + produced * out_rate
                + cache_hits * cache_rate
                + cache_writes * in_rate * 1.25)
    return weighted / 1e6
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _parse_ts(ts):
    """Convert an ISO-8601 timestamp string to epoch seconds.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing.

    Returns:
        The timestamp as a float, or ``None`` when ``ts`` is empty, is not a
        string, or cannot be parsed or converted.
    """
    # Transcript fields are untrusted JSON: a numeric or otherwise non-string
    # timestamp must yield None rather than raise AttributeError on .replace().
    if not ts or not isinstance(ts, str):
        return None
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp()
    except (ValueError, OverflowError, OSError):
        # ValueError: malformed string. OverflowError/OSError: a naive timestamp
        # outside the range the platform can convert.
        return None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _clip(t, limit=200000):
    """Return ``t`` as a string, truncated to ``limit`` characters.

    ``None`` becomes the empty string. Text longer than ``limit`` is cut and
    a clipped marker is appended.
    """
    if t is None:
        return ""
    text = str(t)
    if len(text) > limit:
        return text[:limit] + " …(clipped)"
    return text
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _stringify(v):
    """Return ``v`` as display text.

    Strings pass through untouched. Anything else is JSON-encoded without
    ASCII escaping, falling back to ``str(v)`` when it is not serialisable.
    """
    if not isinstance(v, str):
        try:
            v = json.dumps(v, ensure_ascii=False)
        except (TypeError, ValueError):
            v = str(v)
    return v
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# --- transcripts ------------------------------------------------------------
|
|
86
|
+
def transcript_dir():
    """Return the directory expected to hold this lab's session transcripts.

    The first choice is ``~/.claude/projects/<LAB path with "/" replaced by "-">``.
    If that is not a directory, the lexicographically last *directory* under
    ``~/.claude/projects`` whose name contains the lab's folder name is used.
    If nothing matches, the (non-existent) first-choice path is returned, so
    callers that glob it simply find no transcripts.
    """
    from glob import escape  # only the fallback search below needs it

    base = Path.home() / ".claude" / "projects"
    exact = base / str(LAB).replace("/", "-")
    if exact.is_dir():
        return exact
    # Escape the lab name so characters such as "[" are matched literally, and
    # keep directories only: a stray file containing the lab name must not be
    # picked as the transcript directory.
    pattern = f"*{escape(LAB.name)}*"
    matches = sorted(p for p in base.glob(pattern) if p.is_dir())
    return matches[-1] if matches else exact
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# Matches an experiment file path such as "experiments/foo_01.py" inside tool input.
_EXP_RE = re.compile(r"experiments/[A-Za-z0-9_]+\.py")


def _classify_role(first_user):
    """Map a session's first user prompt to "director", "critic", "worker" or "unknown"."""
    if not isinstance(first_user, str):
        return "unknown"
    prompt_lines = first_user.strip().splitlines()
    if not prompt_lines:
        # Empty or whitespace-only prompt: there is no first line to inspect.
        return "unknown"
    head = prompt_lines[0].lower()
    # New prompts start "You are the WORKER/DIRECTOR/CRITIC ..."; the older markers
    # ("read director.md", etc.) are kept so historical transcripts still classify.
    if head.startswith(("you are the director", "read director.md")):
        return "director"
    if head.startswith(("you are the critic", "read critic.md")):
        return "critic"
    if (head.startswith(("you are the worker", "read claude.md"))
            or "exactly one experiment" in head):
        return "worker"
    return "unknown"


def parse_session(path):
    """Parse one JSON-lines transcript file into ``(meta, events)``.

    Args:
        path: ``pathlib.Path`` of the transcript.

    Returns:
        A 2-tuple ``(meta, events)``:

        * ``meta`` is ``{"role": ..., "artifact": ...}``. ``role`` is derived
          from the first line of the first user prompt. ``artifact`` is the
          last ``experiments/<name>.py`` path seen in a tool call's input
          (matches containing ``sample.py`` are ignored), or ``None``.
        * ``events`` is a list of dicts in file order, with ``kind`` one of
          ``"user"``, ``"result"`` (tool result, with ``dur`` seconds since
          the matching tool call when both timestamps parse), ``"text"`` or
          ``"tool"``. The first event emitted for each assistant message also
          carries ``model``, ``in_tok``, ``out_tok``, ``cache_tok`` and ``cost``.

        An unreadable file yields ``({"role": "unknown", "artifact": None}, [])``.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped. Malformed fields inside a record are ignored rather
    than raised, because one bad record must not take the dashboard down.
    """
    events, first_user, artifact, pending = [], None, None, {}
    try:
        # Decode explicitly as UTF-8 (the locale default differs per platform);
        # undecodable bytes are replaced rather than raised.
        lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return {"role": "unknown", "artifact": None}, []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(obj, dict):
            # Valid JSON but not a record (bare list, number, string, ...).
            continue
        kind = obj.get("type")
        msg = obj.get("message")
        if not isinstance(msg, dict):
            msg = {}
        ts = obj.get("timestamp")
        ts_s = _parse_ts(ts) if isinstance(ts, str) else None
        if kind == "user":
            content = msg.get("content")
            if isinstance(content, str):
                if first_user is None:
                    first_user = content
                events.append({"kind": "user", "text": _clip(content), "ts": ts})
            elif isinstance(content, list):
                for b in content:
                    if not isinstance(b, dict):
                        continue
                    if b.get("type") == "tool_result":
                        dur = None
                        # Pair the result with its tool call to get a duration.
                        start = pending.pop(b.get("tool_use_id"), None)
                        if start is not None and ts_s is not None:
                            dur = round(ts_s - start, 2)
                        events.append({"kind": "result", "ts": ts, "dur": dur,
                                       "text": _clip(_stringify(b.get("content")))})
                    elif b.get("type") == "text":
                        if first_user is None:
                            first_user = b.get("text", "")
                        events.append({"kind": "user", "text": _clip(b.get("text", "")), "ts": ts})
        elif kind == "assistant":
            u = msg.get("usage")
            if not isinstance(u, dict):
                u = {}
            model = msg.get("model")
            turn = {"model": model, "in_tok": u.get("input_tokens"),
                    "out_tok": u.get("output_tokens"),
                    "cache_tok": u.get("cache_read_input_tokens"),
                    "cost": round(_turn_cost(model, u), 6) if u else None}
            blocks = msg.get("content")
            if not isinstance(blocks, list):
                blocks = []
            firstb = True  # usage/cost is attached to the first emitted event only
            for b in blocks:
                if not isinstance(b, dict):
                    continue
                btype = b.get("type")
                ev = None
                if btype == "text":
                    text = b.get("text")
                    if isinstance(text, str) and text.strip():
                        ev = {"kind": "text", "text": _clip(text), "ts": ts}
                elif btype == "tool_use":
                    raw = _stringify(b.get("input"))
                    m = _EXP_RE.search(raw)
                    if m and "sample.py" not in m.group(0):
                        artifact = m.group(0)
                    if ts_s is not None and b.get("id"):
                        pending[b["id"]] = ts_s
                    ev = {"kind": "tool", "name": b.get("name", "tool"), "text": _clip(raw), "ts": ts}
                if ev is None:
                    continue
                if firstb:
                    ev.update(turn)
                    firstb = False
                events.append(ev)
    return {"role": _classify_role(first_user), "artifact": artifact}, events
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# Cache of session summaries: str(path) -> (mtime, info).
_META = {}


def session_meta(path):
    """Return the summary ``{"role", "artifact", "events", "cost"}`` for a transcript.

    The summary is cached per path together with the file's mtime and is
    recomputed through ``parse_session`` only when that mtime changes.
    ``events`` is the number of parsed events; ``cost`` is the sum of their
    per-event costs rounded to 4 decimals.
    """
    stamp = path.stat().st_mtime
    key = str(path)
    cached = _META.get(key)
    if cached and cached[0] == stamp:
        return cached[1]
    meta, events = parse_session(path)
    spend = sum(ev.get("cost") or 0 for ev in events)
    summary = {
        "role": meta["role"],
        "artifact": meta["artifact"],
        "events": len(events),
        "cost": round(spend, 4),
    }
    _META[key] = (stamp, summary)
    return summary
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def list_sessions():
    """Return one summary dict per ``*.jsonl`` transcript, newest first.

    Each entry is the ``session_meta`` summary plus ``id`` (the file stem),
    ``mtime`` and ``running`` (True when the file was modified less than
    ``RUNNING_WINDOW`` seconds before this call started).
    """
    now = time.time()
    transcripts = sorted(transcript_dir().glob("*.jsonl"),
                         key=lambda f: f.stat().st_mtime, reverse=True)
    sessions = []
    for f in transcripts:
        summary = session_meta(f)
        modified = f.stat().st_mtime
        sessions.append({"id": f.stem, **summary, "mtime": modified,
                         "running": (now - modified) < RUNNING_WINDOW})
    return sessions
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# --- results ----------------------------------------------------------------
|
|
202
|
+
def read_results():
    """Return the lab's result rows as a list of dicts, in file order.

    The file name comes from ``results_file`` in lab.json (default
    ``results.jsonl``), relative to the lab directory. A missing file yields
    an empty list. Blank lines, lines that are not valid JSON, and JSON
    values that are not objects are skipped.

    The non-standard constants ``NaN``, ``Infinity`` and ``-Infinity`` are
    loaded as ``None``: ``json.dumps`` would write them back as bare tokens,
    which the browser's ``JSON.parse`` rejects.
    """
    path = LAB / cfg().get("results_file", "results.jsonl")
    try:
        # Read directly instead of exists()-then-read (the file may disappear
        # in between); decode as UTF-8 rather than the locale default.
        text = path.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        return []
    rows = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            row = json.loads(line, parse_constant=lambda _name: None)
        except json.JSONDecodeError:
            continue
        if isinstance(row, dict):
            # Downstream code calls row.get(...), so keep objects only.
            rows.append(row)
    return rows
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def best_value(rows):
    """Return the best objective-metric value found in ``rows``, or ``None``.

    "Best" is the minimum when the objective direction is ``"min"``, otherwise
    the maximum. Rows that are not dicts, rows whose metric is missing or not
    numeric, and non-finite values (NaN, +/-inf) are ignored: a NaN would make
    ``min``/``max`` order-dependent and would be emitted as invalid JSON.
    """
    import math  # local import keeps this block self-contained

    metric, d = objective()
    vals = []
    for r in rows:
        try:
            v = float(r.get(metric))
        except (AttributeError, TypeError, ValueError):
            continue
        if math.isfinite(v):
            vals.append(v)
    if not vals:
        return None
    return min(vals) if d == "min" else max(vals)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def latest_value(rows, metric):
    """Return the metric of the most recent (last) experiment as a float.

    Returns ``None`` when there are no rows, when the last row is not a dict,
    or when its metric is missing, not numeric, or not finite (NaN, +/-inf).
    """
    import math  # local import keeps this block self-contained

    if not rows:
        return None
    try:
        value = float(rows[-1].get(metric))
    except (AttributeError, TypeError, ValueError):
        return None
    return value if math.isfinite(value) else None
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def total_cost(sessions):
    """Add up the ``cost`` of every session summary, rounded to 4 decimals.

    Sessions with a missing or null cost contribute nothing.
    """
    running = 0
    for entry in sessions:
        running += entry.get("cost") or 0
    return round(running, 4)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _read_optional(path):
    """Return the UTF-8 text of ``path``, or "" when it is missing or unreadable."""
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""


def build_state():
    """Assemble the dashboard snapshot sent to the browser.

    Returns a JSON-serialisable dict with:

    * ``lab``: ``name`` from lab.json, else the lab folder name;
    * ``metric`` / ``direction``: the objective;
    * ``results``, ``best``, ``latest``: the result rows and derived values;
    * ``sessions``, ``total_cost``: per-session summaries and their cost sum;
    * ``active``: id of the first running session, else of the newest
      session, else ``None``;
    * ``directions`` / ``critique``: text of ``Director_Notes.md`` and
      ``Critic_Feedback.md`` in the lab folder ("" when absent);
    * ``now``: the current epoch time.
    """
    sessions = list_sessions()
    running = next((s["id"] for s in sessions if s["running"]), None)
    active = running or (sessions[0]["id"] if sessions else None)
    rows = read_results()
    metric, d = objective()
    return {
        "lab": cfg().get("name", LAB.name),
        "metric": metric, "direction": d,
        "results": rows, "best": best_value(rows),
        "latest": latest_value(rows, metric), "total_cost": total_cost(sessions),
        "sessions": sessions, "active": active,
        # Read in one step (no exists()-then-read race) and always as UTF-8.
        "directions": _read_optional(LAB / "Director_Notes.md"),
        "critique": _read_optional(LAB / "Critic_Feedback.md"),
        "now": time.time(),
    }
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def session_payload(sid):
    """Return the full event stream and token/cost totals for one session.

    Args:
        sid: session id, i.e. the stem of a transcript file in the transcript
            directory. It arrives from the HTTP query string, so it is
            treated as untrusted.

    Returns:
        ``{"id", "meta", "events", "totals", "running"}`` for an existing
        transcript. ``totals`` holds summed ``in``/``out`` tokens, ``cost``
        (rounded to 4 decimals), the number of costed ``turns`` and the last
        ``model`` seen. For an empty, unsafe or unknown ``sid`` the result is
        ``{"id": sid, "meta": {}, "events": [], "totals": {}}``.
    """
    empty = {"id": sid, "meta": {}, "events": [], "totals": {}}
    # Refuse anything that is not a plain file stem: a separator would let the
    # id escape the transcript directory (e.g. "../../other/file").
    if (not isinstance(sid, str) or not sid
            or any(ch in sid for ch in ("/", "\\", "\0"))):
        return empty
    path = transcript_dir() / f"{sid}.jsonl"
    try:
        # stat() doubles as the existence check, so the file cannot vanish
        # between a separate exists() and stat().
        mtime = path.stat().st_mtime
    except OSError:
        return empty
    meta, events = parse_session(path)
    tot = {"in": 0, "out": 0, "cost": 0.0, "turns": 0, "model": None}
    for e in events:
        # Only the first event of each assistant message carries usage/cost.
        if e.get("cost") is not None:
            tot["in"] += e.get("in_tok") or 0
            tot["out"] += e.get("out_tok") or 0
            tot["cost"] += e.get("cost") or 0
            tot["turns"] += 1
        if e.get("model"):
            tot["model"] = e["model"]
    tot["cost"] = round(tot["cost"], 4)
    return {"id": sid, "meta": meta, "events": events, "totals": tot,
            "running": (time.time() - mtime) < RUNNING_WINDOW}
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
PAGE = r"""<!doctype html><html><head><meta charset="utf-8">
|
|
280
|
+
<title>leanlab · monitor</title>
|
|
281
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.1/chart.umd.min.js"></script>
|
|
282
|
+
<style>
|
|
283
|
+
:root{--bg:#0d1117;--panel:#161b22;--panel2:#1c2230;--border:#2a3240;--text:#e6edf3;
|
|
284
|
+
--muted:#8b949e;--accent:#58a6ff;--good:#3fb950;--bad:#f85149;--tool:#a371f7}
|
|
285
|
+
*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--text);
|
|
286
|
+
font:14px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif}
|
|
287
|
+
header{display:flex;align-items:center;gap:14px;padding:10px 16px;background:var(--panel);
|
|
288
|
+
border-bottom:1px solid var(--border);position:sticky;top:0;z-index:5}
|
|
289
|
+
header h1{font-size:15px;margin:0}.badge{font-size:12px;padding:3px 10px;border-radius:20px;
|
|
290
|
+
background:var(--panel2);border:1px solid var(--border);white-space:nowrap}.badge b{color:var(--good)}
|
|
291
|
+
.board{max-width:1200px;margin:0 auto;padding:14px;display:flex;flex-direction:column;gap:14px}
|
|
292
|
+
.stats{display:flex;gap:12px}
|
|
293
|
+
.chip{flex:1;background:var(--panel);border:1px solid var(--border);border-radius:10px;padding:11px 14px;min-width:0}
|
|
294
|
+
.chip .k{font-size:10px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted)}
|
|
295
|
+
.chip .v{font-size:24px;font-weight:500;margin-top:2px;font-variant-numeric:tabular-nums}
|
|
296
|
+
.chip .v.good{color:var(--good)}
|
|
297
|
+
.card{background:var(--panel);border:1px solid var(--border);border-radius:10px;overflow:hidden}
|
|
298
|
+
.card-head{padding:10px 14px;font-size:11px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);
|
|
299
|
+
cursor:pointer;user-select:none;display:flex;align-items:center;gap:8px;border-bottom:1px solid var(--border)}
|
|
300
|
+
.card-head::before{content:'▾';font-size:10px;color:var(--muted)}
|
|
301
|
+
.card.collapsed .card-head::before{content:'▸'}
|
|
302
|
+
.card.collapsed .card-head{border-bottom:none}
|
|
303
|
+
.card.collapsed .card-body{display:none}
|
|
304
|
+
.card-head .ht{font-size:11px;color:#6b7686;margin-left:auto;text-transform:none;letter-spacing:0}
|
|
305
|
+
.card-body{padding:12px 14px}
|
|
306
|
+
.row2{display:flex;gap:14px}.row2 .card{flex:1;min-width:0}
|
|
307
|
+
.run{display:flex;height:max(460px,70vh)}
|
|
308
|
+
.sesslist{width:236px;flex:0 0 auto;overflow:auto;padding:8px;border-right:1px solid var(--border)}
|
|
309
|
+
.streampane{flex:1;min-width:0;display:flex;flex-direction:column;background:#0a0e14}
|
|
310
|
+
.streamhead{font-size:11px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);padding:9px 12px 6px}
|
|
311
|
+
.stream{flex:1;overflow:auto;padding:0 12px 12px}
|
|
312
|
+
.sess{padding:8px 10px;border-radius:8px;border:1px solid var(--border);cursor:pointer;background:var(--panel2);margin-bottom:8px}
|
|
313
|
+
.sess:last-child{margin-bottom:0}
|
|
314
|
+
.sess.sel{border-color:var(--accent)}.sess .role{font-weight:600;text-transform:capitalize}
|
|
315
|
+
.sess .role.worker{color:var(--accent)}.sess .role.director{color:var(--tool)}.sess .role.critic{color:var(--bad)}
|
|
316
|
+
.sess .meta{font-size:12px;color:var(--muted);margin-top:2px}
|
|
317
|
+
.ev{margin-bottom:10px;border-radius:8px;padding:8px 11px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow:auto}
|
|
318
|
+
.ev.user{background:#11233b;border:1px solid #1f3a5f}.ev.text{background:var(--panel2)}
|
|
319
|
+
.ev.tool{background:#1a1430;border:1px solid #3a2a5a;font-family:ui-monospace,Menlo,monospace;font-size:12.5px}
|
|
320
|
+
.ev.tool .tn{color:var(--tool);font-weight:600}.ev.result{background:#10161d;border:1px solid var(--border);color:var(--muted);font-family:ui-monospace,Menlo,monospace;font-size:12px}
|
|
321
|
+
.ev .lbl{font-size:10px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);display:block;margin-bottom:3px}
|
|
322
|
+
.ev .lbl .tm{text-transform:none;letter-spacing:0;color:#6b7686;font-weight:400;float:right}
|
|
323
|
+
.tablewrap{overflow-x:auto}
|
|
324
|
+
table{width:100%;border-collapse:collapse;font-size:12px}th,td{text-align:left;padding:5px 8px;border-bottom:1px solid var(--border);white-space:nowrap}
|
|
325
|
+
th{color:var(--muted);position:sticky;top:0;background:var(--panel)}td.num{text-align:right;font-variant-numeric:tabular-nums}
|
|
326
|
+
tr.best td{background:rgba(63,185,80,.13)}pre.dir{white-space:pre-wrap;font-size:12px;margin:0;
|
|
327
|
+
max-height:320px;overflow:auto}
|
|
328
|
+
.empty{color:var(--muted);font-style:italic;padding:16px;text-align:center}
|
|
329
|
+
.chartbox{position:relative;height:300px}
|
|
330
|
+
</style></head><body>
|
|
331
|
+
<header><h1>🧪 <span id="labName">leanlab</span></h1>
|
|
332
|
+
<span class="badge" id="objBadge">objective · —</span>
|
|
333
|
+
<span class="badge" id="usageBadge">session · —</span>
|
|
334
|
+
<span style="flex:1"></span><span class="badge" id="clock">—</span></header>
|
|
335
|
+
<div class="board">
|
|
336
|
+
<div class="stats" id="stats"></div>
|
|
337
|
+
<section class="card" data-panel="run"><div class="card-head" onclick="toggleFold('run')">Sessions & live stream</div>
|
|
338
|
+
<div class="card-body" style="padding:0"><div class="run">
|
|
339
|
+
<div class="sesslist" id="sessions"></div>
|
|
340
|
+
<div class="streampane"><div class="streamhead" id="streamTitle">Live stream</div>
|
|
341
|
+
<div class="stream" id="stream"><div class="empty">Waiting…</div></div></div>
|
|
342
|
+
</div></div></section>
|
|
343
|
+
<section class="card" data-panel="progress"><div class="card-head" onclick="toggleFold('progress')">Progress</div>
|
|
344
|
+
<div class="card-body"><div class="chartbox" id="chart"></div></div></section>
|
|
345
|
+
<section class="card" data-panel="results"><div class="card-head" onclick="toggleFold('results')">Results</div>
|
|
346
|
+
<div class="card-body"><div class="tablewrap" id="results"></div></div></section>
|
|
347
|
+
<div class="row2">
|
|
348
|
+
<section class="card" data-panel="critics"><div class="card-head" onclick="toggleFold('critics')">🔴 Critics</div>
|
|
349
|
+
<div class="card-body"><pre class="dir" id="critique">—</pre></div></section>
|
|
350
|
+
<section class="card" data-panel="director"><div class="card-head" onclick="toggleFold('director')">🧭 Director</div>
|
|
351
|
+
<div class="card-body"><pre class="dir" id="directions">—</pre></div></section></div></div>
|
|
352
|
+
<script>
|
|
353
|
+
let selected=null,follow=true,es=null,lastState=null,wantJump=false;
|
|
354
|
+
const $=id=>document.getElementById(id);
|
|
355
|
+
// Escape text for HTML element and attribute contexts; quotes are escaped too, because the result is also used inside title="…".
function esc(s){return String(s==null?'':s).replace(/[&<>"']/g,c=>({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c]))}
|
|
356
|
+
function fmtK(n){if(n==null)return'';return n>=1000?(n/1000).toFixed(n>=10000?0:1)+'k':''+n}
|
|
357
|
+
function shortModel(m){return m?m.replace(/^claude-/,'').replace(/-\d{8}$/,''):''}
|
|
358
|
+
function relTime(s){s=Math.round(s);if(s<10)return'just now';if(s<60)return'under a min ago';
|
|
359
|
+
if(s<3600)return Math.floor(s/60)+' min ago';if(s<86400)return Math.floor(s/3600)+' hr ago';return Math.floor(s/86400)+' day ago'}
|
|
360
|
+
function evMeta(ev){const t=ev.ts?new Date(ev.ts).toLocaleTimeString():'';const b=[];
|
|
361
|
+
if(ev.model)b.push(shortModel(ev.model));if(ev.in_tok!=null||ev.out_tok!=null)b.push(fmtK(ev.in_tok)+'→'+fmtK(ev.out_tok)+' tok');
|
|
362
|
+
if(ev.cost)b.push('$'+ev.cost.toFixed(4));if(ev.dur!=null)b.push('⏱ '+ev.dur+'s');
|
|
363
|
+
const r=(t||'')+(b.length?' · '+b.join(' · '):'');return r?`<span class="tm">${r}</span>`:''}
|
|
364
|
+
|
|
365
|
+
const DEFAULT_COLLAPSED={};
|
|
366
|
+
function foldState(){try{return JSON.parse(localStorage.getItem('leanlab.folds'))||{}}catch(e){return {}}}
|
|
367
|
+
function applyFolds(){const f=foldState();document.querySelectorAll('.card[data-panel]').forEach(c=>{
|
|
368
|
+
const k=c.dataset.panel;const col=(k in f)?f[k]:!!DEFAULT_COLLAPSED[k];c.classList.toggle('collapsed',col)})}
|
|
369
|
+
function toggleFold(k){const c=document.querySelector('.card[data-panel="'+k+'"]');if(!c)return;
|
|
370
|
+
const now=!c.classList.contains('collapsed');c.classList.toggle('collapsed',now);
|
|
371
|
+
const f=foldState();f[k]=now;localStorage.setItem('leanlab.folds',JSON.stringify(f))}
|
|
372
|
+
|
|
373
|
+
function connect(){if(es)es.close();es=new EventSource('/api/stream'+(selected?'?id='+encodeURIComponent(selected):''));
|
|
374
|
+
es.addEventListener('open',()=>$('clock').textContent='● streaming');
|
|
375
|
+
es.addEventListener('error',()=>$('clock').textContent='○ reconnecting…');
|
|
376
|
+
es.addEventListener('state',e=>onState(JSON.parse(e.data)));
|
|
377
|
+
es.addEventListener('session',e=>renderStream(JSON.parse(e.data)))}
|
|
378
|
+
|
|
379
|
+
function chip(k,v,good){return `<div class="chip"><div class="k">${k}</div><div class="v${good?' good':''}">${v}</div></div>`}
|
|
380
|
+
function renderStats(st){$('stats').innerHTML=
|
|
381
|
+
chip('best '+(st.direction==='min'?'↓':'↑'),st.best==null?'—':st.best,true)+
|
|
382
|
+
chip('latest',st.latest==null?'—':st.latest)+
|
|
383
|
+
chip('experiments',st.results.length)+
|
|
384
|
+
chip('total cost','$'+(st.total_cost||0).toFixed(2))}
|
|
385
|
+
|
|
386
|
+
function onState(st){lastState=st;
|
|
387
|
+
$('labName').textContent=st.lab;
|
|
388
|
+
$('objBadge').textContent='objective · '+st.direction+' '+st.metric;
|
|
389
|
+
if(follow&&st.active&&st.active!==selected){selected=st.active;wantJump=true;connect();return}
|
|
390
|
+
if(!selected&&st.active){selected=st.active;wantJump=true;connect();return}
|
|
391
|
+
renderStats(st);renderSessions();renderResults(st);renderChart(st);
|
|
392
|
+
$('critique').textContent=st.critique||'—';$('directions').textContent=st.directions||'—'}
|
|
393
|
+
|
|
394
|
+
function renderSessions(){if(!lastState)return;const now=Date.now()/1000;
|
|
395
|
+
$('sessions').innerHTML=lastState.sessions.map(s=>{const run=(now-s.mtime)<15;
|
|
396
|
+
const sel=s.id===selected?' sel':'';const art=s.artifact?' · '+s.artifact.replace('experiments/',''):'';
|
|
397
|
+
const cost=s.cost?' · $'+s.cost.toFixed(2):'';const when=run?'running':relTime(now-s.mtime);
|
|
398
|
+
return `<div class="sess${sel}" onclick="pick('${s.id}')"><div class="role ${s.role}">${s.role}${run?' ●':''}</div>
|
|
399
|
+
<div class="meta">${s.events} msgs${cost} · ${when}${art}</div></div>`}).join('')||'<div class="empty">No sessions</div>'}
|
|
400
|
+
|
|
401
|
+
function renderResults(st){const rows=st.results.slice();if(!rows.length){$('results').innerHTML='<div class="empty">No results yet</div>';return}
|
|
402
|
+
const skip=new Set(['tag','best_so_far','notes','ts','experiment_file']);
|
|
403
|
+
const cols=[];rows.forEach(r=>Object.keys(r).forEach(k=>{if(!skip.has(k)&&!cols.includes(k))cols.push(k)}));
|
|
404
|
+
const m=st.metric;rows.sort((a,b)=>{const x=parseFloat(a[m]),y=parseFloat(b[m]);
|
|
405
|
+
if(isNaN(x))return 1;if(isNaN(y))return -1;return st.direction==='min'?x-y:y-x});
|
|
406
|
+
let h='<table><tr><th>experiment</th>'+cols.map(c=>`<th>${esc(c)}</th>`).join('')+'</tr>';
|
|
407
|
+
for(const r of rows){const isB=parseFloat(r[m])===st.best;
|
|
408
|
+
h+=`<tr class="${isB?'best':''}"><td title="${esc(r.notes)}">${esc((r.experiment_file||'').replace('experiments/',''))}</td>`+
|
|
409
|
+
cols.map(c=>{let v=r[c];if(v!=null&&typeof v==='object')v=JSON.stringify(v);return `<td class="num">${esc(v)}</td>`}).join('')+'</tr>'}
|
|
410
|
+
$('results').innerHTML=h+'</table>'}
|
|
411
|
+
|
|
412
|
+
let chartObj=null;
|
|
413
|
+
function renderChart(st){const box=$('chart'),m=st.metric,rows=st.results||[];
|
|
414
|
+
const pts=rows.map(r=>parseFloat(r[m])),valid=pts.filter(v=>!isNaN(v));
|
|
415
|
+
if(typeof Chart==='undefined'){box.innerHTML='<div class="empty">chart library unavailable (offline?)</div>';chartObj=null;return}
|
|
416
|
+
if(!valid.length){box.innerHTML='<div class="empty">No data yet</div>';chartObj=null;return}
|
|
417
|
+
if(!box.querySelector('canvas')){box.innerHTML='<canvas></canvas>';chartObj=null}
|
|
418
|
+
const labels=rows.map((r,i)=>(r.experiment_file||('#'+(i+1))).replace('experiments/','').replace(/\.py$/,''));
|
|
419
|
+
let best=null;const bestLine=pts.map(v=>{if(!isNaN(v))best=best==null?v:(st.direction==='min'?Math.min(best,v):Math.max(best,v));return best});
|
|
420
|
+
const ptCol=pts.map(v=>v===st.best?'#3fb950':'#58a6ff'),ptR=pts.map(v=>v===st.best?6:4);
|
|
421
|
+
if(chartObj){const d=chartObj.data;d.labels=labels;d.datasets[0].data=pts;
|
|
422
|
+
d.datasets[0].pointBackgroundColor=ptCol;d.datasets[0].pointBorderColor=ptCol;d.datasets[0].pointRadius=ptR;
|
|
423
|
+
d.datasets[1].data=bestLine;chartObj.options.scales.y.title.text=st.direction+' '+m;chartObj.update();return}
|
|
424
|
+
chartObj=new Chart(box.querySelector('canvas').getContext('2d'),{type:'line',
|
|
425
|
+
data:{labels,datasets:[
|
|
426
|
+
{label:m,data:pts,borderColor:'#58a6ff',backgroundColor:'#58a6ff',pointBackgroundColor:ptCol,pointBorderColor:ptCol,pointRadius:ptR,pointHoverRadius:7,tension:.25,spanGaps:true},
|
|
427
|
+
{label:'best so far',data:bestLine,borderColor:'#3fb950',borderDash:[6,4],pointRadius:0,stepped:true}]},
|
|
428
|
+
options:{responsive:true,maintainAspectRatio:false,animation:{duration:300},interaction:{mode:'index',intersect:false},
|
|
429
|
+
plugins:{legend:{labels:{color:'#8b949e',boxWidth:12,usePointStyle:true}},
|
|
430
|
+
tooltip:{callbacks:{label:c=>c.dataset.label+': '+c.formattedValue}}},
|
|
431
|
+
scales:{x:{ticks:{color:'#8b949e',maxRotation:0,autoSkip:true,autoSkipPadding:12},grid:{color:'#222b36'}},
|
|
432
|
+
y:{ticks:{color:'#8b949e'},grid:{color:'#222b36'},title:{display:true,text:st.direction+' '+m,color:'#8b949e'}}}}})}
|
|
433
|
+
|
|
434
|
+
function renderStream(s){$('streamTitle').textContent=(s.meta.role||'session')+' · '+(s.id||'').slice(0,8)+(s.running?' · LIVE':'');
|
|
435
|
+
const box=$('stream');const atB=box.scrollHeight-box.scrollTop-box.clientHeight<80;
|
|
436
|
+
box.innerHTML=s.events.map(ev=>{const m=evMeta(ev);
|
|
437
|
+
if(ev.kind==='tool')return `<div class="ev tool"><span class="lbl">tool ${m}</span><span class="tn">${esc(ev.name)}</span> → ${esc(ev.text)}</div>`;
|
|
438
|
+
if(ev.kind==='result')return `<div class="ev result"><span class="lbl">result ${m}</span>${esc(ev.text)}</div>`;
|
|
439
|
+
if(ev.kind==='user')return `<div class="ev user"><span class="lbl">prompt ${m}</span>${esc(ev.text)}</div>`;
|
|
440
|
+
return `<div class="ev text"><span class="lbl">assistant ${m}</span>${esc(ev.text)}</div>`}).join('')||'<div class="empty">No messages yet…</div>';
|
|
441
|
+
const tt=s.totals;if(tt&&tt.turns)$('usageBadge').innerHTML='session · <b>'+fmtK(tt.in+tt.out)+'</b> tok · <b>$'+tt.cost.toFixed(3)+'</b>';
|
|
442
|
+
if(wantJump||atB){box.scrollTop=box.scrollHeight;wantJump=false}}
|
|
443
|
+
|
|
444
|
+
function pick(id){selected=id;follow=false;wantJump=true;connect()}
|
|
445
|
+
applyFolds();setInterval(renderSessions,20000);connect();
|
|
446
|
+
</script></body></html>"""
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
class Handler(BaseHTTPRequestHandler):
    """HTTP endpoints of the dashboard.

    Routes (GET only):

    * ``/``             the single-page UI (``PAGE``);
    * ``/api/stream``   server-sent events: ``state`` snapshots and, when an
      ``id`` query parameter is given, ``session`` payloads for that session;
    * ``/api/state``    one ``build_state()`` snapshot as JSON;
    * ``/api/session``  one ``session_payload(id)`` as JSON.

    Anything else answers 404. Request logging is silenced.
    """

    protocol_version = "HTTP/1.1"

    def handle(self):
        """Serve the connection, ignoring clients that disconnect abruptly."""
        try:
            super().handle()
        except (ConnectionResetError, BrokenPipeError, TimeoutError):
            pass

    def _send(self, body, ctype="application/json"):
        """Send a complete 200 response; ``body`` may be ``str`` or ``bytes``."""
        data = body.encode() if isinstance(body, str) else body
        self.send_response(200)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def _sse(self, event, payload):
        """Write one server-sent event named ``event`` with ``payload`` as its data line."""
        self.wfile.write(f"event: {event}\ndata: {payload}\n\n".encode())
        self.wfile.flush()

    def stream(self, sid):
        """Push ``state`` (and, for ``sid``, ``session``) events until the client leaves.

        Polls once per second. A ``state`` event is sent when the sessions,
        active id, best value, notes or number of results change; a
        ``session`` event when the transcript's mtime changes. A comment
        ping is sent every 15 seconds.
        """
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        # The body has no length and is not chunked, so it is delimited by
        # closing the connection; say so explicitly under HTTP/1.1.
        self.send_header("Connection", "close")
        self.end_headers()
        # From here on an error page can no longer be sent (see do_GET).
        self._stream_open = True
        last_sig = last_mt = None
        last_ping = 0.0
        try:
            while True:
                try:
                    st = build_state()
                except (OSError, ValueError):
                    # A file being rewritten or removed right now (e.g. a
                    # half-written lab.json): keep the stream, retry next tick.
                    st = None
                if st is not None:
                    sig = (tuple((s["id"], round(s["mtime"], 2), s["running"]) for s in st["sessions"]),
                           st["active"], st["best"], hash(st["directions"]), hash(st["critique"]),
                           len(st["results"]))
                    if sig != last_sig:
                        self._sse("state", json.dumps(st))
                        last_sig = sig
                if sid:
                    try:
                        p = transcript_dir() / f"{sid}.jsonl"
                        mt = p.stat().st_mtime if p.exists() else None
                        frame = json.dumps(session_payload(sid)) if mt != last_mt else None
                    except (OSError, ValueError):
                        frame = None  # transient read problem: retry next tick
                    if frame is not None:
                        self._sse("session", frame)
                        last_mt = mt
                if time.time() - last_ping > 15:
                    self.wfile.write(b": ping\n\n")
                    self.wfile.flush()
                    last_ping = time.time()
                time.sleep(1)
        except (BrokenPipeError, ConnectionResetError, OSError):
            # Only socket writes can raise here now: the client has gone away.
            return

    def do_GET(self):
        """Dispatch a GET request to the matching route."""
        self._stream_open = False
        route = urlparse(self.path)
        try:
            if route.path == "/":
                self._send(PAGE, "text/html; charset=utf-8")
            elif route.path == "/api/stream":
                self.stream(parse_qs(route.query).get("id", [""])[0])
            elif route.path == "/api/state":
                self._send(json.dumps(build_state()))
            elif route.path == "/api/session":
                self._send(json.dumps(session_payload(parse_qs(route.query).get("id", [""])[0])))
            else:
                self.send_error(404)
        except (BrokenPipeError, ConnectionResetError):
            pass
        except Exception as exc:  # noqa: BLE001
            if self._stream_open:
                # Headers and events are already on the wire; writing a 500
                # response now would corrupt the event stream. Just hang up.
                self.close_connection = True
                return
            try:
                self.send_error(500, str(exc))
            except OSError:
                pass

    def log_message(self, *a):
        """Suppress the default per-request logging to stderr."""
        pass
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
class QuietServer(ThreadingHTTPServer):
    """Threading HTTP server that stays silent when a client simply disconnects."""

    daemon_threads = True

    def handle_error(self, request, client_address):
        """Report a request error unless it is a reset, broken pipe or timeout."""
        _, err, _ = sys.exc_info()
        benign = (ConnectionResetError, BrokenPipeError, TimeoutError)
        if not isinstance(err, benign):
            super().handle_error(request, client_address)
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def main():
    """Command-line entry point: serve the dashboard for one lab.

    Options: ``--lab`` (required, lab directory), ``--port`` (default 8765)
    and ``--no-open`` (do not open a browser tab). Sets the module-level
    ``LAB``. Exits with status 1 when the lab has no ``lab.json`` or the
    port cannot be bound. Runs until interrupted with Ctrl-C.
    """
    global LAB
    p = argparse.ArgumentParser(description="leanlab dashboard")
    p.add_argument("--lab", required=True)
    p.add_argument("--port", type=int, default=8765)
    p.add_argument("--no-open", action="store_true")
    args = p.parse_args()
    LAB = Path(args.lab).resolve()
    if not (LAB / "lab.json").exists():
        print(f"ERROR: no lab.json in {LAB}", file=sys.stderr)
        sys.exit(1)
    # Bind before announcing or opening the browser, so a busy port is
    # reported cleanly instead of after a tab has been pointed at it.
    try:
        server = QuietServer(("127.0.0.1", args.port), Handler)
    except OSError as exc:
        print(f"ERROR: cannot listen on 127.0.0.1:{args.port}: {exc}", file=sys.stderr)
        sys.exit(1)
    # Use the bound port so "--port 0" (pick a free port) prints a usable URL.
    url = f"http://127.0.0.1:{server.server_address[1]}"
    print(f"leanlab monitor: {url} (lab: {LAB.name})")
    if not args.no_open:
        webbrowser.open(url)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nstopped.")
    finally:
        server.server_close()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Your job — experimenter
|
|
2
|
+
|
|
3
|
+
## Who you are
|
|
4
|
+
|
|
5
|
+
You are a **proactive, true researcher** running experiments in this lab. Read
|
|
6
|
+
`task.md` first — it states the goal, the data, and the **experiment contract**
|
|
7
|
+
(exactly what your file must define) and how you are judged.
|
|
8
|
+
|
|
9
|
+
Work like a real scientist:
|
|
10
|
+
- **Research the web** for state-of-the-art methods for this task.
|
|
11
|
+
- **Use any technique** — statistics, machine learning, anything that helps.
|
|
12
|
+
- **Install any library** you need with `uv add`, then `import` and use it.
|
|
13
|
+
- **Use skills and subagents** to explore sub-problems.
|
|
14
|
+
|
|
15
|
+
You have FULL tools and full permission. Boring repeats of ideas already in
|
|
16
|
+
memory are a **failure** — push the frontier.
|
|
17
|
+
|
|
18
|
+
You do **not** score experiments. A separate loop scores them after you finish.
|
|
19
|
+
You never run or read `evaluation.py`. You only run the validate command.
|
|
20
|
+
|
|
21
|
+
Each time you are launched fresh, do **exactly ONE** experiment, then stop.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## One experiment = these 4 steps
|
|
26
|
+
|
|
27
|
+
1. **Look at memory, the Director's notes, and the Critics' feedback** (all in
|
|
28
|
+
your prompt). Do not repeat an idea already tried; fix the flaws the Critics
|
|
29
|
+
named. Aim straight at the objective in `task.md`.
|
|
30
|
+
2. **Write ONE new idea** as a NEW file in the experiments folder (named in
|
|
31
|
+
`task.md`), e.g. `experiments/<tag>_<NN>.py`. One idea per file, following the
|
|
32
|
+
contract. Put a one-line docstring at the top.
|
|
33
|
+
3. **Validate it** until it passes — run the validate command shown in `task.md`
|
|
34
|
+
(it must print `VALID`). Fix and re-run until valid.
|
|
35
|
+
4. **Report and stop.** Your FINAL message must be **only** this JSON object —
|
|
36
|
+
no markdown, no code fence (the fence below only delimits the example):
|
|
37
|
+
```
|
|
38
|
+
{"experiment_file": "experiments/<your_file>.py", "valid": true, "notes": "one line"}
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## If asked to FIX
|
|
42
|
+
|
|
43
|
+
You may be relaunched in the same session: "You were working on X. It failed: …".
|
|
44
|
+
Open that file, fix the cause, re-validate until `VALID`, reply with the same
|
|
45
|
+
JSON object, then stop.
|
|
46
|
+
|
|
47
|
+
## Rules
|
|
48
|
+
|
|
49
|
+
- Create/edit files only inside the experiments folder.
|
|
50
|
+
- Never edit `results.jsonl`, `Director_Notes.md`, or `Critic_Feedback.md`.
|
|
51
|
+
- Never run or read `evaluation.py` — that is the loop's job.
|
|
52
|
+
- You may install libraries with `uv add` if your idea needs them.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# The Critics — Hypercritical red-team
|
|
2
|
+
|
|
3
|
+
## Who you are
|
|
4
|
+
|
|
5
|
+
You are a **brutally skeptical team of reviewers**. Your one job is to **find what
|
|
6
|
+
is wrong** with the experiments. You assume every experiment is broken until
|
|
7
|
+
proven otherwise. You are never polite, never vague — precise and evidence-based.
|
|
8
|
+
|
|
9
|
+
You judge the *result and the code*, not the *method*. Machine learning, exotic
|
|
10
|
+
libraries, and web-researched techniques are all welcome — attack them on the
|
|
11
|
+
evidence, never dismiss them for being fancy.
|
|
12
|
+
|
|
13
|
+
## What you hunt for
|
|
14
|
+
|
|
15
|
+
Read `task.md` to know the objective, then for the newest experiments check for:
|
|
16
|
+
- **Overfitting / leakage** — does it generalize, or memorize the training data?
|
|
17
|
+
Any peek at the held-out test set or target leakage?
|
|
18
|
+
- **Doesn't actually work** — fails the objective, or barely moves it.
|
|
19
|
+
- **Fragility** — one hyper-parameter nudge and it collapses.
|
|
20
|
+
- **Fake novelty** — basically a copy of an existing experiment with a new name.
|
|
21
|
+
- **Curve-fitting** — numbers hand-tuned to this exact dataset.
|
|
22
|
+
|
|
23
|
+
## What to write
|
|
24
|
+
|
|
25
|
+
Rewrite `Critic_Feedback.md` fresh each time. Keep it short and savage:
|
|
26
|
+
1. **Verdict on the latest experiment** — 2-4 lines, name the file and the exact
|
|
27
|
+
suspicious lines.
|
|
28
|
+
2. **Flaws across the lab** — recurring weaknesses.
|
|
29
|
+
3. **What the next experimenter must prove** — concrete guards/tests.
|
|
30
|
+
4. **Do-not-trust list** — results that look good but smell like luck/overfit.
|
|
31
|
+
|
|
32
|
+
## Rules
|
|
33
|
+
|
|
34
|
+
- Write **only** `Critic_Feedback.md`. Do not edit experiments, `results.jsonl`,
|
|
35
|
+
or any frozen file. Do not run `evaluation.py`.
|
|
36
|
+
- Every criticism names the file and the reason.
|
|
37
|
+
- Your feedback is injected into the next experimenter's prompt as "The team of
|
|
38
|
+
Critics said: …". Make it count.
|