leanlab 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
leanlab/core/loop.py ADDED
@@ -0,0 +1,374 @@
1
+ """leanlab — the generic experiment loop.
2
+
3
+ A LAB is a folder with:
4
+ - task.md the goal + the experiment contract (what to write)
5
+ - lab.json machine config: objective, commands, cadences
6
+ - evaluation.py the FROZEN judge — prints ONE line of JSON metrics
7
+ - validate.py a structural check the experimenter runs (no score)
8
+ - experiments/ where the Worker writes one Experiment file per loop
9
+ - results.jsonl the book: one JSON record per experiment
10
+ - CLAUDE.md / director.md / critic.md the agent specs
11
+
12
+ One LOOP = one experiment:
13
+ 1. Build the Worker prompt: task + memory (best experiments) + Director notes
14
+ + Critic feedback.
15
+ 2. Launch the Worker (a Claude session). It writes ONE Experiment file,
16
+ validates it, and returns JSON: {experiment_file, valid, notes}.
17
+ 3. Run the lab's evaluation.py on that file → parse the JSON metrics.
18
+ 4. If it errors, resume the same Worker session to fix it (up to max_fix_calls).
19
+ 5. Append one record to results.jsonl.
20
+ 6. Every critic_every / director_every loops, wake the Critic / Director.
21
+
22
+ The objective (e.g. minimize rmse, or maximize pnl) is read from lab.json, so the
23
+ same loop drives any kind of lab.
24
+
25
+ Run:
26
+ uv run python -m leanlab.core.loop --lab labs/house-prices --n 5
27
+ uv run python -m leanlab.core.loop --lab labs/house-prices --dry-run
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import json
34
+ import shlex
35
+ import shutil
36
+ import subprocess
37
+ import sys
38
+ from datetime import datetime, timezone
39
+ from importlib import resources
40
+ from pathlib import Path
41
+
42
+ from rich.console import Console
43
+
44
+ from .agents import ClaudeAgent, StructuredRunner
45
+
46
# Directory that contains this module.
CORE = Path(__file__).resolve().parent
# Shared Rich console used for all status output in this module.
console = Console()

# Runaway brake + memory size.
# TURNS_PER_RUN is handed to ClaudeAgent as max_turns; MEMORY_TOP_N caps how many
# top-scoring records build_memory() lists; FULL_PERMISSION_MODE is handed to
# ClaudeAgent as permission_mode.
TURNS_PER_RUN = 250
MEMORY_TOP_N = 5
FULL_PERMISSION_MODE = "bypassPermissions"
53
+
54
+
55
+ # --- lab config -------------------------------------------------------------
56
def load_lab(lab_dir):
    """Load ``lab.json`` from *lab_dir* and fill in the optional settings.

    Args:
        lab_dir: ``pathlib.Path`` of the lab folder.

    Returns:
        The parsed config dict, with defaults applied for ``experiments_dir``,
        ``results_file``, ``director_every``, ``critic_every``, ``max_fix_calls``
        and ``objective.direction``.

    Raises:
        KeyError: if the config has no ``objective`` entry.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    cfg = json.loads((lab_dir / "lab.json").read_text(encoding="utf-8"))
    defaults = {
        "experiments_dir": "experiments",
        "results_file": "results.jsonl",
        "director_every": 5,
        "critic_every": 5,
        "max_fix_calls": 3,
    }
    for key, value in defaults.items():
        cfg.setdefault(key, value)
    obj = cfg["objective"]
    obj.setdefault("direction", "max")
    # The rest of the module compares against the exact string "min"; normalise
    # case/whitespace so "MIN" or " min " is not silently treated as "max".
    if isinstance(obj["direction"], str):
        obj["direction"] = obj["direction"].strip().lower()
    return cfg
66
+
67
+
68
def metric_name(cfg):
    """Return the name of the objective metric configured for the lab."""
    objective = cfg["objective"]
    return objective["metric"]
70
+
71
+
72
def direction(cfg):
    """Return the configured optimisation direction of the objective."""
    objective = cfg["objective"]
    return objective["direction"]
74
+
75
+
76
+ # --- results.jsonl ----------------------------------------------------------
77
def read_results(lab_dir, cfg):
    """Read the results book and return its records as a list of dicts.

    Args:
        lab_dir: ``pathlib.Path`` of the lab folder.
        cfg: lab config; ``cfg["results_file"]`` names the JSONL file.

    Returns:
        One dict per parseable line, in file order. A missing file yields ``[]``.
        Blank lines, lines that are not valid JSON, and JSON values that are not
        objects are skipped (best-effort read of a hand-editable file).
    """
    path = lab_dir / cfg["results_file"]
    if not path.exists():
        return []
    records = []
    for raw in path.read_text(encoding="utf-8").splitlines():
        raw = raw.strip()
        if not raw:
            continue
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            # Deliberately tolerant: one corrupt line must not lose the book.
            continue
        # Every consumer calls .get()/.items() on a row, so only objects count.
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
91
+
92
+
93
def append_result(lab_dir, cfg, record):
    """Append *record* as one JSON line to the lab's results file."""
    target = lab_dir / cfg["results_file"]
    with target.open("a") as book:
        print(json.dumps(record), file=book)
97
+
98
+
99
+ def _metric_val(rec, metric):
100
+ try:
101
+ return float(rec.get(metric))
102
+ except (TypeError, ValueError):
103
+ return None
104
+
105
+
106
def best_value(rows, cfg):
    """Return the best objective value among *rows*, or ``None`` if none is scored.

    "Best" is the minimum when the configured direction is ``"min"`` and the
    maximum otherwise.
    """
    metric = metric_name(cfg)
    pick = min if direction(cfg) == "min" else max
    scores = []
    for row in rows:
        score = _metric_val(row, metric)
        if score is not None:
            scores.append(score)
    return pick(scores) if scores else None
112
+
113
+
114
def is_better(value, best, d):
    """Tell whether *value* strictly beats *best* for direction *d*.

    With no previous best (``None``) any value counts as better. ``"min"``
    means lower is better; every other direction means higher is better.
    """
    if best is None:
        return True
    if d == "min":
        return value < best
    return value > best
118
+
119
+
120
def build_memory(rows, cfg):
    """Render the MEMORY section of the worker prompt.

    Lists up to ``MEMORY_TOP_N`` scored records, best first according to the
    lab's objective. Each line shows the experiment file, every record field
    other than the bookkeeping ones, and the notes. When nothing has been
    scored yet a fixed "you are the first" line is returned instead.
    """
    metric, d = metric_name(cfg), direction(cfg)
    scored = [row for row in rows if _metric_val(row, metric) is not None]
    if not scored:
        return "MEMORY: no experiments scored yet. You are the first."

    ranked = sorted(scored, key=lambda row: _metric_val(row, metric), reverse=(d == "max"))
    hidden = ("tag", "experiment_file", "best_so_far", "notes", "ts")
    lines = [f"MEMORY — best experiments so far (objective: {d} {metric}; do not repeat these):"]
    for row in ranked[:MEMORY_TOP_N]:
        shown = [f"{k}={v}" for k, v in row.items() if k not in hidden]
        extras = " ".join(shown)
        lines.append(f"- {row.get('experiment_file','?')} :: {extras} :: {row.get('notes','')}")
    return "\n".join(lines)
134
+
135
+
136
+ # --- injected advisor files -------------------------------------------------
137
+ def _inject(lab_dir, filename, header):
138
+ path = lab_dir / filename
139
+ if not path.exists():
140
+ return ""
141
+ text = path.read_text().strip()
142
+ return f"{header}\n{text}" if text else ""
143
+
144
+
145
def build_directions(lab_dir):
    """Return the Director's notes as a prompt section ("" if there are none)."""
    header = "DIRECTOR'S GUIDANCE (your chief scientist — follow it):"
    return _inject(lab_dir, "Director_Notes.md", header)
148
+
149
+
150
def build_critique(lab_dir):
    """Return the Critic's feedback as a prompt section ("" if there is none)."""
    header = "THE TEAM OF CRITICS SAID (fix these flaws — do not repeat them):"
    return _inject(lab_dir, "Critic_Feedback.md", header)
153
+
154
+
155
+ # --- prompts ----------------------------------------------------------------
156
+ # The fixed agent specs live in the leanlab package (not the user's lab) and are
157
+ # injected into each prompt at runtime. Each prompt's FIRST line names the role so
158
+ # the dashboard can identify the session.
159
def _spec(name):
    """Return the stripped text of the packaged agent spec called *name*.

    The file is looked up under ``templates/agents`` inside the ``leanlab``
    package and decoded as UTF-8 regardless of the platform's locale encoding.
    """
    template = resources.files("leanlab") / "templates" / "agents" / name
    return template.read_text(encoding="utf-8").strip()
161
+
162
+
163
# First paragraph of the Worker prompt; names the role on the first line.
WORKER_INTRO = ("You are the WORKER (experimenter) on this lab. Read task.md in this lab "
                "and the spec below, then follow them.")
# Instructions appended after the Worker spec: write one experiment file, validate
# it, and finish with a JSON object that carries the "experiment_file" key.
WORKER_ACTION = (
    "Do EXACTLY ONE experiment: invent one new idea and write it as ONE new file in the "
    "experiments folder (a new file, do not overwrite sample.py). Give it a one-line "
    "docstring. Validate it with the lab's validate command until it prints VALID. Do NOT "
    "run or read evaluation.py, and do NOT edit results.jsonl. Be a proactive, true "
    "researcher: research the web, use any method, install any library you need (uv add). "
    "Your final message must be ONLY this JSON object: "
    '{"experiment_file": "experiments/<file>.py", "valid": true, "notes": "one line"}'
)
# First paragraph of the Director prompt; names the role on the first line.
DIRECTOR_INTRO = ("You are the DIRECTOR (chief scientist) of this lab. Read task.md in this "
                  "lab and the spec below, then follow them.")
# Instructions appended after the Director spec: rewrite Director_Notes.md only.
DIRECTOR_ACTION = (
    "Study results.jsonl and the experiment files, then rewrite Director_Notes.md with "
    "sharp, concrete, ambitious guidance for the next experiments. Write ONLY that one "
    "file, then stop."
)
# First paragraph of the Critic prompt; names the role on the first line.
CRITIC_INTRO = ("You are the CRITIC (hypercritical red-team) of this lab. Read task.md in "
                "this lab and the spec below, then follow it.")
# Instructions appended after the Critic spec: rewrite Critic_Feedback.md only.
CRITIC_ACTION = (
    "Review the newest experiments against results.jsonl, hunt for every flaw (overfitting, "
    "leakage, fragility, fake novelty), and rewrite Critic_Feedback.md with blunt, specific "
    "criticism. Write ONLY that one file, then stop."
)
188
+
189
+
190
def build_director_prompt():
    """Assemble the Director prompt: intro, packaged spec, then the action."""
    sections = (DIRECTOR_INTRO, _spec("director.md"), DIRECTOR_ACTION)
    return "\n\n".join(sections)
192
+
193
+
194
def build_critic_prompt():
    """Assemble the Critic prompt: intro, packaged spec, then the action."""
    sections = (CRITIC_INTRO, _spec("critic.md"), CRITIC_ACTION)
    return "\n\n".join(sections)
196
+
197
+
198
+ # --- launching agents -------------------------------------------------------
199
def make_runner(lab_dir):
    """Build the default runner: a ``ClaudeAgent`` wrapped in a ``StructuredRunner``.

    The agent is created for *lab_dir* with ``TURNS_PER_RUN`` as its turn limit
    and ``FULL_PERMISSION_MODE`` as its permission mode. This factory is the
    single place to change if a different agent backend should drive the loop.
    """
    transport = ClaudeAgent(
        lab_dir,
        max_turns=TURNS_PER_RUN,
        permission_mode=FULL_PERMISSION_MODE,
    )
    return StructuredRunner(transport)
208
+
209
+
210
+ # --- evaluation -------------------------------------------------------------
211
def run_cmd_template(template, lab_dir, file_rel):
    """Run a command template inside *lab_dir* and return the completed process.

    The template is split shell-style into arguments first, and only then is
    every ``{file}`` placeholder replaced by *file_rel*, so the file name always
    stays inside a single argument. Output is captured as text.
    """
    argv = []
    for token in shlex.split(template):
        argv.append(token.replace("{file}", file_rel))
    return subprocess.run(argv, cwd=lab_dir, capture_output=True, text=True)
214
+
215
+
216
def evaluate(lab_dir, cfg, file_rel):
    """Run the lab's ``eval_cmd`` on *file_rel*.

    Returns:
        ``(metrics, raw_output)``. ``raw_output`` is stdout followed by stderr
        (when stderr is non-empty), stripped. ``metrics`` is the last line of
        that output that parses as a JSON object, or ``None`` if no line does.
    """
    proc = run_cmd_template(cfg["eval_cmd"], lab_dir, file_rel)
    combined = proc.stdout
    if proc.stderr:
        combined += "\n" + proc.stderr
    combined = combined.strip()

    # Walk backwards so the final JSON object printed wins.
    for candidate in reversed(combined.splitlines()):
        try:
            parsed = json.loads(candidate.strip())
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed, combined
    return None, combined
230
+
231
+
232
+ # --- experiment-file resolution ---------------------------------------------
233
def experiment_files(lab_dir, cfg):
    """Return the set of ``*.py`` paths in the experiments folder, minus ``sample.py``."""
    folder = lab_dir / cfg["experiments_dir"]
    found = set()
    for candidate in folder.glob("*.py"):
        if candidate.name == "sample.py":
            continue
        found.add(candidate)
    return found
236
+
237
+
238
def resolve_experiment(lab_dir, cfg, report, new_files):
    """Decide which experiment file the worker produced.

    Args:
        lab_dir: ``pathlib.Path`` of the lab folder.
        cfg: lab config (unused here; kept for a uniform signature).
        report: the worker's parsed JSON reply, or ``None``.
        new_files: paths that appeared in the experiments folder during the run.

    Returns:
        The resolved path named by ``report["experiment_file"]`` when it is an
        existing regular file inside the lab; otherwise the last of *new_files*
        in sorted order; otherwise ``None``.
    """
    root = lab_dir.resolve()
    reported = report.get("experiment_file") if isinstance(report, dict) else None
    if isinstance(reported, str) and reported:
        cand = (root / reported).resolve()
        try:
            # The caller later takes the path relative to the lab, so a report
            # that escapes the lab ("../x.py", an absolute path) must not win.
            cand.relative_to(root)
        except ValueError:
            inside_lab = False
        else:
            inside_lab = True
        if inside_lab and cand.is_file():
            return cand
    return sorted(new_files)[-1] if new_files else None
244
+
245
+
246
+ # --- the loop ---------------------------------------------------------------
247
def build_worker_prompt(lab_dir, cfg):
    """Assemble the full Worker prompt for the next experiment.

    Sections, separated by blank lines: intro, packaged Worker spec, action,
    memory of the best experiments, then the Director's notes and the Critic's
    feedback when those files have content.
    """
    history = read_results(lab_dir, cfg)
    sections = [WORKER_INTRO, _spec("CLAUDE.md"), WORKER_ACTION, build_memory(history, cfg)]
    advisors = (build_directions(lab_dir), build_critique(lab_dir))
    sections.extend(note for note in advisors if note)
    return "\n\n".join(sections)
254
+
255
+
256
def score_with_fixes(lab_dir, cfg, tag, exp, session_id, runner):
    """Score one experiment, letting the worker repair it if the judge fails.

    The judge is run on *exp*. While it yields no metrics, the worker session
    *session_id* is resumed with the judge's output and asked to fix the file,
    at most ``cfg["max_fix_calls"]`` times. Exactly one record is appended to
    the results file: the metrics on success, or an ``invalid: …`` record with
    a null objective value when the attempts are exhausted or there is no
    session to resume.

    Args:
        lab_dir: ``pathlib.Path`` of the lab folder.
        cfg: lab config.
        tag: label stored with the record.
        exp: path of the experiment file; must lie inside *lab_dir*.
        session_id: worker session to resume for fixes (falsy means no fixes).
        runner: object providing ``run_structured(prompt, keys, session=...)``.
    """
    rel = str(exp.relative_to(lab_dir))
    metric = metric_name(cfg)
    max_fixes = cfg["max_fix_calls"]
    metrics, out = evaluate(lab_dir, cfg, rel)
    attempts = 0
    while metrics is None:
        if attempts >= max_fixes or not session_id:
            tail = " ".join(out.split())
            # Report the attempts actually made; with no session that is zero.
            why = "" if session_id else " (no worker session to resume)"
            console.print(f"[red]✗ {exp.name} — invalid after {attempts} fix attempt(s){why}[/red]")
            # Judge output is arbitrary text: print it without markup parsing so
            # stray "[/x]"-style fragments cannot raise a Rich markup error.
            console.print(tail[-200:], style="dim", markup=False)
            append_result(lab_dir, cfg, {
                "tag": tag, "experiment_file": rel, metric: None,
                "best_so_far": False, "notes": f"invalid: {tail[-160:]}",
                "ts": datetime.now(timezone.utc).isoformat(),
            })
            return
        attempts += 1
        console.print(f"[yellow]⟳ evaluation failed — asking the worker to fix it "
                      f"({attempts}/{max_fixes})…[/yellow]")
        fix_prompt = (
            f"You were working on {rel}. The judge ran on it and FAILED:\n\n{out}\n\n"
            "Fix the experiment file so it runs cleanly. Validate it. Do not run "
            "evaluation.py. Reply with the same JSON object, then stop."
        )
        with console.status("[yellow]Worker is fixing the experiment…", spinner="dots"):
            # Keep following the session id the runner hands back.
            session_id = runner.run_structured(fix_prompt, ["experiment_file"], session=session_id).session_id
        metrics, out = evaluate(lab_dir, cfg, rel)

    # Compare against the book as it stands before this record is added.
    rows = read_results(lab_dir, cfg)
    prior = best_value(rows, cfg)
    val = _metric_val(metrics, metric)
    best = val is not None and is_better(val, prior, direction(cfg))
    append_result(lab_dir, cfg, {
        "tag": tag, "experiment_file": rel, **metrics,
        "best_so_far": best, "notes": first_docstring_line(exp),
        "ts": datetime.now(timezone.utc).isoformat(),
    })
    flag = " [bold yellow]⭐ new best![/bold yellow]" if best else ""
    console.print(f"[green]✓ {exp.name}[/green] · {metric}=[bold]{val}[/bold]{flag}")
293
+
294
+
295
def first_docstring_line(path):
    """Return the first line of the module docstring of the file at *path*.

    The file is parsed as Python, so a docstring that follows a shebang or
    leading comments is found. If the file does not parse, a triple-quoted
    string at the very start of the file is used instead. Returns ``""`` when
    there is no docstring or it is empty.
    """
    import ast
    import re

    # Undecodable bytes must not abort scoring just to extract a note.
    text = path.read_text(encoding="utf-8", errors="replace")
    try:
        doc = ast.get_docstring(ast.parse(text))
    except (SyntaxError, ValueError):
        # Broken experiment file: fall back to a textual match at file start.
        m = re.search(r'^\s*["\']{3}(.*?)["\']{3}', text, re.DOTALL)
        doc = m.group(1) if m else None
    body = (doc or "").strip()
    return body.splitlines()[0].strip() if body else ""
302
+
303
+
304
def _cadence_due(loop, every):
    """Return True when *loop* is a multiple of *every*; 0/None disables the cadence."""
    return bool(every) and loop % every == 0


def main():
    """Command-line entry point: run a lab for ``--n`` experiments.

    Each iteration launches the Worker, resolves the experiment file it wrote,
    scores it (with fix attempts), and wakes the Critic / Director on their
    configured cadences. ``--dry-run`` prints the Worker prompt and the
    configured commands without launching anything. Exits with status 1 when
    ``lab.json`` is missing or the ``claude`` executable is not on PATH.
    Ctrl-C stops the loop and still prints the summary.
    """
    p = argparse.ArgumentParser(description="Run a leanlab for N experiments.")
    p.add_argument("--lab", required=True, help="path to the lab folder")
    p.add_argument("--n", type=int, default=5, help="how many experiments to run")
    p.add_argument("--dry-run", action="store_true", help="print the worker prompt, run nothing")
    args = p.parse_args()

    lab_dir = Path(args.lab).resolve()
    if not (lab_dir / "lab.json").exists():
        print(f"ERROR: no lab.json in {lab_dir}", file=sys.stderr)
        sys.exit(1)
    cfg = load_lab(lab_dir)

    if args.dry_run:
        print(f"Lab: {cfg['name']} — objective: {direction(cfg)} {metric_name(cfg)}\n")
        print("# WORKER PROMPT\n" + build_worker_prompt(lab_dir, cfg))
        print(f"\n# eval: {cfg['eval_cmd']} | validate: {cfg['validate_cmd']}")
        print(f"# Critic every {cfg['critic_every']} loops, Director every {cfg['director_every']}.")
        return

    if shutil.which("claude") is None:
        print("ERROR: `claude` not found on PATH.", file=sys.stderr)
        sys.exit(1)

    tag = datetime.now(timezone.utc).strftime("%b%d").lower()
    # Record count before the run, used for the "new record(s)" summary.
    start = len(read_results(lab_dir, cfg))
    runner = make_runner(lab_dir)
    console.rule(f"[bold]🧪 {cfg['name']}[/bold] · {direction(cfg)} {metric_name(cfg)} · "
                 f"{args.n} experiment(s)")

    try:
        for loop in range(1, args.n + 1):
            console.rule(f"[bold cyan]Experiment {loop}/{args.n}", style="cyan")
            # Snapshot the folder so files created by this run can be detected.
            before = experiment_files(lab_dir, cfg)
            with console.status("[cyan]Worker is researching and writing an experiment…", spinner="dots"):
                result = runner.run_structured(build_worker_prompt(lab_dir, cfg), ["experiment_file"])
            scored = False
            if not result.ok:
                console.print("[yellow]⚠ worker produced no valid result — moving on.[/yellow]")
            else:
                new = experiment_files(lab_dir, cfg) - before
                exp = resolve_experiment(lab_dir, cfg, result.data, new)
                if exp is None:
                    console.print("[yellow]⚠ no experiment file produced — moving on.[/yellow]")
                else:
                    console.print(f"[dim]Worker wrote [bold]{exp.name}[/bold] — scoring…[/dim]")
                    score_with_fixes(lab_dir, cfg, tag, exp, result.session_id, runner)
                    scored = True

            # The Critic only runs after a loop that actually scored something;
            # the Director runs on its cadence regardless.
            if scored and _cadence_due(loop, cfg["critic_every"]):
                console.rule("[magenta]Critic review", style="magenta")
                with console.status("[magenta]Critic is red-teaming the latest experiments…", spinner="dots"):
                    runner.run_plain(build_critic_prompt())
                console.print("[magenta]✓ Critic_Feedback.md updated.[/magenta]")
            if _cadence_due(loop, cfg["director_every"]):
                console.rule("[blue]Director review", style="blue")
                with console.status("[blue]Director is rewriting the research plan…", spinner="dots"):
                    runner.run_plain(build_director_prompt())
                console.print("[blue]✓ Director_Notes.md updated.[/blue]")
    except KeyboardInterrupt:
        console.print("\n[yellow]Stopped.[/yellow]")

    # Read the book once for both summary figures.
    rows = read_results(lab_dir, cfg)
    n_new = len(rows) - start
    best = best_value(rows, cfg)
    console.rule("[green]Done", style="green")
    console.print(f"[green]✓ {n_new} new record(s)[/green] · best {metric_name(cfg)}=[bold]{best}[/bold]"
                  f" · watch: [bold]leanlab serve {cfg['name']}[/bold]")
371
+
372
+
373
# Run the command-line interface only when this module is executed as the
# entry point, not when it is imported.
if __name__ == "__main__":
    main()