leanlab 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- leanlab/__init__.py +1 -0
- leanlab/cli.py +315 -0
- leanlab/core/__init__.py +1 -0
- leanlab/core/agents/__init__.py +10 -0
- leanlab/core/agents/claude.py +38 -0
- leanlab/core/agents/port.py +49 -0
- leanlab/core/agents/protocol.py +64 -0
- leanlab/core/coding/__init__.py +1 -0
- leanlab/core/coding/board.py +335 -0
- leanlab/core/coding/board_dist/assets/index-BBCkNArL.css +1 -0
- leanlab/core/coding/board_dist/assets/index-CNGMDAuO.js +40 -0
- leanlab/core/coding/board_dist/index.html +13 -0
- leanlab/core/coding/engineer.py +304 -0
- leanlab/core/coding/gate.py +63 -0
- leanlab/core/coding/personas.py +23 -0
- leanlab/core/coding/playbook.py +47 -0
- leanlab/core/coding/spec.py +232 -0
- leanlab/core/doctor.py +220 -0
- leanlab/core/init.py +219 -0
- leanlab/core/loop.py +374 -0
- leanlab/core/monitor.py +553 -0
- leanlab/templates/agents/CLAUDE.md +52 -0
- leanlab/templates/agents/critic.md +38 -0
- leanlab/templates/agents/director.md +37 -0
- leanlab/templates/agents/engineer.md +12 -0
- leanlab/templates/agents/reviewer.md +34 -0
- leanlab/templates/agents/techlead.md +7 -0
- leanlab/templates/skill/SKILL.md +99 -0
- leanlab-0.2.1.dist-info/METADATA +273 -0
- leanlab-0.2.1.dist-info/RECORD +33 -0
- leanlab-0.2.1.dist-info/WHEEL +4 -0
- leanlab-0.2.1.dist-info/entry_points.txt +2 -0
- leanlab-0.2.1.dist-info/licenses/LICENSE +21 -0
leanlab/core/doctor.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Preflight 'doctor' — verify a lab is wired correctly before running.
|
|
2
|
+
|
|
3
|
+
The expensive bugs are silent wiring mismatches: lab.json names metric "FPS" but
|
|
4
|
+
evaluation.py prints "score"; or the command passes --experiment but the script
|
|
5
|
+
reads a positional arg. `check_lab` catches those *cheaply* by probing the
|
|
6
|
+
evaluator with a sentinel missing file — the evaluator should fail fast on the
|
|
7
|
+
missing file (before any render / Claude call), and from its output we can verify
|
|
8
|
+
both the argument wiring and the metric key.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import shlex
|
|
15
|
+
import shutil
|
|
16
|
+
import subprocess
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from importlib import resources
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
# Status values a Check can carry; summarize() also uses them as its tally keys.
OK, WARN, FAIL = "ok", "warn", "fail"
|
|
22
|
+
# Path substituted for {file} when probing eval/validate commands; per the module
# docstring it is meant to be a file that does not exist.
_SENTINEL = "experiments/__leanlab_preflight__.py"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class Check:
    """Outcome of a single preflight check.

    `name` labels what was checked, `status` is one of the OK / WARN / FAIL
    strings, and `message` is the human-readable detail shown to the operator.
    """

    name: str
    status: str  # ok | warn | fail
    message: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _run(cmd_template, lab_dir, file_rel):
|
|
33
|
+
parts = [p.replace("{file}", file_rel) for p in shlex.split(cmd_template)]
|
|
34
|
+
try:
|
|
35
|
+
proc = subprocess.run(parts, cwd=lab_dir, capture_output=True, text=True, timeout=120)
|
|
36
|
+
except Exception as e: # noqa: BLE001
|
|
37
|
+
return None, f"could not run `{cmd_template}`: {e}"
|
|
38
|
+
return proc, (proc.stdout + ("\n" + proc.stderr if proc.stderr else "")).strip()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _last_json(out):
|
|
42
|
+
for line in reversed(out.splitlines()):
|
|
43
|
+
try:
|
|
44
|
+
o = json.loads(line.strip())
|
|
45
|
+
except (ValueError, json.JSONDecodeError):
|
|
46
|
+
continue
|
|
47
|
+
if isinstance(o, dict):
|
|
48
|
+
return o
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _probe_args(name, cmd_template, lab_dir, checks):
    """Probe a command with the sentinel path and append one Check to `checks`.

    The command is run with `{file}` replaced by the sentinel path. If its
    output mentions `--experiment` but never the sentinel, the script is taken
    to have received the flag rather than the path, and a FAIL is recorded;
    otherwise an OK is recorded.

    Returns the probe output, or None when the command could not be run (in
    which case a FAIL carrying the run error is recorded).
    """
    proc, out = _run(cmd_template, lab_dir, _SENTINEL)
    if proc is None:
        checks.append(Check(name, FAIL, out))
        return None

    flag_echoed = "--experiment" in out
    path_echoed = _SENTINEL in out or "__leanlab_preflight__" in out
    if flag_echoed and not path_echoed:
        result = Check(name, FAIL, "the script is not reading the file argument (it got the "
                                   "flag instead) — fix its arg parsing or the command in lab.json")
    else:
        result = Check(name, OK, "the file argument reaches the script")
    checks.append(result)
    return out
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def check_lab(lab_dir) -> list[Check]:
    """Run every preflight check for the lab at `lab_dir`.

    Checks, in order: lab.json required keys and objective; presence of
    task.md / evaluation.py / validate.py and the experiments directory; the
    `claude` CLI on PATH; packaged agent specs; that the agent prompts build;
    the Director/Critic note files (WARN only when absent); and finally the
    eval/validate wiring probes against the sentinel file.

    Returns:
        The list of Check results. If lab.json is missing, unparsable, or not
        a JSON object, a single FAIL check is returned and nothing else runs.
    """
    lab = Path(lab_dir)
    checks: list[Check] = []

    cfgpath = lab / "lab.json"
    if not cfgpath.exists():
        return [Check("lab.json", FAIL, "missing — is this a lab folder?")]
    try:
        cfg = json.loads(cfgpath.read_text())
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        return [Check("lab.json", FAIL, f"invalid JSON: {e}")]
    if not isinstance(cfg, dict):
        # Everything below calls cfg.get(); a top-level list/string/number
        # would otherwise crash the doctor with AttributeError.
        return [Check("lab.json", FAIL,
                      f"invalid JSON: expected an object, got {type(cfg).__name__}")]

    obj = cfg.get("objective")
    if not isinstance(obj, dict):
        # A non-object objective is treated like a missing one (reported below).
        obj = {}
    metric, direction = obj.get("metric"), obj.get("direction")
    missing = [k for k in ("eval_cmd", "validate_cmd", "experiments_dir", "results_file")
               if not cfg.get(k)]
    if missing or not metric or not direction:
        checks.append(Check("lab.json", FAIL,
                            f"missing {missing or ''} objective.metric/direction={metric}/{direction}"))
    else:
        checks.append(Check("lab.json", OK, f"objective {direction} {metric}"))

    for f in ("task.md", "evaluation.py", "validate.py"):
        present = (lab / f).exists()
        checks.append(Check(f, OK if present else FAIL, "present" if present else "missing"))

    # Fall back to the default name when the key is absent, null, empty or not
    # a string: `lab / None` raises TypeError and `lab / ""` is the lab itself.
    exp_name = cfg.get("experiments_dir")
    if not isinstance(exp_name, str) or not exp_name:
        exp_name = "experiments"
    exp_present = (lab / exp_name).is_dir()
    checks.append(Check("experiments/", OK if exp_present else FAIL,
                        "present" if exp_present else "missing"))

    has_claude = bool(shutil.which("claude"))
    checks.append(Check("claude CLI", OK if has_claude else FAIL,
                        "on PATH" if has_claude
                        else "not found — workers/director/critic/judge cannot run"))

    try:
        base = resources.files("leanlab") / "templates" / "agents"
        have = all((base / s).is_file() for s in ("CLAUDE.md", "director.md", "critic.md"))
        checks.append(Check("agent specs", OK if have else FAIL,
                            "resolved from package" if have else "missing from package"))
    except Exception as e:  # noqa: BLE001
        checks.append(Check("agent specs", FAIL, f"cannot resolve: {e}"))

    # Actually build the worker/director/critic prompts for this lab — the real wiring test
    # (injects the specs and reads the lab's memory + Director/Critic notes).
    try:
        from .loop import build_worker_prompt, build_director_prompt, build_critic_prompt
        build_worker_prompt(lab, cfg)
        build_director_prompt()
        build_critic_prompt()
        checks.append(Check("agent prompts", OK, "worker / director / critic prompts build"))
    except Exception as e:  # noqa: BLE001
        checks.append(Check("agent prompts", FAIL, f"cannot build agent prompts: {e}"))

    for f in ("Director_Notes.md", "Critic_Feedback.md"):
        present = (lab / f).exists()
        checks.append(Check(f, OK if present else WARN,
                            "present" if present else "missing (created on first review)"))

    # The wiring probes — cheap, run the eval/validate on a missing sentinel file.
    if (lab / "evaluation.py").exists() and cfg.get("eval_cmd"):
        out = _probe_args("eval args", cfg["eval_cmd"], lab, checks)
        if out is not None and metric:
            verdict = _last_json(out)
            if verdict is None:
                checks.append(Check("eval metric", WARN,
                                    "evaluator emitted no JSON for a missing file — can't verify the metric key"))
            elif metric not in verdict:
                checks.append(Check("eval metric", FAIL,
                                    f'evaluator output has no "{metric}" key (lab.json objective); '
                                    f"it printed keys {list(verdict)[:6]}"))
            else:
                checks.append(Check("eval metric", OK, f'emits the objective key "{metric}"'))
    if (lab / "validate.py").exists() and cfg.get("validate_cmd"):
        _probe_args("validate args", cfg["validate_cmd"], lab, checks)

    return checks
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def summarize(checks):
    """Count the checks per status; returns a dict keyed by OK, WARN and FAIL."""
    counts = {}
    for status in (OK, WARN, FAIL):
        counts[status] = sum(1 for c in checks if c.status == status)
    return counts
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def ok(checks):
    """Return True when no check has status FAIL (WARN does not count as failure)."""
    for check in checks:
        if check.status == FAIL:
            return False
    return True
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# --- reporting --------------------------------------------------------------
|
|
151
|
+
# Rich-markup glyph printed in front of each check line, keyed by check status.
_SYMBOL = {OK: "[green]✓[/green]", WARN: "[yellow]⚠[/yellow]", FAIL: "[red]✗[/red]"}
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class RichReport:
    """Default terminal reporter for check/fix, rendering through a rich Console."""

    def __init__(self):
        # rich is imported here rather than at module import time.
        from rich.console import Console

        self.console = Console()

    def report(self, checks):
        """Print one line per check, then a blank line and the ok/warn/fail totals."""
        emit = self.console.print
        for check in checks:
            glyph = _SYMBOL.get(check.status, '?')
            emit(f"{glyph} [bold]{check.name}[/bold] — {check.message}")
        totals = summarize(checks)
        emit(f"\n[bold]{totals[OK]} ok · {totals[WARN]} warn · {totals[FAIL]} fail[/bold]")

    def status(self, message):
        """Return the console's status (spinner) object for `message`."""
        label = f"[bold cyan]{message}"
        return self.console.status(label, spinner="dots")

    def note(self, message):
        """Print `message` to the console unchanged."""
        self.console.print(message)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# --- automated fixing -------------------------------------------------------
|
|
175
|
+
def _fix_prompt(fails):
|
|
176
|
+
lines = "\n".join(f"- {c.name}: {c.message}" for c in fails)
|
|
177
|
+
return (
|
|
178
|
+
"You are fixing a leanlab lab in the current directory. `leanlab check` found these "
|
|
179
|
+
f"wiring problems:\n\n{lines}\n\n"
|
|
180
|
+
"Fix them by editing the lab's files so they are mutually consistent:\n"
|
|
181
|
+
"- evaluation.py and validate.py MUST parse their CLI args to match lab.json's eval_cmd / "
|
|
182
|
+
"validate_cmd (which pass `--experiment <path>`) — use argparse, NOT a positional arg.\n"
|
|
183
|
+
"- evaluation.py MUST print ONE line of JSON whose keys include exactly the objective "
|
|
184
|
+
"metric named in lab.json (objective.metric) — not a generic 'score' unless that IS the "
|
|
185
|
+
"metric.\n"
|
|
186
|
+
"- Create any missing files the checks listed.\n"
|
|
187
|
+
"Make the edits now with your tools, then stop. Do NOT run experiments."
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def fix_lab(lab_dir, *, runner=None, ui=None, rounds=3) -> bool:
    """Run checks; if any fail, have Claude edit the lab to fix them, then re-check. Loop.

    Args:
        lab_dir: The lab folder to check and repair.
        runner: Object exposing `run_plain(prompt)`; when not supplied one is
            built with `make_runner(lab)`.
        ui: Reporter exposing `report`, `note` and `status`; defaults to
            RichReport.
        rounds: Maximum number of fix attempts.

    Returns:
        True as soon as a round finds no FAIL checks; otherwise the result of
        a final check after the last round.
    """
    import stat

    lab = Path(lab_dir)
    ui = ui or RichReport()
    if not runner:
        # Only pull in the loop module when no runner was injected.
        from .loop import make_runner
        runner = make_runner(lab)

    for attempt in range(1, rounds + 1):
        checks = check_lab(lab)
        fails = [c for c in checks if c.status == FAIL]
        ui.report(checks)
        if not fails:
            ui.note("\n[green]✓ all checks pass — nothing to fix.[/green]")
            return True
        ui.note(f"\n[yellow]Fixing {len(fails)} issue(s) with Claude (round {attempt}/{rounds})…[/yellow]")
        # evaluation.py is often locked read-only — unlock so the agent can edit it.
        ev = lab / "evaluation.py"
        original_mode = stat.S_IMODE(ev.stat().st_mode) if ev.exists() else None
        relock = original_mode is not None and not (original_mode & stat.S_IWUSR)
        if relock:
            # Add only the owner-write bit so the other permission bits survive.
            ev.chmod(original_mode | stat.S_IWUSR)
        try:
            with ui.status("Claude is editing the lab to fix the issues…"):
                runner.run_plain(_fix_prompt(fails))
        finally:
            if relock and ev.exists():
                # Restore exactly the mode the file had before unlocking.
                ev.chmod(original_mode)

    checks = check_lab(lab)
    ui.report(checks)
    return ok(checks)
|
leanlab/core/init.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Interactive `leanlab init` — the init-architect.
|
|
2
|
+
|
|
3
|
+
Realizes the init-lab use case. The architect (Claude, reached through the same
|
|
4
|
+
AgentRunner the workers use) drafts task.md + the objective from the operator's
|
|
5
|
+
plain-words description, then proposes an evaluator in a loop until the operator
|
|
6
|
+
approves. All terminal I/O goes through an injected `ui` (default: a rich + questionary
|
|
7
|
+
console) so the flow is fully testable without a real terminal or Claude.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import subprocess
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from .loop import make_runner
|
|
17
|
+
|
|
18
|
+
# Default contents of a new lab's lab.json; scaffold() copies it and fills in "name".
LAB_JSON_TEMPLATE = {
    "name": "",
    "description": "TODO: one line describing the task.",
    "objective": {"metric": "score", "direction": "max"},
    # Locations inside the lab folder.
    "experiments_dir": "experiments",
    "results_file": "results.jsonl",
    # Command templates; "{file}" stands for the experiment path.
    "validate_cmd": "uv run python validate.py --experiment {file}",
    "eval_cmd": "uv run python evaluation.py --experiment {file}",
    "director_every": 5,
    "critic_every": 5,
    "max_fix_calls": 3,
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def scaffold(lab: Path, name: str) -> None:
    """Create a fresh lab skeleton at `lab` (no agent specs — those live in the package).

    Makes `lab/experiments` (with parents; an already-existing directory raises
    FileExistsError), then writes lab.json from the template with `name` filled
    in, an empty results.jsonl, and starter Director/Critic note files.
    """
    root = Path(lab)
    (root / "experiments").mkdir(parents=True)

    config = dict(LAB_JSON_TEMPLATE, name=name)
    starter_files = {
        "lab.json": json.dumps(config, indent=2) + "\n",
        "results.jsonl": "",
        "Director_Notes.md": "# Director Notes\n\nNeutral — no experiments yet.\n",
        "Critic_Feedback.md": "# Critic Feedback\n\nNo experiments reviewed yet.\n",
    }
    for filename, content in starter_files.items():
        (root / filename).write_text(content)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _draft_prompt(description: str) -> str:
|
|
43
|
+
return (
|
|
44
|
+
"You are the lab ARCHITECT for leanlab. The operator wants to research:\n\n"
|
|
45
|
+
f"{description}\n\n"
|
|
46
|
+
"Base your decision ONLY on the task description above. Ignore any other labs, "
|
|
47
|
+
"example projects, or files that may exist in this directory or its parents — they are "
|
|
48
|
+
"unrelated and must not influence the metric or framing. "
|
|
49
|
+
"Decide what a single 'experiment' is for this task and how success is measured. "
|
|
50
|
+
"Choose the objective metric that is STANDARD and APPROPRIATE for THIS task — judge it "
|
|
51
|
+
"on the task's own terms; do NOT default to any particular metric. Match the metric to "
|
|
52
|
+
"the task type, e.g.: classification -> accuracy / F1 / ROC-AUC; regression -> RMSE / "
|
|
53
|
+
"MAE / R2; ranking -> NDCG / MAP; clustering -> silhouette. Set direction (min or max) "
|
|
54
|
+
"to fit, and give a one-line justification for the choice. If the request is not a "
|
|
55
|
+
"measurable experiment as written, reframe it into the closest measurable one and say so. "
|
|
56
|
+
"Write a clear task.md: the goal, the chosen metric (with the justification), and the "
|
|
57
|
+
"experiment contract — what ONE file in experiments/ must define so evaluation.py can run "
|
|
58
|
+
"it. You may research the web. Do NOT create or edit any files. Reply with ONLY this JSON "
|
|
59
|
+
'object: {"task_md": "<full markdown>", '
|
|
60
|
+
'"objective": {"metric": "<name>", "direction": "min|max"}}'
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _propose_prompt(feedback: str | None, metric: str, eval_cmd: str, validate_cmd: str) -> str:
|
|
65
|
+
base = (
|
|
66
|
+
"Propose how to EVALUATE this task, using the objective metric you chose. If it needs "
|
|
67
|
+
"data, say what data and how you would obtain and split it. Provide the full CONTENTS of "
|
|
68
|
+
"a frozen evaluation.py and a cheap validate.py.\n"
|
|
69
|
+
"HARD REQUIREMENTS — the lab will NOT score unless these hold exactly:\n"
|
|
70
|
+
f"1. leanlab runs your files from inside the lab dir with EXACTLY these commands, where "
|
|
71
|
+
f"{{file}} is the experiment path:\n"
|
|
72
|
+
f" evaluation: {eval_cmd}\n"
|
|
73
|
+
f" validate: {validate_cmd}\n"
|
|
74
|
+
f" So evaluation.py and validate.py MUST parse their arguments to match — use argparse "
|
|
75
|
+
f"with a '--experiment' option. Do NOT read a positional argument like sys.argv[1].\n"
|
|
76
|
+
f'2. evaluation.py MUST print ONE line of JSON whose keys include exactly "{metric}" '
|
|
77
|
+
f'(the objective metric, spelled exactly — NOT a generic "score"). Extra metrics are fine '
|
|
78
|
+
f"as additional flat (number) keys; avoid nested objects.\n"
|
|
79
|
+
"3. Do NOT create or edit any files — return everything inside the JSON only.\n"
|
|
80
|
+
"4. evaluation.py and validate.py MUST check the experiment file exists and fail fast FIRST "
|
|
81
|
+
"— before importing heavy libraries, rendering, or calling any model — so a quick preflight "
|
|
82
|
+
"stays cheap.\n"
|
|
83
|
+
"5. List in 'packages' every third-party pip package evaluation.py or validate.py imports "
|
|
84
|
+
"(e.g. playwright, numpy) so the lab can install them. Use [] if none. If you need an LLM "
|
|
85
|
+
"to judge, call the `claude` CLI via subprocess (no API key needed) rather than an SDK.\n"
|
|
86
|
+
"Reply with ONLY this JSON object: "
|
|
87
|
+
'{"summary": "<plain-English approach, 3-5 sentences>", '
|
|
88
|
+
'"evaluation_py": "<full file contents>", "validate_py": "<full file contents>", '
|
|
89
|
+
'"packages": ["<pip name>", ...]}'
|
|
90
|
+
)
|
|
91
|
+
if feedback:
|
|
92
|
+
return (f"The operator gave this feedback on your previous proposal:\n\n{feedback}\n\n"
|
|
93
|
+
f"Revise accordingly. {base}")
|
|
94
|
+
return base
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class RichUI:
    """Terminal UI for `init` — spinners, panels, syntax-highlighted code, arrow-key menus."""

    def __init__(self):
        # rich is imported here rather than at module import time.
        from rich.console import Console

        self.console = Console()

    def status(self, message):
        """Return the console's status (spinner) object for `message`."""
        label = f"[bold cyan]{message}"
        return self.console.status(label, spinner="dots")

    def note(self, message):
        """Print `message` to the console unchanged."""
        self.console.print(message)

    def error(self, message):
        """Print `message` in bold red."""
        self.console.print(f"[bold red]{message}[/bold red]")

    def objective(self, obj):
        """Show the objective dict's direction and metric in a small panel."""
        from rich.panel import Panel

        direction, metric = obj.get('direction'), obj.get('metric')
        panel = Panel(f"[bold]{direction} {metric}[/bold]",
                      title="Objective", border_style="cyan", expand=False)
        self.console.print(panel)

    def proposal(self, summary):
        """Render the proposal summary as Markdown inside a panel."""
        from rich.markdown import Markdown
        from rich.panel import Panel

        body = Markdown(summary)
        self.console.print(Panel(body, title="Proposed evaluation", border_style="magenta"))

    def decide(self, evaluation_py):
        """Ask the operator what to do with the proposal.

        Returns ("approve", None), ("cancel", None) or ("feedback", text).
        Choosing "view" prints `evaluation_py` and asks again; an unanswered
        prompt (None) counts as cancel.
        """
        import questionary
        from rich.syntax import Syntax

        approve = "✓ Approve & write the files"
        view = "👁 View the generated evaluation.py"
        feedback = "✍ Give feedback (revise)"
        cancel = "✖ Cancel"
        while True:
            choice = questionary.select("What now?", choices=[approve, view, feedback, cancel]).ask()
            if choice == view:
                self.console.print(Syntax(evaluation_py, "python", theme="ansi_dark",
                                          line_numbers=True, word_wrap=True))
            elif choice == approve:
                return ("approve", None)
            elif choice is None or choice == cancel:
                return ("cancel", None)
            else:
                return ("feedback", questionary.text("Your feedback for Claude:").ask() or "")

    def success(self, lab, name):
        """Show the final panel with the lab path and the next commands to run."""
        from rich.panel import Panel

        text = (f"Lab ready: [bold]{lab}[/bold]\n\nReview the evaluator, then run:\n"
                f" [green]leanlab lock {name} && leanlab run {name}[/green]")
        self.console.print(Panel(text, title="✓ Done", border_style="green"))
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def run_init(lab, name, description, *, runner=None, ui=None, verify=True, yes=False) -> None:
    """Scaffold a lab, draft it with the architect, and approve an evaluator in a loop.

    Steps: create the skeleton; ask the runner for task.md and the objective
    (aborting with an error if that fails); write both; then repeatedly request
    an evaluator proposal until the operator approves (files are written and
    declared packages installed) or cancels. With `yes` the first proposal is
    approved without asking. With `verify` the wiring check runs before the
    success message.
    """
    lab = Path(lab)
    scaffold(lab, name)
    ui = ui or RichUI()
    runner = runner or make_runner(lab)

    with ui.status("Drafting task.md and the objective with Claude…"):
        draft = runner.run_structured(_draft_prompt(description), ["task_md", "objective"])
    if not draft.ok:
        ui.error("Could not draft the task — try again with a clearer description.")
        return

    (lab / "task.md").write_text(draft.data["task_md"])
    config_path = lab / "lab.json"
    cfg = json.loads(config_path.read_text())
    proposed = draft.data["objective"]
    well_formed = isinstance(proposed, dict) and "metric" in proposed and "direction" in proposed
    if well_formed:
        cfg["objective"] = proposed
    else:
        # Keep the template objective already in lab.json and tell the operator.
        ui.error("⚠ Claude returned an unexpected objective shape — set lab.json's objective by hand.")
    config_path.write_text(json.dumps(cfg, indent=2) + "\n")
    ui.objective(cfg["objective"])

    metric = cfg["objective"].get("metric", "score")
    session = draft.session_id
    feedback = None
    while True:
        with ui.status("Designing an evaluator with Claude…"):
            prop = runner.run_structured(
                _propose_prompt(feedback, metric, cfg["eval_cmd"], cfg["validate_cmd"]),
                ["summary", "evaluation_py", "validate_py"], session=session)
        if not prop.ok:
            ui.error("Could not propose an evaluation — aborting.")
            return
        # Continue in the proposal's session when one is returned.
        session = prop.session_id or session
        ui.proposal(prop.data["summary"])

        if yes:
            action, text = "approve", None
        else:
            action, text = ui.decide(prop.data["evaluation_py"])

        if action == "cancel":
            ui.note("Cancelled — no evaluator written. task.md and lab.json were kept.")
            return
        if action != "approve":
            feedback = text
            continue

        (lab / "evaluation.py").write_text(prop.data["evaluation_py"])
        (lab / "validate.py").write_text(prop.data["validate_py"])
        _install_packages(prop.data.get("packages"), ui)
        break

    if verify:
        _self_verify(lab, runner, ui)
    ui.success(lab, name)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _install_packages(packages, ui):
|
|
201
|
+
"""uv add the pip packages the evaluator/validator declared they need."""
|
|
202
|
+
for pkg in packages or []:
|
|
203
|
+
if not isinstance(pkg, str) or not pkg.strip():
|
|
204
|
+
continue
|
|
205
|
+
with ui.status(f"Installing {pkg}…"):
|
|
206
|
+
r = subprocess.run(["uv", "add", pkg.strip()], capture_output=True, text=True)
|
|
207
|
+
if r.returncode != 0:
|
|
208
|
+
ui.error(f"⚠ could not install {pkg!r} (uv add failed) — install it yourself: "
|
|
209
|
+
f"{(r.stderr or '').strip()[:120]}")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _self_verify(lab, runner, ui):
    """Run the doctor; if the generated lab is mis-wired, have Claude fix it before finishing.

    Args:
        lab: The lab folder to check.
        runner: Runner handed to `fix_lab` for the repair attempts.
        ui: Object exposing `note(message)`.

    When the checks pass, a confirmation is printed. Otherwise `fix_lab` is
    invoked, and if it reports that failures remain a warning is printed so
    the operator is not left believing the lab is correctly wired.
    """
    from .doctor import check_lab, ok, fix_lab

    if ok(check_lab(lab)):
        ui.note("[green]✓ wiring check passed.[/green]")
        return
    ui.note("[yellow]⚠ wiring check found problems — fixing with Claude before finishing…[/yellow]")
    # fix_lab returns False when FAIL checks remain after its final re-check.
    if not fix_lab(lab, runner=runner):
        ui.note("[red]✗ wiring problems remain after the automated fix — run `leanlab check` "
                "and resolve them by hand before running the lab.[/red]")
|