leanlab 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
leanlab/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """leanlab — a self-improving experiment-lab tool."""
leanlab/cli.py ADDED
@@ -0,0 +1,315 @@
1
+ """leanlab CLI — init, run, and serve labs.
2
+
3
+ leanlab is a research tool used *inside* another project (like archik). Labs live in
4
+ that project's `.leanlab/<name>/` folder and hold only the task-specific files the user
5
+ edits: task.md, lab.json, evaluation.py, validate.py. The engine (loop + monitor) and the
6
+ fixed agent specs live in the installed package and are never copied into the project.
7
+
8
+ Usage (inside your project):
9
+ uvx leanlab init <name> # interactively scaffold a lab (Claude drafts it)
10
+ uvx leanlab run <lab> --n 5 # run N experiments
11
+ uvx leanlab serve <lab> # open the dashboard
12
+ uvx leanlab list # list labs in this project
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import subprocess
20
+ import sys
21
+ from pathlib import Path
22
+
23
+ from .core.init import run_init
24
+
25
+
26
+ def _version() -> str:
27
+ """The installed package version (falls back when running from an uninstalled checkout)."""
28
+ try:
29
+ from importlib.metadata import PackageNotFoundError, version
30
+ return version("leanlab")
31
+ except (ImportError, PackageNotFoundError):
32
+ return "0.2.1"
33
+
34
+
35
def labs_dir() -> Path:
    """Return the ``.leanlab`` folder directly under the current working directory."""
    project_root = Path.cwd()
    return project_root.joinpath(".leanlab")
38
+
39
+
40
def _install_agent_skill(project: Path) -> Path:
    """Copy the packaged ``SKILL.md`` template into the project's ``.claude`` folder.

    The template is read from the installed ``leanlab`` package
    (``templates/skill/SKILL.md``) and written to
    ``<project>/.claude/skills/leanlab/SKILL.md``; parent directories are
    created as needed and an existing file is overwritten.

    Args:
        project: root directory of the project to install the skill into.

    Returns:
        The path of the written skill file.
    """
    from importlib import resources

    template = resources.files("leanlab") / "templates" / "skill" / "SKILL.md"
    dest = project / ".claude" / "skills" / "leanlab" / "SKILL.md"
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Pin UTF-8 on both sides: the platform default encoding (e.g. cp1252 on
    # Windows) is not guaranteed to round-trip any non-ASCII text in the template.
    dest.write_text(template.read_text(encoding="utf-8"), encoding="utf-8")
    return dest
47
+
48
+
49
+ _CLAUDE_MD_MARKER = "<!-- leanlab:agent -->"
50
+ _CLAUDE_MD_BLOCK = (
51
+ f"{_CLAUDE_MD_MARKER}\n"
52
+ "## Coding tasks — use leanlab\n\n"
53
+ "For feature / fix / refactor work on this repo, drive it through **leanlab** (the honest, "
54
+ "test-gated loop) instead of editing files directly. See `.claude/skills/leanlab/SKILL.md`.\n"
55
+ "Flow: `leanlab spec \"<task>\" --yes` → read the `slug:` line → `leanlab build <slug>` → report.\n"
56
+ f"{_CLAUDE_MD_MARKER}\n"
57
+ )
58
+
59
+
60
def _append_claude_md(project: Path) -> bool:
    """Append the leanlab guidance block to the project's ``CLAUDE.md`` (idempotent).

    If the file already contains the leanlab marker nothing is written. A
    missing file is treated as empty and created. The existing content is kept
    and the block is appended after a blank separator line.

    Args:
        project: root directory of the project whose ``CLAUDE.md`` is updated.

    Returns:
        True if the file was written, False if the marker was already present.
    """
    p = project / "CLAUDE.md"
    # Read and write as UTF-8 explicitly: the guidance block contains non-ASCII
    # characters ("—", "→") that a legacy locale encoding cannot encode.
    text = p.read_text(encoding="utf-8") if p.exists() else ""
    if _CLAUDE_MD_MARKER in text:
        return False
    # Make sure the existing content ends with a newline before adding the
    # blank separator line and the block.
    sep = "" if (not text or text.endswith("\n")) else "\n"
    p.write_text(text + sep + "\n" + _CLAUDE_MD_BLOCK, encoding="utf-8")
    return True
69
+
70
+
71
def cmd_init(args):
    """Handle ``leanlab init``: install the agent skill, or scaffold a new lab.

    With ``--for-agent`` the skill file is installed and the CLAUDE.md guidance
    appended in the current directory, then the command returns. Otherwise the
    lab name is validated, an existing lab directory is refused, a task
    description is taken from ``--describe`` or an interactive prompt, and the
    work is handed to ``run_init``. Validation failures exit with status 1.
    """
    if args.for_agent:
        skill_path = _install_agent_skill(Path.cwd())
        print(f"✓ leanlab skill installed at {skill_path}")
        guidance_added = _append_claude_md(Path.cwd())
        print("✓ added leanlab guidance to CLAUDE.md" if guidance_added
              else "• CLAUDE.md already mentions leanlab — left it as is")
        print("Claude Code in this project can now drive leanlab (spec → build → merge).")
        return

    import questionary

    def fail(message):
        # Report on stderr and stop with a non-zero status.
        print(message, file=sys.stderr)
        sys.exit(1)

    if not args.name:
        fail("ERROR: a lab name is required (or use `leanlab init --for-agent`).")
    lab = labs_dir() / args.name
    if lab.exists():
        fail(f"ERROR: {lab} already exists.")

    # Only prompt when no description was supplied on the command line.
    description = args.describe
    if not description:
        description = questionary.text("Describe the task you want to research:").ask()
    description = (description or "").strip()
    if not description:
        fail("ERROR: a task description is required.")
    run_init(lab, args.name, description, yes=args.yes)
95
+
96
+
97
def cmd_spec(args):
    """Handle ``leanlab spec``: run ``spec_task`` for the given task in the current directory.

    On a truthy result prints a ``slug: <name>`` line (the final path component
    of the result's ``worktree``) and exits 0; otherwise exits 1.
    """
    from .core.coding.spec import spec_task

    outcome = spec_task(Path.cwd(), args.task, yes=args.yes)
    if not outcome:
        sys.exit(1)
    # Plain, single line so an agent can parse the slug.
    slug = Path(outcome['worktree']).name
    print(f"slug: {slug}")
    sys.exit(0)
103
+
104
+
105
def cmd_build(args):
    """Handle ``leanlab build``: call ``build_task`` for a task slug and exit by merge status.

    The gate always contains a ``tests`` command (``pytest -q`` unless
    ``--test-cmd`` is given) and a ``lint`` command when ``--lint-cmd`` is set.
    Exits 0 only when the result is truthy and its ``merged`` entry is truthy.
    """
    from .core.coding.engineer import build_task

    gate_cmds = [{"name": "tests", "cmd": args.test_cmd or "pytest -q"}]
    gate_cmds += [{"name": "lint", "cmd": args.lint_cmd}] if args.lint_cmd else []

    options = dict(
        gate_cmds=gate_cmds,
        persona_set=args.persona_set,
        max_attempts=args.max_attempts,
        playbook=not args.no_playbook,
        min_quality=args.min_quality,
        isolate=not args.no_isolate,
        accept_cmd=args.accept_cmd,
        reviewers=args.reviewers,
    )
    result = build_task(Path.cwd(), args.slug, **options)
    merged = bool(result and result.get("merged"))
    sys.exit(0 if merged else 1)
115
+
116
+
117
def cmd_clean(args):
    """Handle ``leanlab clean``: call ``clean_worktrees`` and print what it reports as removed."""
    from .core.coding.spec import clean_worktrees

    removed = clean_worktrees(Path.cwd(), args.slug, remove_all=args.all)
    count = len(removed)
    names = ', '.join(removed) or '(none)'
    print(f"removed {count} worktree(s): {names}")
121
+
122
+
123
def cmd_board(args):
    """Handle ``leanlab board``: call ``serve_board`` for the current directory."""
    from .core.coding.board import serve_board

    # A falsy --port (the parser default is 0) selects 8766.
    port = args.port if args.port else 8766
    serve_board(Path.cwd(), port=port, open_browser=not args.no_open)
126
+
127
+
128
def cmd_gate(args):
    """Handle ``leanlab gate``: run the gate commands in a task's worktree.

    The worktree is ``<labs dir>/worktrees/<slug>``; a missing directory is an
    error (exit 1). The gate result is reported and the exit status is 0 when
    the result's ``passed`` attribute is truthy, else 1.
    """
    from .core.coding.gate import run_gate, report

    worktree = labs_dir() / "worktrees" / args.slug
    if not worktree.is_dir():
        print(f"ERROR: no worktree at {worktree} — run `leanlab spec` first.", file=sys.stderr)
        sys.exit(1)

    gate_cmds = [{"name": "tests", "cmd": args.test_cmd or "pytest -q"}]
    gate_cmds += [{"name": "lint", "cmd": args.lint_cmd}] if args.lint_cmd else []

    outcome = run_gate(worktree, gate_cmds)
    report(outcome)
    sys.exit(1 if not outcome.passed else 0)
140
+
141
+
142
def cmd_check(args):
    """Handle ``leanlab check``: run the doctor checks on a lab, report them, exit 0 if all ok."""
    from .core.doctor import check_lab, RichReport, ok

    results = check_lab(_resolve_lab(args.lab))
    RichReport().report(results)
    exit_code = 0 if ok(results) else 1
    sys.exit(exit_code)
148
+
149
+
150
def cmd_fix(args):
    """Handle ``leanlab fix``: call ``fix_lab`` on the resolved lab; exit 0 if it returns truthy."""
    from .core.doctor import fix_lab

    fixed = fix_lab(_resolve_lab(args.lab))
    sys.exit(0 if fixed else 1)
154
+
155
+
156
def cmd_run(args):
    """Handle ``leanlab run``: preflight the lab, then run the loop module as a subprocess.

    The doctor checks are skipped when ``--skip-checks`` or ``--dry-run`` is
    given. Failed checks are reported and block the run (exit 1). Otherwise the
    process exits with the return code of ``python -m leanlab.core.loop``.
    """
    lab = _resolve_lab(args.lab)

    preflight = not (args.skip_checks or args.dry_run)
    if preflight:
        from .core.doctor import check_lab, RichReport, ok

        results = check_lab(lab)
        if not ok(results):
            RichReport().report(results)
            print(f"\nRun blocked by failed checks. Fix them with: leanlab fix {args.lab}\n"
                  f"(or rerun with --skip-checks to ignore)", file=sys.stderr)
            sys.exit(1)

    # Same interpreter as the CLI, so the loop runs from the same installation.
    loop_cmd = [sys.executable, "-m", "leanlab.core.loop", "--lab", str(lab), "--n", str(args.n)]
    if args.dry_run:
        loop_cmd += ["--dry-run"]
    completed = subprocess.run(loop_cmd)
    sys.exit(completed.returncode)
170
+
171
+
172
def cmd_serve(args):
    """Handle ``leanlab serve``: run the monitor module for a lab and exit with its return code."""
    lab = _resolve_lab(args.lab)
    monitor_cmd = [sys.executable, "-m", "leanlab.core.monitor", "--lab", str(lab)]
    # A falsy port (the parser default is 0) means no --port flag is passed on.
    if args.port:
        monitor_cmd.extend(["--port", str(args.port)])
    completed = subprocess.run(monitor_cmd)
    sys.exit(completed.returncode)
178
+
179
+
180
def cmd_list(_args):
    """Handle ``leanlab list``: print one line per lab found in the labs directory.

    A lab is any entry of the labs directory that contains a ``lab.json``. For
    each one the objective's ``direction`` and ``metric`` are printed. A
    ``lab.json`` that cannot be read or parsed is reported on stderr and
    skipped, so a single broken lab does not hide the others.

    Args:
        _args: parsed CLI arguments (unused).
    """
    base = labs_dir()
    if not base.exists():
        print("(no labs yet — run `leanlab init <name>`)")
        return
    for d in sorted(base.iterdir()):
        cfg_path = d / "lab.json"
        if not cfg_path.exists():
            continue
        try:
            cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"  {d.name:20} (unreadable lab.json: {exc})", file=sys.stderr)
            continue
        # Tolerate a lab.json whose top level or "objective" is not an object.
        obj = cfg.get("objective") if isinstance(cfg, dict) else None
        if not isinstance(obj, dict):
            obj = {}
        print(f"  {d.name:20} objective: {obj.get('direction')} {obj.get('metric')}")
190
+
191
+
192
+ def _evaluator(lab):
193
+ ev = lab / "evaluation.py"
194
+ if not ev.exists():
195
+ print(f"ERROR: no evaluation.py in {lab}", file=sys.stderr)
196
+ sys.exit(1)
197
+ return ev
198
+
199
+
200
def cmd_lock(args):
    """Set the lab's evaluation.py to mode 0o444 — a guardrail against accidental edits.

    Note: this is a speed bump, not a sandbox. An agent running as you with full
    tools could chmod it back. For a hard wall, run the Worker as a separate user.
    """
    read_only = 0o444
    lab = _resolve_lab(args.lab)
    ev = _evaluator(lab)
    ev.chmod(read_only)
    print(f"🔒 locked {ev} read-only. Run `leanlab unlock {args.lab}` to edit it.")
209
+
210
+
211
def cmd_unlock(args):
    """Set the lab's evaluation.py to mode 0o644 so it can be edited again."""
    owner_writable = 0o644
    lab = _resolve_lab(args.lab)
    ev = _evaluator(lab)
    ev.chmod(owner_writable)
    print(f"🔓 unlocked {ev}. Edit it, then `leanlab lock {args.lab}` again.")
215
+
216
+
217
+ def _resolve_lab(name):
218
+ p = Path(name)
219
+ if p.exists() and (p / "lab.json").exists():
220
+ return p.resolve()
221
+ if (labs_dir() / name / "lab.json").exists():
222
+ return (labs_dir() / name).resolve()
223
+ print(f"ERROR: lab '{name}' not found in {labs_dir()}.", file=sys.stderr)
224
+ sys.exit(1)
225
+
226
+
227
def main():
    """Build the ``leanlab`` argument parser, parse ``sys.argv`` and dispatch.

    Every subcommand stores its handler in ``func`` via ``set_defaults``; after
    parsing, that handler is called with the parsed namespace. A subcommand is
    required.
    """
    parser = argparse.ArgumentParser(description="leanlab CLI")
    parser.add_argument("--version", action="version", version=f"leanlab {_version()}")
    commands = parser.add_subparsers(dest="cmd", required=True)

    def lab_command(name, help_text, handler):
        # Shared shape: a subcommand whose first positional is the lab.
        cmd_parser = commands.add_parser(name, help=help_text)
        cmd_parser.add_argument("lab")
        cmd_parser.set_defaults(func=handler)
        return cmd_parser

    # --- init ---------------------------------------------------------------
    init_p = commands.add_parser(
        "init", help="scaffold a lab (Claude drafts it), or --for-agent to install the skill")
    init_p.add_argument("name", nargs="?", default=None)
    init_p.add_argument(
        "--for-agent", action="store_true",
        help="install the leanlab skill into .claude/ so Claude Code can drive leanlab")
    init_p.add_argument("--describe", default=None, help="task description (skips the prompt)")
    init_p.add_argument("--yes", action="store_true",
                        help="auto-approve the drafted evaluator (headless)")
    init_p.set_defaults(func=cmd_init)

    # --- run ----------------------------------------------------------------
    run_p = lab_command("run", "run N experiments in a lab", cmd_run)
    run_p.add_argument("--n", type=int, default=5)
    run_p.add_argument("--dry-run", action="store_true")
    run_p.add_argument("--skip-checks", action="store_true",
                       help="skip the preflight doctor checks")

    # --- spec ---------------------------------------------------------------
    spec_p = commands.add_parser(
        "spec", help="(coding) turn a task into approved, locked acceptance tests")
    spec_p.add_argument("task", help='the coding task, e.g. "create a /health endpoint"')
    spec_p.add_argument("--yes", action="store_true",
                        help="auto-approve the drafted tests (headless)")
    spec_p.set_defaults(func=cmd_spec)

    # --- build --------------------------------------------------------------
    build_p = commands.add_parser(
        "build", help="(coding) engineer the task to a green gate + review, then merge")
    build_p.add_argument("slug", help="the task worktree name under .leanlab/worktrees/")
    build_p.add_argument("--persona-set", default="coding",
                         help="agent persona set (default: coding)")
    build_p.add_argument("--max-attempts", type=int, default=5)
    build_p.add_argument("--test-cmd", default=None, help="test command (default: pytest -q)")
    build_p.add_argument("--lint-cmd", default=None, help="optional lint/typecheck command")
    build_p.add_argument("--no-playbook", action="store_true",
                         help="skip the tech-lead PLAYBOOK update")
    build_p.add_argument("--min-quality", type=float, default=0,
                         help="reject merges below this 0-100 quality score")
    build_p.add_argument(
        "--reviewers", type=int, default=1,
        help="adversarial reviewer panel size; >1 runs that many reviewers with "
             "different lenses (correctness/spec/security/robustness) and merges only "
             "if all approve")
    build_p.add_argument(
        "--no-isolate", action="store_true",
        help="skip the isolated acceptance re-run (which disables engineer conftest)")
    build_p.add_argument(
        "--accept-cmd", default="pytest --noconftest -q",
        help="isolated acceptance command (pristine test paths are appended)")
    build_p.set_defaults(func=cmd_build)

    # --- clean --------------------------------------------------------------
    clean_p = commands.add_parser(
        "clean", help="(coding) remove task worktrees + branches (merged only by default)")
    clean_p.add_argument("slug", nargs="?", default=None,
                         help="a specific task to remove (force)")
    clean_p.add_argument("--all", action="store_true",
                         help="remove ALL task worktrees (force)")
    clean_p.set_defaults(func=cmd_clean)

    # --- board --------------------------------------------------------------
    board_p = commands.add_parser(
        "board", help="(coding) live dashboard of tasks, status, and the playbook")
    board_p.add_argument("--port", type=int, default=0)
    board_p.add_argument("--no-open", action="store_true")
    board_p.set_defaults(func=cmd_board)

    # --- gate ---------------------------------------------------------------
    gate_p = commands.add_parser(
        "gate", help="(coding) run the pass/fail gate on a task's worktree")
    gate_p.add_argument("slug", help="the task worktree name under .leanlab/worktrees/")
    gate_p.add_argument("--test-cmd", default=None, help="test command (default: pytest -q)")
    gate_p.add_argument("--lint-cmd", default=None, help="optional lint/typecheck command")
    gate_p.set_defaults(func=cmd_gate)

    # --- lab maintenance commands (registration order is kept) --------------
    lab_command("check", "preflight: verify the lab is wired correctly", cmd_check)
    lab_command("fix", "use Claude to fix lab wiring problems the checks found", cmd_fix)

    serve_p = lab_command("serve", "open the dashboard for a lab", cmd_serve)
    serve_p.add_argument("--port", type=int, default=0)

    list_p = commands.add_parser("list", help="list labs")
    list_p.set_defaults(func=cmd_list)

    lab_command("lock", "make a lab's evaluation.py read-only (frozen)", cmd_lock)
    lab_command("unlock", "restore write access to a lab's evaluation.py", cmd_unlock)

    parsed = parser.parse_args()
    parsed.func(parsed)
312
+
313
+
314
+ if __name__ == "__main__":
315
+ main()
@@ -0,0 +1 @@
1
+ """leanlab engine — the generic loop, dashboard, and agent layer."""
@@ -0,0 +1,10 @@
1
+ """Agent ports & adapters — the backend-agnostic agent layer the loop depends on."""
2
+
3
+ from .claude import ClaudeAgent
4
+ from .port import AgentResult, AgentRunner, AgentTransport
5
+ from .protocol import StructuredRunner, extract_json
6
+
7
+ __all__ = [
8
+ "AgentResult", "AgentRunner", "AgentTransport",
9
+ "StructuredRunner", "extract_json", "ClaudeAgent",
10
+ ]
@@ -0,0 +1,38 @@
1
+ """ClaudeAgent — an AgentTransport backed by the Claude Code CLI (claude -p).
2
+
3
+ One concrete backend. Hermes / custom backends would be sibling AgentTransports;
4
+ the loop, which depends only on AgentRunner, would not change.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import subprocess
11
+ from pathlib import Path
12
+
13
+ from .port import AgentTransport
14
+
15
+
16
class ClaudeAgent(AgentTransport):
    """Runs one agent turn via `claude -p --output-format json`, in a lab's cwd."""

    def __init__(self, cwd, *, max_turns: int = 250, permission_mode: str = "bypassPermissions"):
        """Store the settings used for every turn.

        Args:
            cwd: working directory the ``claude`` process is started in.
            max_turns: value passed to ``--max-turns``.
            permission_mode: value passed to ``--permission-mode``.
        """
        self._cwd = Path(cwd)
        self._max_turns = max_turns
        self._permission_mode = permission_mode

    def send(self, prompt: str, *, session: str | None = None) -> tuple[str | None, str]:
        """Run one turn through the ``claude`` CLI and return ``(session_id, text)``.

        Args:
            prompt: the prompt passed to ``claude -p``.
            session: when given, passed as ``--resume`` to continue that session.

        Returns:
            ``(session_id, result)`` taken from the CLI's JSON envelope. If stdout
            is not a JSON object, ``(None, raw_stdout)`` is returned so the caller
            can treat the reply as malformed. A missing or null ``result`` is
            returned as ``""``.

        Raises:
            RuntimeError: the process exited non-zero and printed nothing on stdout.
        """
        cmd = ["claude", "-p", prompt,
               "--permission-mode", self._permission_mode,
               "--max-turns", str(self._max_turns),
               "--output-format", "json"]
        if session:
            cmd += ["--resume", session]
        proc = subprocess.run(cmd, cwd=self._cwd, capture_output=True, text=True)
        # A non-zero exit that still produced stdout is parsed like any other reply.
        if proc.returncode != 0 and not proc.stdout.strip():
            raise RuntimeError(proc.stderr.strip() or "claude CLI failed")
        try:
            env = json.loads(proc.stdout)
        except json.JSONDecodeError:
            return None, proc.stdout  # let the protocol treat it as malformed
        if not isinstance(env, dict):
            # Valid JSON but not an envelope object (e.g. a list or a bare
            # string): same handling as unparseable output, not an AttributeError.
            return None, proc.stdout
        # `or ""` also covers an explicit null result, keeping the str contract.
        return env.get("session_id"), env.get("result") or ""
@@ -0,0 +1,49 @@
1
+ """Agent ports — the abstraction the loop depends on (Dependency Inversion).
2
+
3
+ The loop never talks to a concrete agent backend. It depends on `AgentRunner`.
4
+ A backend implements the low-level `AgentTransport` (send one prompt, get text);
5
+ `StructuredRunner` (protocol.py) adapts a transport into an `AgentRunner` by
6
+ adding JSON validation and retry. Swapping in Hermes or a custom backend means a
7
+ new `AgentTransport` — nothing in the loop changes (open/closed).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from abc import ABC, abstractmethod
13
+ from dataclasses import dataclass
14
+
15
+
16
@dataclass
class AgentResult:
    """Outcome of a single structured agent turn.

    Fields:
        data: the validated JSON object returned by the agent; ``{}`` when it
            never produced valid output.
        session_id: identifier for resuming the same agent session, if any.
        raw: the agent's last raw reply, kept for diagnostics.
    """

    data: dict
    session_id: str | None = None
    raw: str = ""

    @property
    def ok(self) -> bool:
        """True when ``data`` is non-empty."""
        return True if self.data else False
27
+
28
+
29
class AgentTransport(ABC):
    """Backend-facing port: deliver a single prompt and hand back ``(session_id, text)``."""

    @abstractmethod
    def send(self, prompt: str, *, session: str | None = None) -> tuple[str | None, str]:
        """Execute one agent turn.

        Implementations return ``(session_id, final_text)`` and raise when the
        transport itself fails.
        """
        raise NotImplementedError()
36
+
37
+
38
class AgentRunner(ABC):
    """Loop-facing port: structured turns plus fire-and-forget turns."""

    @abstractmethod
    def run_structured(self, prompt: str, required_keys, *, session: str | None = None) -> AgentResult:
        """Run one turn whose reply must be a JSON object holding every key in `required_keys`."""
        raise NotImplementedError()

    @abstractmethod
    def run_plain(self, prompt: str) -> None:
        """Run one turn without collecting a result (e.g. the Director or Critic writing a file)."""
        raise NotImplementedError()
@@ -0,0 +1,64 @@
1
+ """StructuredRunner — turns any AgentTransport into an AgentRunner.
2
+
3
+ It enforces the structured-output contract: parse the agent's reply as JSON,
4
+ check the required keys, and on malformed output re-prompt the SAME session
5
+ ("reply with ONLY that JSON object") up to a retry limit. This is the one place
6
+ the "agent must return valid structured output" rule lives.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import re
13
+
14
+ from .port import AgentResult, AgentRunner, AgentTransport
15
+
16
+
17
def extract_json(text: str, required_keys) -> dict | None:
    """Return a JSON object in `text` containing all `required_keys`, or None.

    Candidates are tried in this order:

    1. the whole (stripped) text, when it parses as a qualifying object;
    2. the last qualifying top-level object embedded in the text — agents
       sometimes wrap the object in prose or code fences;
    3. the last qualifying object nested inside another embedded object.

    Embedded objects are located by attempting a JSON decode at every ``{``,
    so nested objects and braces inside string values are handled.

    Args:
        text: the agent's reply; empty or None-like input yields None.
        required_keys: iterable of keys the returned object must contain. It is
            materialised once, so a one-shot iterable is safe.

    Returns:
        The matching dict, or None when no candidate qualifies.
    """
    if not text:
        return None
    text = text.strip()
    keys = tuple(required_keys)

    def qualifies(obj) -> bool:
        return isinstance(obj, dict) and all(k in obj for k in keys)

    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if qualifies(whole):
        return whole

    decoder = json.JSONDecoder()
    top_level, nested = [], []
    covered_until = 0  # end offset of the furthest object decoded so far
    pos = text.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            pass  # this "{" does not start a JSON object (prose, template, ...)
        else:
            if qualifies(obj):
                # A start inside an already-decoded span is a nested object.
                (nested if pos < covered_until else top_level).append(obj)
            covered_until = max(covered_until, end)
        pos = text.find("{", pos + 1)

    # Prefer complete top-level objects; fall back to nested ones.
    for group in (top_level, nested):
        if group:
            return group[-1]
    return None
40
+
41
+
42
class StructuredRunner(AgentRunner):
    """Wraps a transport: validate the JSON reply, re-prompt on malformed output."""

    def __init__(self, transport: AgentTransport, max_retries: int = 2):
        """Store the transport and the number of corrective re-prompts allowed.

        Args:
            transport: backend used to send prompts.
            max_retries: extra attempts after the first one; the transport is
                called at most ``max_retries + 1`` times per structured turn.
        """
        self._transport = transport
        self._max_retries = max_retries

    def run_structured(self, prompt, required_keys, *, session=None) -> AgentResult:
        """Send `prompt` and return the first reply that is valid structured output.

        Args:
            prompt: the prompt for the first attempt.
            required_keys: keys the returned JSON object must contain.
            session: optional session id to continue.

        Returns:
            An ``AgentResult`` with the parsed object, or with ``data={}`` when
            every attempt was malformed. ``raw`` holds the last reply text.
        """
        # Materialise once: the keys are used for every attempt and in the
        # corrective prompt, which a one-shot iterable would not survive.
        required_keys = list(required_keys)
        attempt_prompt, session_id, last_text = prompt, session, ""
        for _attempt in range(self._max_retries + 1):
            new_session, last_text = self._transport.send(attempt_prompt, session=session_id)
            # A transport may return no session id for a reply it could not
            # parse; keep the one we already have so the retry below really
            # goes to the SAME session.
            session_id = new_session or session_id
            data = extract_json(last_text, required_keys)
            if data is not None:
                return AgentResult(data=data, session_id=session_id, raw=last_text)
            # Malformed — correct and retry in the SAME session.
            attempt_prompt = (
                "Your last reply was NOT a valid JSON object with keys "
                f"{list(required_keys)}. Reply with ONLY that JSON object — no prose, no code fence."
            )
        return AgentResult(data={}, session_id=session_id, raw=last_text)

    def run_plain(self, prompt) -> None:
        """Send `prompt` once and discard the reply."""
        self._transport.send(prompt)
@@ -0,0 +1 @@
1
+ """Coding lab — Worker/Director/Critic over a real repo, judged by tests + quality."""