leanlab 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
1
+ """The engineer loop — implement a spec'd task to a green gate + reviewer sign-off, then merge.
2
+
3
+ Realizes the build-task use case. The engineer edits the worktree; the gate checks it; on a
4
+ green gate the reviewer judges the diff. It loops on gate failures / review feedback, then
5
+ commits and merges the branch into main. `runner` / `ui` are injected for testing.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import json
12
+ import shlex
13
+ import subprocess
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+
17
+ from ..loop import make_runner
18
+ from .board import log_event
19
+ from .gate import run_gate
20
+ from .personas import spec_text
21
+ from .playbook import read_playbook, update_playbook
22
+
23
# Values of a reviewer's "approved" field that count as a sign-off. Membership is tested
# with `in`, so any value comparing equal to one of these is accepted.
_APPROVED = (True, "true", "yes", "True")
24
+
25
+
26
def _git(repo, *args):
    """Run `git -C <repo> <args…>` and return the completed process (output captured as text)."""
    command = ["git", "-C", str(repo)]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)
28
+
29
+
30
def _record(repo, rec):
    """Append a build outcome so `leanlab board` can show it.

    The record is copied, stamped with the current UTC time under "ts", and written as one
    JSON line to `<repo>/.leanlab/coding-results.jsonl` (the directory is created on demand).
    """
    results_dir = Path(repo) / ".leanlab"
    results_dir.mkdir(parents=True, exist_ok=True)
    stamped = dict(rec)
    stamped["ts"] = datetime.now(timezone.utc).isoformat()
    with (results_dir / "coding-results.jsonl").open("a") as sink:
        sink.write(json.dumps(stamped) + "\n")
37
+
38
+
39
def _load_lock(repo, slug):
    """Load the out-of-tree lock (pristine acceptance tests) for a task.

    Reads `<repo>/.leanlab/locks/<slug>.json`. Returns the parsed lock dict, or None when
    the task wasn't spec'd (no file), the file can't be read, it isn't valid JSON, or the
    JSON is not an object — callers use `lock.get(...)`, so anything but a dict is unusable.
    """
    lock_file = Path(repo) / ".leanlab" / "locks" / f"{slug}.json"
    try:
        # A missing file raises FileNotFoundError (an OSError), so no separate exists() check
        # is needed; a decode error is a ValueError.
        data = json.loads(lock_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    return data if isinstance(data, dict) else None
48
+
49
+
50
def _is_pristine(lock, wt) -> bool:
    """Did the engineer leave the locked tests untouched?

    Each entry in `lock["tests"]` names a path (relative to the worktree `wt`) and the
    expected sha256 of its bytes. A file that is missing, unreadable (for example replaced
    by a directory) or whose digest differs counts as tampered, so the result is False.
    """
    root = Path(wt)
    for item in lock.get("tests", []):
        try:
            digest = hashlib.sha256((root / item["path"]).read_bytes()).hexdigest()
        except OSError:
            # missing, a directory, or not readable: all mean the locked file is gone
            return False
        if digest != item["sha256"]:
            return False
    return True
57
+
58
+
59
def _isolated_acceptance(wt, lock, accept_cmd):
    """Re-run the pristine acceptance tests with engineer conftest/fixtures DISABLED.

    `accept_cmd` is split shell-style and the locked test paths are appended as arguments;
    the command runs in the worktree with a 600-second timeout. Returns `(ok, output)`.

    Only exit code 1 yields ok=False: the tests passed the normal gate but fail on their
    own, so the pass relied on engineer-added test infrastructure — i.e. gaming. Any other
    exit, or any failure to launch the command, means isolation could not be done cleanly,
    so the attempt is not blocked on it.
    """
    test_paths = []
    for entry in lock.get("tests", []):
        test_paths.append(entry["path"])
    if len(test_paths) == 0:
        return True, ""
    try:
        argv = [*shlex.split(accept_cmd), *test_paths]
        done = subprocess.run(argv, cwd=Path(wt), capture_output=True, text=True,
                              timeout=600)
    except Exception as exc:  # noqa: BLE001
        return True, f"(isolation skipped: {exc})"
    combined = done.stdout
    if done.stderr:
        combined = combined + "\n" + done.stderr
    return done.returncode != 1, combined.strip()
76
+
77
+
78
+ def _restore_tests(lock, wt) -> None:
79
+ """Overwrite the worktree's acceptance tests with the pristine, out-of-tree copies, so the
80
+ gate always runs the ORIGINAL tests no matter what the engineer did to them."""
81
+ for it in lock.get("tests", []):
82
+ p = Path(wt) / it["path"]
83
+ p.parent.mkdir(parents=True, exist_ok=True)
84
+ if p.exists():
85
+ p.chmod(0o644)
86
+ p.write_text(it["content"])
87
+ p.chmod(0o444)
88
+
89
+
90
def _stage(wt):
    """Stage all changes except gate caches and the lock file.

    The ignore patterns are added to the worktree's `info/exclude` file (located through
    `git rev-parse --git-path`), then everything is staged with `git add -A`. Updating the
    exclude file is best effort: if it cannot be located, read or written, staging still
    proceeds.
    """
    patterns = ("__pycache__/", ".pytest_cache/", ".leanlab-lock.json")
    located = _git(wt, "rev-parse", "--git-path", "info/exclude").stdout.strip()
    if located:
        exclude = Path(located)
        if not exclude.is_absolute():
            exclude = Path(wt) / exclude
        try:
            exclude.parent.mkdir(parents=True, exist_ok=True)
            current = exclude.read_text() if exclude.exists() else ""
            # Compare whole lines: a substring test would treat "foo/__pycache__/" as
            # already covering "__pycache__/".
            present = {ln.strip() for ln in current.splitlines()}
            missing = [pat for pat in patterns if pat not in present]
            if missing:
                if current and not current.endswith("\n"):
                    current += "\n"  # don't glue the first pattern onto the last line
                exclude.write_text(current + "".join(pat + "\n" for pat in missing))
        except (OSError, ValueError):
            pass  # best effort — the worst case is that cache files get staged
    _git(wt, "add", "-A")
104
+
105
+
106
def _engineer_prompt(spec_md, persona_set, feedback, playbook=""):
    """Assemble the engineer's prompt.

    Sections, in order: the engineer persona text, the task spec, the project playbook
    (only when non-empty), the standing implementation instructions, and the feedback from
    the previous attempt (only when there is any).
    """
    sections = [spec_text("engineer", persona_set), "\n\n## The task spec\n", spec_md, "\n\n"]
    if playbook:
        sections += ["## Project playbook (follow it)\n", playbook, "\n\n"]
    sections.append(
        "Implement the change in this worktree so the gate passes. Read the locked acceptance "
        "tests under the test directory and make them pass — do NOT modify them. Follow the "
        "repository's conventions. Edit files with your tools, then stop."
    )
    if feedback:
        sections += ["\n\n## Fix this (from the last attempt)\n", feedback]
    return "".join(sections)
118
+
119
+
120
# Maximum number of diff characters shown to a reviewer; _clip_diff truncates longer diffs.
_DIFF_LIMIT = 40000

# Each panel reviewer attacks from a distinct angle — diversity catches what one lens misses.
# Every lens is a dict with a "name" (labels that reviewer's verdict) and a "focus" (spliced
# into the reviewer prompt).
REVIEW_LENSES = [
    {"name": "correctness",
     "focus": "logic errors, off-by-one, wrong operators, integer division, edge cases, error paths"},
    {"name": "spec-conformance",
     "focus": "requirements stated in the spec that the locked tests do NOT check — find one the code gets wrong"},
    {"name": "security",
     "focus": "injection, path traversal, unsafe input handling, leaked secrets, resource exhaustion"},
    {"name": "robustness",
     "focus": "behaviour on bad/empty/huge input, concurrency, mutable shared state, failure recovery"},
]
133
+
134
+
135
def _clip_diff(diff):
    """Return `diff` unchanged if it fits in `_DIFF_LIMIT` characters; otherwise return the
    first `_DIFF_LIMIT` characters followed by a notice saying how much was cut."""
    overflow = len(diff) - _DIFF_LIMIT
    if overflow <= 0:
        return diff
    notice = (f"\n…(diff truncated — {overflow} more chars not shown; "
              "do NOT approve code you could not see)")
    return diff[:_DIFF_LIMIT] + notice
141
+
142
+
143
def _lenses_for(n):
    """Lenses for a panel of n reviewers.

    n <= 1 gives a single general reviewer (`[None]`, no extra focus). Larger panels cycle
    through `REVIEW_LENSES` in order, wrapping around when n exceeds the number of lenses.
    """
    if n <= 1:
        return [None]
    available = len(REVIEW_LENSES)
    panel = []
    for slot in range(n):
        panel.append(REVIEW_LENSES[slot % available])
    return panel
148
+
149
+
150
def _review_prompt(spec_md, diff, persona_set, lens=None):
    """Assemble a reviewer prompt: the reviewer persona text, an optional lens section
    (its name and focus), the task spec, and the clipped diff in a fenced `diff` block."""
    parts = [spec_text("reviewer", persona_set)]
    if lens:
        parts.append(
            f"\n\n## Your lens: {lens['name']}\nWeight your attack toward {lens['focus']}. ")
        parts.append("Still reject any blocking defect you find outside this lens.")
    parts.extend([
        "\n\n## Task spec\n",
        spec_md,
        "\n\n## The diff to review\n```diff\n",
        _clip_diff(diff),
        "\n```",
    ])
    return "".join(parts)
157
+
158
+
159
def _review_panel(runner, spec_md, diff, persona_set, lenses):
    """Adversarial quorum: run one reviewer per lens.

    Approved only if ALL reviewers approve (and there is at least one); the score is the
    harshest (minimum) one; the feedback joins every rejecting reviewer's non-empty
    feedback, labelled by lens. A failed review call counts as a rejection scoring 0.0, and
    a missing score defaults to 100. Returns `(approved, score, feedback, verdicts)`.
    """
    verdicts = []
    for lens in lenses:
        res = runner.run_structured(_review_prompt(spec_md, diff, persona_set, lens),
                                    ["approved", "feedback"])
        if res.ok:
            signed_off = res.data.get("approved") in _APPROVED
            try:
                points = float(res.data.get("score", 100))
            except (TypeError, ValueError):
                points = 0.0  # an unparseable score is treated as the worst one
            notes = str(res.data.get("feedback", ""))
        else:
            signed_off, points, notes = res.ok, 0.0, "(review call failed)"
        label = lens["name"] if lens else "review"
        verdicts.append({"lens": label, "approved": signed_off,
                         "score": points, "feedback": notes})

    approved = len(verdicts) > 0 and all(v["approved"] for v in verdicts)
    score = min([v["score"] for v in verdicts], default=0.0)
    blockers = []
    for v in verdicts:
        if not v["approved"] and v["feedback"]:
            blockers.append(f"[{v['lens']}] {v['feedback']}")
    feedback = "\n\n".join(blockers)
    return approved, score, feedback, verdicts
180
+
181
+
182
def build_task(repo, slug, *, runner=None, ui=None, gate_cmds=None,
               persona_set="coding", max_attempts=5, playbook=True, min_quality=0,
               isolate=True, accept_cmd="pytest --noconftest -q", reviewers=1):
    """Run the engineer loop on a spec'd task. Returns a result dict or None.

    Each attempt: the engineer edits the worktree; the locked acceptance tests are restored
    from the out-of-tree lock; the gate runs; tampering and fixture-gaming are rejected; and
    the reviewer panel judges the staged diff. A diff that every reviewer approves with a
    score of at least `min_quality` is committed and merged. Anything else becomes feedback
    for the next attempt, up to `max_attempts`.

    Returns None when the task has no worktree. Otherwise returns a dict with "branch",
    "attempts" and "merged", plus "quality" when a merge was attempted.
    """
    repo = Path(repo).resolve()
    ui = ui or BuildUI()
    wt = repo / ".leanlab" / "worktrees" / slug
    if not wt.is_dir():
        ui.error(f"no worktree at {wt} — run `leanlab spec` first.")
        return None
    branch = f"leanlab/{slug}"
    spec_md = (wt / "SPEC.md").read_text() if (wt / "SPEC.md").exists() else ""
    pb = read_playbook(repo)
    lock = _load_lock(repo, slug)
    runner = runner or make_runner(wt)
    tamper_note = ("You modified the locked acceptance tests (they were restored). They are "
                   "FROZEN — solve the task without touching them.")

    feedback = None
    for attempt in range(1, max_attempts + 1):
        ui.attempt(attempt, max_attempts)
        with ui.status("Engineer is implementing the change…"):
            runner.run_plain(_engineer_prompt(spec_md, persona_set, feedback, pb))

        # Check for tampering BEFORE restoring, otherwise the evidence is overwritten.
        tampered = lock is not None and not _is_pristine(lock, wt)
        if lock is not None:
            _restore_tests(lock, wt)  # the gate ALWAYS runs the pristine acceptance tests

        result = run_gate(wt, gate_cmds)
        ui.gate(result)
        log_event(repo, slug, {"event": "attempt", "n": attempt, "gate_passed": result.passed,
                               "failures": [c.name for c in result.failures()]})
        if not result.passed:
            feedback = "The gate failed:\n" + "\n".join(
                f"[{c.name}]\n{c.output[-800:]}" for c in result.failures())
            if tampered:
                # Keep the tamper notice even when the gate fails, so the engineer learns
                # that the tests were put back rather than guessing from the failures.
                feedback = tamper_note + "\n\n" + feedback
            continue

        if tampered:
            ui.error("⚠ locked acceptance tests were modified — restored; rejecting this attempt.")
            log_event(repo, slug, {"event": "tamper", "n": attempt})
            feedback = tamper_note
            continue

        if lock is not None and isolate:
            ok_iso, _iso = _isolated_acceptance(wt, lock, accept_cmd)
            if not ok_iso:
                ui.error("⚠ acceptance tests fail without the engineer's fixtures — gamed.")
                log_event(repo, slug, {"event": "isolation", "n": attempt})
                feedback = ("Your change passes only with extra fixtures/conftest. The acceptance "
                            "tests must pass on their own — implement the real behaviour.")
                continue

        _stage(wt)
        diff = _git(wt, "diff", "--cached").stdout
        lenses = _lenses_for(reviewers)
        msg = ("Reviewer is checking the diff…" if len(lenses) == 1
               else f"{len(lenses)} reviewers are attacking the diff…")
        with ui.status(msg):
            approved, score, review_fb, verdicts = _review_panel(
                runner, spec_md, diff, persona_set, lenses)
        log_event(repo, slug, {"event": "review", "n": attempt, "approved": bool(approved),
                               "score": score, "feedback": review_fb[:200],
                               "reviewers": [{"lens": v["lens"], "approved": v["approved"],
                                              "score": v["score"]} for v in verdicts]})
        if approved and score >= min_quality:
            merged = _merge(repo, wt, branch, slug, ui)
            log_event(repo, slug, {"event": "merged", "branch": branch, "merged": merged})
            if merged:
                ui.success(branch, attempt)
                if playbook:
                    update_playbook(repo, slug=slug, ui=ui)  # tech-lead refreshes the PLAYBOOK
            _record(repo, {"slug": slug, "branch": branch, "attempts": attempt,
                           "merged": merged, "quality": score})
            return {"branch": branch, "attempts": attempt, "merged": merged, "quality": score}
        if approved:  # passed review but below the quality bar
            # `review_fb` only collects feedback from REJECTING reviewers, so it is empty
            # here. Pass on what the approving reviewers wrote instead.
            notes = "\n\n".join(f"[{v['lens']}] {v['feedback']}"
                                for v in verdicts if v["feedback"])
            feedback = (f"Quality {score:.0f} is below the required {min_quality:.0f} — improve it."
                        + ("\n" + notes if notes else ""))
        else:
            feedback = "The reviewer(s) requested changes:\n" + (review_fb or "(no feedback)")

    ui.error(f"Gave up after {max_attempts} attempts — not merged.")
    log_event(repo, slug, {"event": "gaveup", "attempts": max_attempts})
    _record(repo, {"slug": slug, "branch": branch, "attempts": max_attempts, "merged": False})
    return {"branch": branch, "attempts": max_attempts, "merged": False}
265
+
266
+
267
def _merge(repo, wt, branch, slug, ui) -> bool:
    """Commit the worktree's staged changes and merge `branch` into the repo's checkout.

    Returns True when the merge succeeded. Returns False, after reporting through
    `ui.error`, when the commit failed with work still pending or when the merge failed.
    """
    _stage(wt)
    committed = _git(wt, "commit", "-m", f"leanlab: {slug}")
    if committed.returncode != 0:
        # `git commit` also exits non-zero when there is nothing to commit, which is
        # harmless. If changes are still pending, though, the work never reached the branch
        # and merging would report success without containing it.
        pending = _git(wt, "status", "--porcelain").stdout.strip()
        if pending:
            ui.error("commit failed — not merged: "
                     + (committed.stderr or committed.stdout).strip())
            return False
    r = _git(repo, "merge", "--no-ff", "-m", f"leanlab: merge {slug}", branch)
    if r.returncode != 0:
        ui.error("merge failed (resolve by hand): " + (r.stderr or r.stdout).strip())
        return False
    return True
275
+
276
+
277
class BuildUI:
    """Terminal UI for `leanlab build` — attempt rules, spinners, gate report, merge panel."""

    def __init__(self):
        # rich is imported lazily so the module can be imported without it
        from rich.console import Console
        self.console = Console()

    def attempt(self, n, total):
        """Draw a horizontal rule announcing attempt n of total."""
        heading = f"[bold cyan]Attempt {n}/{total}"
        self.console.rule(heading, style="cyan")

    def status(self, message):
        """Return the console's spinner context manager labelled with `message`."""
        label = f"[bold cyan]{message}"
        return self.console.status(label, spinner="dots")

    def gate(self, result):
        """Print the gate report for `result` on this UI's console."""
        from .gate import report
        report(result, self.console)

    def note(self, message):
        """Print a plain message."""
        self.console.print(message)

    def error(self, message):
        """Print a message in bold red."""
        styled = f"[bold red]{message}[/bold red]"
        self.console.print(styled)

    def success(self, branch, attempts):
        """Print the green panel shown after a successful merge."""
        from rich.panel import Panel
        body = f"Merged [bold]{branch}[/bold] into main after {attempts} attempt(s)."
        panel = Panel(body, title="✓ Task complete", border_style="green")
        self.console.print(panel)
@@ -0,0 +1,63 @@
1
+ """The gate — the deterministic checks a code change must pass.
2
+
3
+ Objective and binary: every configured command must exit 0 (tests incl. the locked
4
+ acceptance tests, plus optional lint / typecheck). Returns a structured GateResult.
5
+ The LLM quality score (the reviewer) is a separate, later concern.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import shlex
11
+ import subprocess
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
# Gate used when the caller passes no commands: a single "tests" step running `pytest -q`.
DEFAULT_GATE = [{"name": "tests", "cmd": "pytest -q"}]
16
+
17
+
18
@dataclass
class GateCheck:
    """Outcome of one gate command, as recorded by run_gate."""

    name: str  # the step's configured name
    ok: bool  # True when the command exited 0
    code: int  # the command's exit code; run_gate records -1 when it could not be run
    output: str  # stripped stdout (plus stderr on a new line), or the could-not-run message
24
+
25
+
26
@dataclass
class GateResult:
    """Overall gate verdict plus the per-command checks it was derived from."""

    passed: bool
    checks: list

    def failures(self):
        """Return the checks whose `ok` flag is false, in their original order."""
        failed = []
        for check in self.checks:
            if not check.ok:
                failed.append(check)
        return failed
33
+
34
+
35
def run_gate(worktree, gate_cmds=None, *, timeout=600) -> GateResult:
    """Run each gate command in the worktree; the change passes only if all exit 0.

    `gate_cmds` is a list of `{"name", "cmd"}` steps (DEFAULT_GATE when empty or None). Each
    command is split shell-style and run with `worktree` as its working directory, subject
    to `timeout` seconds. A command that cannot be run at all (or times out) is recorded as
    a failed check with exit code -1 instead of raising.
    """
    cwd = Path(worktree)
    steps = gate_cmds or DEFAULT_GATE
    checks = []
    for step in steps:
        name, cmd = step["name"], step["cmd"]
        try:
            proc = subprocess.run(shlex.split(cmd), cwd=cwd, capture_output=True,
                                  text=True, timeout=timeout)
            text = proc.stdout
            if proc.stderr:
                text = text + "\n" + proc.stderr
            check = GateCheck(name, proc.returncode == 0, proc.returncode, text.strip())
        except Exception as exc:  # noqa: BLE001 — couldn't even run it
            check = GateCheck(name, False, -1, f"could not run `{cmd}`: {exc}")
        checks.append(check)
    everything_ok = all(check.ok for check in checks)
    return GateResult(passed=everything_ok, checks=checks)
49
+
50
+
51
def report(result: GateResult, console=None):
    """Print a rich pass/fail report.

    One line per check (mark, name, exit code); for a failed check, the last 12 lines of
    its output follow, dimmed. A final line gives the overall verdict. A rich Console is
    created when none is supplied.
    """
    if console is None:
        from rich.console import Console
        console = Console()
    for check in result.checks:
        if check.ok:
            mark = "[green]✓[/green]"
        else:
            mark = "[red]✗[/red]"
        console.print(f"{mark} [bold]{check.name}[/bold] (exit {check.code})")
        if check.ok:
            continue
        last_lines = check.output.splitlines()[-12:]
        tail = "\n".join(last_lines)
        console.print(f"[dim]{tail}[/dim]")
    if result.passed:
        verdict = "[green]GATE PASSED[/green]"
    else:
        verdict = "[red]GATE FAILED[/red]"
    console.print(f"\n[bold]{verdict}[/bold]")
@@ -0,0 +1,23 @@
1
+ """Configurable agent persona sets — which package template each role uses.
2
+
3
+ A lab picks a set ("metric" for the classic Worker/Director/Critic, "coding" for the
4
+ Engineer/Reviewer/Tech-lead). Selectable via lab config and a CLI flag.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from importlib import resources
10
+
11
# persona set -> role -> template file name. spec_text resolves the file under the
# package's templates/agents directory.
PERSONAS = {
    "metric": {"worker": "CLAUDE.md", "director": "director.md", "critic": "critic.md"},
    "coding": {"engineer": "engineer.md", "reviewer": "reviewer.md", "techlead": "techlead.md"},
}
15
+
16
+
17
def spec_text(role: str, persona_set: str = "coding") -> str:
    """Load the template text for a role in a persona set (shipped as package data).

    Returns the template's contents with surrounding whitespace stripped.

    Raises:
        KeyError: if `persona_set` is unknown or has no template for `role`.
    """
    try:
        fname = PERSONAS[persona_set][role]
    except KeyError as e:
        raise KeyError(f"no persona '{role}' in set '{persona_set}'") from e
    template = resources.files("leanlab") / "templates" / "agents" / fname
    # Decode explicitly rather than with the locale default, so the prompt text is the same
    # on every platform. NOTE(review): assumes the shipped templates are UTF-8 — confirm.
    return template.read_text(encoding="utf-8").strip()
@@ -0,0 +1,47 @@
1
+ """The PLAYBOOK — project knowledge the tech-lead maintains and the engineer reads.
2
+
3
+ `.leanlab/PLAYBOOK.md` accumulates conventions, architecture notes, and pitfalls so each
4
+ task starts smarter — the coding lab's version of memory. The engineer reads it; after a
5
+ successful merge the tech-lead rewrites it. (The test "ratchet" is automatic: each merged
6
+ task's locked acceptance tests join the main branch's suite and stay.)
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+
13
+ from ..loop import make_runner
14
+ from .personas import spec_text
15
+
16
+
17
def playbook_path(repo) -> Path:
    """Return the location of the playbook file: `<repo>/.leanlab/PLAYBOOK.md`."""
    return Path(repo).joinpath(".leanlab", "PLAYBOOK.md")
19
+
20
+
21
def read_playbook(repo) -> str:
    """Return the playbook's text with surrounding whitespace stripped, or "" if the file
    does not exist."""
    path = playbook_path(repo)
    if not path.exists():
        return ""
    return path.read_text().strip()
24
+
25
+
26
def update_playbook(repo, *, slug=None, runner=None, ui=None) -> None:
    """Have the tech-lead study recent changes and rewrite .leanlab/PLAYBOOK.md.

    A runner rooted at `repo` is created when none is injected. When a `ui` is given, the
    run is wrapped in its status spinner. `slug` ties the update to the task that triggered
    it: when set, a "playbook" event is logged for that task, so it shows on the task's
    timeline as the tech-lead's step in the loop.
    """
    if not runner:
        runner = make_runner(Path(repo))
    instructions = (
        "Study the recent merged changes (use `git log -p -5` and read key files), then write a "
        "concise `.leanlab/PLAYBOOK.md`: conventions to follow, the architecture map, and "
        "pitfalls already hit, as guidance for the next tasks. Create the `.leanlab` directory if "
        "needed. Write ONLY that file, then stop."
    )
    prompt = spec_text("techlead", "coding") + "\n\n" + instructions
    if ui is None:
        runner.run_plain(prompt)
    else:
        with ui.status("Tech-lead is updating the PLAYBOOK…"):
            runner.run_plain(prompt)
    if slug:
        from .board import log_event  # lazy: board imports playbook, so import here
        log_event(repo, slug, {"event": "playbook"})
@@ -0,0 +1,232 @@
1
+ """Spec a coding task — the spec-writer drafts a spec + acceptance tests in an isolated
2
+ git worktree, loops on the operator's feedback, then LOCKS the tests as the frozen
3
+ criteria the engineer is judged by (and can't change).
4
+
5
+ Realizes the spec-task use case. `runner` / `ui` are injected so it's testable without
6
+ Claude or a terminal.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import json
13
+ import re
14
+ import subprocess
15
+ from pathlib import Path
16
+
17
+ from ..loop import make_runner
18
+ from .board import log_event
19
+
20
+
21
# Filler words dropped from slugs so the name leads with what matters.
_SLUG_STOP = {"a", "an", "the", "to", "of", "for", "in", "on", "and"}


def _slug(task: str, max_len: int = 50) -> str:
    """Derive a short, readable, stable slug from a task description.

    How the name stays meaningful (instead of a blind character chop):
      1. Only the first sentence/line is used — task briefs are often multi-sentence.
      2. Filler words ("a", "the", "to", …) are dropped, unless nothing else is left.
      3. Words are kebab-cased and added whole while they fit in `max_len` — never cut
         mid-word, never a trailing dash. A single first word longer than `max_len` is the
         one exception and is truncated.
    Deterministic: the same task always yields the same slug; an empty result is "task".
    """
    first = re.split(r"[.\n!?]", task.strip(), maxsplit=1)[0]
    words = re.sub(r"[^a-z0-9]+", " ", first.lower()).split()
    meaningful = [w for w in words if w not in _SLUG_STOP]
    if not meaningful:
        meaningful = words

    kept = []
    used = 0
    for word in meaningful:
        needed = len(word) + (1 if kept else 0)  # +1 for the joining dash
        if used + needed > max_len:
            break
        kept.append(word)
        used += needed

    slug = "-".join(kept)
    if not slug and meaningful:  # a single first word longer than max_len
        slug = meaningful[0][:max_len]
    return slug or "task"
46
+
47
+
48
def _git(repo, *args):
    """Run `git -C <repo> <args…>` and return the completed process (output captured as text)."""
    argv = ["git", "-C", str(repo)] + list(args)
    return subprocess.run(argv, capture_output=True, text=True)
50
+
51
+
52
def _is_git_repo(repo) -> bool:
    """True when `git rev-parse --is-inside-work-tree` succeeds for `repo`."""
    probe = _git(repo, "rev-parse", "--is-inside-work-tree")
    return probe.returncode == 0
54
+
55
+
56
def _create_worktree(repo, slug):
    """Create (or reuse) an isolated worktree + branch for this task.

    Makes sure `.leanlab/worktrees/` is listed in the repo's `.gitignore`, then returns
    `(worktree_path, branch_name)`. An existing worktree directory is reused as-is.
    Otherwise a worktree is added on a new `leanlab/<slug>` branch, falling back to
    attaching to that branch if it already exists.

    Raises:
        RuntimeError: if git cannot add the worktree either way.
    """
    root = Path(repo)
    wt = root / ".leanlab" / "worktrees" / slug
    branch = f"leanlab/{slug}"
    gi = root / ".gitignore"
    line = ".leanlab/worktrees/"
    # Read the file ONCE, before opening it for append. Opening creates the file, so
    # inspecting it afterwards would make a brand-new .gitignore look like an existing file
    # without a trailing newline and give it a spurious leading blank line.
    existing = gi.read_text() if gi.exists() else ""
    if line not in existing:
        separator = "\n" if existing and not existing.endswith("\n") else ""
        with gi.open("a") as f:
            f.write(separator + line + "\n")
    if wt.exists():
        return wt, branch
    wt.parent.mkdir(parents=True, exist_ok=True)
    r = _git(repo, "worktree", "add", "-b", branch, str(wt))
    if r.returncode != 0:  # branch may already exist — attach to it
        r2 = _git(repo, "worktree", "add", str(wt), branch)
        if r2.returncode != 0:
            raise RuntimeError("git worktree add failed: " + (r.stderr or r2.stderr).strip())
    return wt, branch
74
+
75
+
76
def _merged_branches(repo):
    """Return the set of branch names printed by `git branch --merged`."""
    # `git branch` marks the current branch with "* " and worktree-checked-out ones with
    # "+ "; the name is after the 2-char marker.
    listing = _git(repo, "branch", "--merged").stdout
    names = set()
    for row in listing.splitlines():
        if row.strip():
            names.add(row[2:].strip())
    return names
81
+
82
+
83
def clean_worktrees(repo, slug=None, *, remove_all=False) -> list[str]:
    """Remove task worktrees + branches. Bulk removes only merged ones unless remove_all.

    With a `slug`, only that task is targeted (if its worktree directory exists). Otherwise
    every worktree directory is a candidate, filtered to merged branches unless `remove_all`
    is set. For each target the worktree is force-removed, the branch is deleted, and the
    task's lock file is unlinked. Returns the slugs that were processed.
    """
    repo = Path(repo).resolve()
    wtroot = repo / ".leanlab" / "worktrees"
    if not wtroot.is_dir():
        return []
    merged = _merged_branches(repo)

    if slug:
        targets = []
        if (wtroot / slug).is_dir():
            targets.append(slug)
    else:
        candidates = [entry.name for entry in sorted(wtroot.iterdir()) if entry.is_dir()]
        if remove_all:
            targets = candidates
        else:
            targets = [name for name in candidates if f"leanlab/{name}" in merged]

    locks_dir = repo / ".leanlab" / "locks"
    removed = []
    for name in targets:
        branch = f"leanlab/{name}"
        # Branch deletion stays safe (-d) unless the caller asked for this task explicitly,
        # asked for everything, or the branch is unmerged.
        if remove_all or slug or branch not in merged:
            delete_flag = "-D"
        else:
            delete_flag = "-d"
        # always --force the worktree: real task worktrees carry an untracked
        # .leanlab-lock.json that would otherwise block removal.
        _git(repo, "worktree", "remove", "--force", str(wtroot / name))
        _git(repo, "branch", delete_flag, branch)
        (locks_dir / f"{name}.json").unlink(missing_ok=True)
        removed.append(name)
    return removed
106
+
107
+
108
def _spec_prompt(task: str, feedback: str | None) -> str:
    """Build the spec-writer prompt for `task`.

    The base prompt states the role, the task, the working instructions and the JSON reply
    schema. When `feedback` is non-empty, the operator's feedback and a request to revise
    are placed in front of the base prompt.
    """
    role = (
        "You are the SPEC-WRITER for a coding lab. Turn the task below into a precise spec and "
        "a set of ACCEPTANCE TESTS that define 'done'. A different agent (the engineer) will be "
        "judged ONLY by these tests and must not change them — so make them concrete, fair, and "
        "runnable.\n\n"
    )
    instructions = (
        "Study the repository in the current directory (read files) to match its language, test "
        "framework, and conventions. Do NOT create or edit any files — return everything in the "
        "JSON. Use one or more acceptance test files as needed. Reply with ONLY this JSON object: "
    )
    schema = (
        '{"spec_md": "<the spec, as markdown>", '
        '"tests": [{"path": "<relative test file path>", "content": "<full file contents>"}]}'
    )
    base = role + f"TASK:\n{task}\n\n" + instructions + schema
    if not feedback:
        return base
    return (f"The operator gave feedback on your previous draft:\n\n{feedback}\n\n"
            f"Revise the spec and tests accordingly. {base}")
125
+
126
+
127
def _usable_test_entry(entry) -> bool:
    """True for a `{"path", "content"}` entry whose path stays inside the worktree."""
    if not isinstance(entry, dict):
        return False
    path, content = entry.get("path"), entry.get("content")
    if not (isinstance(path, str) and path and isinstance(content, str)):
        return False
    rel = Path(path)
    # The path comes from the model, so reject anything anchored (absolute or drive-rooted)
    # and anything that climbs out with "..".
    return not rel.anchor and ".." not in rel.parts


def spec_task(repo, task, *, runner=None, ui=None, yes=False):
    """Draft → approve → lock the acceptance tests for a task. Returns a dict or None.

    The spec-writer drafts a spec and test files inside the task's worktree. The operator
    then approves, gives feedback (which triggers a redraft) or cancels. On approval the
    test files are made read-only and a pristine copy with sha256 digests is stored outside
    the worktree under `.leanlab/locks/<slug>.json`.

    yes=True auto-approves the first draft (headless, for an agent driving leanlab).

    Returns None if `repo` is not a git repository, the draft fails, no usable test files
    come back, or the operator cancels. Otherwise returns a dict with "worktree", "branch"
    and "test_paths".
    """
    repo = Path(repo).resolve()
    ui = ui or SpecUI()
    if not _is_git_repo(repo):
        ui.error("not a git repository — coding labs need git for worktree isolation")
        return None

    slug = _slug(task)
    wt, branch = _create_worktree(repo, slug)
    runner = runner or make_runner(wt)  # the spec-writer works inside the worktree

    feedback = None
    while True:
        with ui.status("Spec-writer is drafting the spec + acceptance tests…"):
            res = runner.run_structured(_spec_prompt(task, feedback), ["spec_md", "tests"])
        if not res.ok:
            ui.error("could not draft the spec — aborting.")
            return None
        drafted = res.data["tests"]
        if not isinstance(drafted, list):
            drafted = []
        files = [t for t in drafted if _usable_test_entry(t)]
        if len(files) < len(drafted):
            ui.note(f"Ignored {len(drafted) - len(files)} acceptance test entries with an "
                    "invalid or unsafe path.")
        if not files:
            ui.error("the spec-writer returned no acceptance test files — aborting.")
            return None
        (wt / "SPEC.md").write_text(res.data["spec_md"])
        for t in files:
            p = wt / t["path"]
            p.parent.mkdir(parents=True, exist_ok=True)
            if p.exists():
                p.chmod(0o644)  # a prior spec run may have locked this file
            # Exact UTF-8 bytes, so the file on disk matches the sha256 stored in the lock.
            p.write_bytes(t["content"].encode())

        ui.spec(res.data["spec_md"])
        if yes:
            action, text = "approve", None
        else:
            action, text = ui.decide("\n\n".join(f"# {t['path']}\n{t['content']}" for t in files))
        if action == "approve":
            # Store the lock + a PRISTINE copy OUTSIDE the worktree, where the engineer (which
            # works inside the worktree) cannot reach it. The build step restores from here.
            locked = [{"path": t["path"], "content": t["content"],
                       "sha256": hashlib.sha256(t["content"].encode()).hexdigest()} for t in files]
            for t in files:
                (wt / t["path"]).chmod(0o444)  # in-tree lock is a cosmetic guardrail only
            locks = repo / ".leanlab" / "locks"
            locks.mkdir(parents=True, exist_ok=True)
            (locks / f"{slug}.json").write_text(json.dumps({"tests": locked}))
            log_event(repo, slug, {"event": "spec", "tests": [t["path"] for t in files]})
            break
        if action == "cancel":
            ui.note("Cancelled — worktree kept, tests not locked.")
            return None
        feedback = text

    ui.success(wt, branch)
    return {"worktree": str(wt), "branch": branch, "test_paths": [t["path"] for t in files]}
186
+
187
+
188
class SpecUI:
    """Terminal UI for `leanlab spec` — spinner, spec panel, arrow-key approve menu."""

    def __init__(self):
        # rich is imported lazily so the module can be imported without it
        from rich.console import Console
        self.console = Console()

    def status(self, message):
        """Return the console's spinner context manager labelled with `message`."""
        label = f"[bold cyan]{message}"
        return self.console.status(label, spinner="dots")

    def note(self, message):
        """Print a plain message."""
        self.console.print(message)

    def error(self, message):
        """Print a message in bold red."""
        styled = f"[bold red]{message}[/bold red]"
        self.console.print(styled)

    def spec(self, spec_md):
        """Render the proposed spec (markdown) inside a magenta panel."""
        from rich.markdown import Markdown
        from rich.panel import Panel
        panel = Panel(Markdown(spec_md), title="Proposed spec", border_style="magenta")
        self.console.print(panel)

    def decide(self, test_code):
        """Ask the operator what to do with the draft.

        Returns `("approve", None)`, `("cancel", None)` (also when the prompt is aborted),
        or `("feedback", text)`. Choosing to view prints `test_code` and asks again.
        """
        import questionary
        from rich.syntax import Syntax
        approve = "✓ Approve & lock the acceptance tests"
        view = "👁 View the acceptance tests"
        feedback = "✍ Give feedback (revise)"
        cancel = "✖ Cancel"
        menu = [approve, view, feedback, cancel]
        while True:
            choice = questionary.select("What now?", choices=menu).ask()
            if choice is None or choice == cancel:
                return ("cancel", None)
            if choice == approve:
                return ("approve", None)
            if choice == view:
                listing = Syntax(test_code, "python", theme="ansi_dark",
                                 line_numbers=True, word_wrap=True)
                self.console.print(listing)
                continue
            answer = questionary.text("Your feedback for the spec-writer:").ask()
            return ("feedback", answer or "")

    def success(self, worktree, branch):
        """Print the green panel shown once the spec is locked."""
        from rich.panel import Panel
        body = (
            f"Spec locked in [bold]{worktree}[/bold]\nbranch [bold]{branch}[/bold]\n\n"
            "Acceptance tests are frozen — the engineer will implement against them."
        )
        self.console.print(Panel(body, title="✓ Spec ready", border_style="green"))