leanlab 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- leanlab/__init__.py +1 -0
- leanlab/cli.py +315 -0
- leanlab/core/__init__.py +1 -0
- leanlab/core/agents/__init__.py +10 -0
- leanlab/core/agents/claude.py +38 -0
- leanlab/core/agents/port.py +49 -0
- leanlab/core/agents/protocol.py +64 -0
- leanlab/core/coding/__init__.py +1 -0
- leanlab/core/coding/board.py +335 -0
- leanlab/core/coding/board_dist/assets/index-BBCkNArL.css +1 -0
- leanlab/core/coding/board_dist/assets/index-CNGMDAuO.js +40 -0
- leanlab/core/coding/board_dist/index.html +13 -0
- leanlab/core/coding/engineer.py +304 -0
- leanlab/core/coding/gate.py +63 -0
- leanlab/core/coding/personas.py +23 -0
- leanlab/core/coding/playbook.py +47 -0
- leanlab/core/coding/spec.py +232 -0
- leanlab/core/doctor.py +220 -0
- leanlab/core/init.py +219 -0
- leanlab/core/loop.py +374 -0
- leanlab/core/monitor.py +553 -0
- leanlab/templates/agents/CLAUDE.md +52 -0
- leanlab/templates/agents/critic.md +38 -0
- leanlab/templates/agents/director.md +37 -0
- leanlab/templates/agents/engineer.md +12 -0
- leanlab/templates/agents/reviewer.md +34 -0
- leanlab/templates/agents/techlead.md +7 -0
- leanlab/templates/skill/SKILL.md +99 -0
- leanlab-0.2.1.dist-info/METADATA +273 -0
- leanlab-0.2.1.dist-info/RECORD +33 -0
- leanlab-0.2.1.dist-info/WHEEL +4 -0
- leanlab-0.2.1.dist-info/entry_points.txt +2 -0
- leanlab-0.2.1.dist-info/licenses/LICENSE +21 -0
leanlab/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""leanlab — a self-improving experiment-lab tool."""
|
leanlab/cli.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""leanlab CLI — init, run, and serve labs.
|
|
2
|
+
|
|
3
|
+
leanlab is a research tool used *inside* another project (like archik). Labs live in
|
|
4
|
+
that project's `.leanlab/<name>/` folder and hold only the task-specific files the user
|
|
5
|
+
edits: task.md, lab.json, evaluation.py, validate.py. The engine (loop + monitor) and the
|
|
6
|
+
fixed agent specs live in the installed package and are never copied into the project.
|
|
7
|
+
|
|
8
|
+
Usage (inside your project):
|
|
9
|
+
uvx leanlab init <name> # interactively scaffold a lab (Claude drafts it)
|
|
10
|
+
uvx leanlab run <lab> --n 5 # run N experiments
|
|
11
|
+
uvx leanlab serve <lab> # open the dashboard
|
|
12
|
+
uvx leanlab list # list labs in this project
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import subprocess
|
|
20
|
+
import sys
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from .core.init import run_init
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _version() -> str:
    """Return the version string of the installed ``leanlab`` distribution.

    When package metadata cannot be consulted — ``importlib.metadata`` is unavailable, or
    leanlab is running from an uninstalled checkout — the hard-coded release string is
    returned instead.
    """
    fallback = "0.2.1"
    try:
        from importlib import metadata
    except ImportError:
        return fallback
    try:
        return metadata.version("leanlab")
    except metadata.PackageNotFoundError:
        return fallback
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def labs_dir() -> Path:
    """Return the ``.leanlab`` folder of the project leanlab is run inside (the current directory)."""
    project_root = Path.cwd()
    return project_root.joinpath(".leanlab")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _install_agent_skill(project: Path) -> Path:
    """Copy the packaged agent skill into ``project`` and return the path that was written.

    The bundled ``templates/skill/SKILL.md`` is read from the installed ``leanlab`` package
    and written to ``<project>/.claude/skills/leanlab/SKILL.md``. Missing parent folders are
    created and an existing file is overwritten.
    """
    from importlib import resources

    template = resources.files("leanlab") / "templates" / "skill" / "SKILL.md"
    # Be explicit about UTF-8: the locale default (e.g. cp1252 on Windows) cannot be relied
    # on to round-trip a Markdown file that may contain non-ASCII characters.
    src = template.read_text(encoding="utf-8")
    dest = project / ".claude" / "skills" / "leanlab" / "SKILL.md"
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text(src, encoding="utf-8")
    return dest
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Sentinel that brackets the leanlab guidance block inside a project's CLAUDE.md; its
# presence is how _append_claude_md() detects that the block was already added.
_CLAUDE_MD_MARKER = "<!-- leanlab:agent -->"
_CLAUDE_MD_BLOCK = (
    f"{_CLAUDE_MD_MARKER}\n"
    "## Coding tasks — use leanlab\n\n"
    "For feature / fix / refactor work on this repo, drive it through **leanlab** (the honest, "
    "test-gated loop) instead of editing files directly. See `.claude/skills/leanlab/SKILL.md`.\n"
    "Flow: `leanlab spec \"<task>\" --yes` → read the `slug:` line → `leanlab build <slug>` → report.\n"
    f"{_CLAUDE_MD_MARKER}\n"
)


def _append_claude_md(project: Path) -> bool:
    """Append the leanlab guidance block to ``<project>/CLAUDE.md`` (idempotent).

    The file is created when missing. If it already contains the marker it is left untouched.

    Returns:
        True if the file was written, False if the guidance was already present.
    """
    p = project / "CLAUDE.md"
    # The guidance block contains non-ASCII characters ("—", "→"); reading and writing with
    # the locale default encoding fails on non-UTF-8 locales, so UTF-8 is used explicitly.
    text = p.read_text(encoding="utf-8") if p.exists() else ""
    if _CLAUDE_MD_MARKER in text:
        return False
    # Make sure existing content ends with a newline, then leave one blank line before the block.
    sep = "" if (not text or text.endswith("\n")) else "\n"
    p.write_text(text + sep + "\n" + _CLAUDE_MD_BLOCK, encoding="utf-8")
    return True
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def cmd_init(args):
    """Handle ``leanlab init``: install the agent skill, or scaffold a new lab.

    With ``--for-agent`` the packaged skill is written under the current directory's
    ``.claude/`` folder and the leanlab guidance block is appended to ``CLAUDE.md``; nothing
    else happens. Otherwise a lab name is required, the lab folder must not exist yet, and a
    task description is taken from ``--describe`` or asked for interactively before handing
    over to ``run_init``. Each validation failure prints an error to stderr and exits with
    status 1.
    """
    if args.for_agent:
        project = Path.cwd()
        dest = _install_agent_skill(project)
        print(f"✓ leanlab skill installed at {dest}")
        if _append_claude_md(project):
            print("✓ added leanlab guidance to CLAUDE.md")
        else:
            print("• CLAUDE.md already mentions leanlab — left it as is")
        print("Claude Code in this project can now drive leanlab (spec → build → merge).")
        return
    if not args.name:
        print("ERROR: a lab name is required (or use `leanlab init --for-agent`).", file=sys.stderr)
        sys.exit(1)
    lab = labs_dir() / args.name
    if lab.exists():
        print(f"ERROR: {lab} already exists.", file=sys.stderr)
        sys.exit(1)
    description = args.describe
    if not description:
        # Import the prompt library only when a prompt is actually needed, so argument
        # errors and headless runs (--describe) never depend on it.
        import questionary
        description = questionary.text("Describe the task you want to research:").ask()
    # .ask() may yield None (presumably when the prompt is cancelled); treat it as empty.
    description = (description or "").strip()
    if not description:
        print("ERROR: a task description is required.", file=sys.stderr)
        sys.exit(1)
    run_init(lab, args.name, description, yes=args.yes)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def cmd_spec(args):
    """Handle ``leanlab spec``: run ``spec_task`` for the task text in the current directory.

    On a truthy result the worktree's folder name is printed as a ``slug:`` line and the
    process exits 0; a falsy result exits 1.
    """
    from .core.coding.spec import spec_task

    outcome = spec_task(Path.cwd(), args.task, yes=args.yes)
    if not outcome:
        sys.exit(1)
    # plain line for an agent to parse
    slug = Path(outcome['worktree']).name
    print(f"slug: {slug}")
    sys.exit(0)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def cmd_build(args):
    """Handle ``leanlab build``: call ``build_task`` for a task slug and exit on the outcome.

    The gate always contains a "tests" command (``--test-cmd`` or ``pytest -q``) and, when
    ``--lint-cmd`` is given, a "lint" command. Exits 0 only when the result reports
    ``merged``; otherwise exits 1.
    """
    from .core.coding.engineer import build_task

    gate_cmds = [{"name": "tests", "cmd": args.test_cmd or "pytest -q"}]
    if args.lint_cmd:
        gate_cmds += [{"name": "lint", "cmd": args.lint_cmd}]
    options = {
        "gate_cmds": gate_cmds,
        "persona_set": args.persona_set,
        "max_attempts": args.max_attempts,
        "playbook": not args.no_playbook,
        "min_quality": args.min_quality,
        "isolate": not args.no_isolate,
        "accept_cmd": args.accept_cmd,
        "reviewers": args.reviewers,
    }
    outcome = build_task(Path.cwd(), args.slug, **options)
    merged = bool(outcome) and bool(outcome.get("merged"))
    sys.exit(0 if merged else 1)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def cmd_clean(args):
    """Handle ``leanlab clean``: call ``clean_worktrees`` and print what it reports as removed."""
    from .core.coding.spec import clean_worktrees

    removed = clean_worktrees(Path.cwd(), args.slug, remove_all=args.all)
    names = ", ".join(removed) or "(none)"
    print(f"removed {len(removed)} worktree(s): {names}")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def cmd_board(args):
    """Handle ``leanlab board``: call ``serve_board`` for the current directory.

    Port 8766 is used when ``--port`` is unset (0); the browser is opened unless
    ``--no-open`` was passed.
    """
    from .core.coding.board import serve_board

    port = args.port if args.port else 8766
    open_browser = not args.no_open
    serve_board(Path.cwd(), port=port, open_browser=open_browser)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def cmd_gate(args):
    """Handle ``leanlab gate``: run the gate commands in a task's worktree and report.

    The worktree is ``.leanlab/worktrees/<slug>``; a missing worktree is an error (exit 1).
    Exits 0 when the gate result's ``passed`` attribute is truthy, 1 otherwise.
    """
    from .core.coding.gate import run_gate, report

    worktree = labs_dir() / "worktrees" / args.slug
    if not worktree.is_dir():
        print(f"ERROR: no worktree at {worktree} — run `leanlab spec` first.", file=sys.stderr)
        sys.exit(1)
    test_cmd = args.test_cmd or "pytest -q"
    gate_cmds = [{"name": "tests", "cmd": test_cmd}]
    if args.lint_cmd:
        gate_cmds += [{"name": "lint", "cmd": args.lint_cmd}]
    outcome = run_gate(worktree, gate_cmds)
    report(outcome)
    sys.exit(1 if not outcome.passed else 0)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def cmd_check(args):
    """Handle ``leanlab check``: run the doctor checks on a lab, print the report, exit 0/1."""
    from .core.doctor import check_lab, RichReport, ok

    results = check_lab(_resolve_lab(args.lab))
    RichReport().report(results)
    exit_code = 0 if ok(results) else 1
    sys.exit(exit_code)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def cmd_fix(args):
    """Handle ``leanlab fix``: call ``fix_lab`` on the resolved lab; exit 0 if it returns truthy, else 1."""
    from .core.doctor import fix_lab

    fixed = fix_lab(_resolve_lab(args.lab))
    sys.exit(0 if fixed else 1)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def cmd_run(args):
    """Handle ``leanlab run``: preflight the lab, then run the loop module in a subprocess.

    The doctor checks are skipped with ``--skip-checks`` or ``--dry-run``. Failing checks
    print the report plus a hint and exit 1. Otherwise ``leanlab.core.loop`` is launched
    with the current interpreter and its return code becomes this process's exit status.
    """
    lab = _resolve_lab(args.lab)
    run_preflight = not (args.skip_checks or args.dry_run)
    if run_preflight:
        from .core.doctor import check_lab, RichReport, ok

        checks = check_lab(lab)
        if not ok(checks):
            RichReport().report(checks)
            hint = (f"\nRun blocked by failed checks. Fix them with: leanlab fix {args.lab}\n"
                    "(or rerun with --skip-checks to ignore)")
            print(hint, file=sys.stderr)
            sys.exit(1)
    loop_cmd = [sys.executable, "-m", "leanlab.core.loop", "--lab", str(lab), "--n", str(args.n)]
    if args.dry_run:
        loop_cmd += ["--dry-run"]
    completed = subprocess.run(loop_cmd)
    sys.exit(completed.returncode)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def cmd_serve(args):
    """Handle ``leanlab serve``: run the monitor module for a lab in a subprocess.

    ``--port`` is forwarded only when non-zero. The subprocess's return code becomes this
    process's exit status.
    """
    lab = _resolve_lab(args.lab)
    monitor_cmd = [sys.executable, "-m", "leanlab.core.monitor", "--lab", str(lab)]
    if args.port:
        monitor_cmd.extend(["--port", str(args.port)])
    completed = subprocess.run(monitor_cmd)
    sys.exit(completed.returncode)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def cmd_list(_args):
    """Handle ``leanlab list``: print one line per lab found under ``.leanlab/``.

    A lab is any direct sub-folder containing a ``lab.json``. Each line shows the folder
    name and the ``direction`` / ``metric`` of the config's ``objective``. A ``lab.json``
    that cannot be read or parsed is reported on stderr and skipped, so one broken lab does
    not hide the others. When no lab is found — including when ``.leanlab/`` exists but
    holds none — a hint to run ``leanlab init`` is printed.
    """
    base = labs_dir()
    found = False
    if base.is_dir():
        for d in sorted(base.iterdir()):
            cfg_path = d / "lab.json"
            if not cfg_path.is_file():
                continue
            found = True
            try:
                cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
            except (OSError, ValueError) as err:
                # ValueError covers both malformed JSON and undecodable bytes.
                print(f"  {d.name:20} (unreadable lab.json: {err})", file=sys.stderr)
                continue
            # Tolerate configs whose top level or "objective" is not a JSON object.
            obj = cfg.get("objective") if isinstance(cfg, dict) else None
            if not isinstance(obj, dict):
                obj = {}
            print(f"  {d.name:20} objective: {obj.get('direction')} {obj.get('metric')}")
    if not found:
        print("(no labs yet — run `leanlab init <name>`)")
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _evaluator(lab):
    """Return the path of ``evaluation.py`` inside ``lab``.

    If the file does not exist, an error is printed to stderr and the process exits with
    status 1.
    """
    candidate = lab / "evaluation.py"
    if candidate.exists():
        return candidate
    print(f"ERROR: no evaluation.py in {lab}", file=sys.stderr)
    sys.exit(1)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def cmd_lock(args):
    """Set the lab's evaluation.py to mode 0o444 (read-only) to guard against accidental edits.

    This is a speed bump, not a sandbox: an agent running as the same user with full tools
    could chmod the file back. For a hard wall, run the Worker as a separate user.
    """
    lab = _resolve_lab(args.lab)
    evaluator = _evaluator(lab)
    evaluator.chmod(0o444)
    print(f"🔒 locked {evaluator} read-only. Run `leanlab unlock {args.lab}` to edit it.")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def cmd_unlock(args):
    """Set the lab's evaluation.py to mode 0o644 so it can be edited again."""
    lab = _resolve_lab(args.lab)
    evaluator = _evaluator(lab)
    evaluator.chmod(0o644)
    print(f"🔓 unlocked {evaluator}. Edit it, then `leanlab lock {args.lab}` again.")
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _resolve_lab(name):
    """Resolve a lab argument to an absolute lab folder.

    ``name`` is tried first as a path to a folder containing ``lab.json``, then as a lab
    name under ``.leanlab/``. If neither matches, an error is printed to stderr and the
    process exits with status 1.
    """
    direct = Path(name)
    if direct.exists() and (direct / "lab.json").exists():
        return direct.resolve()
    named = labs_dir() / name
    if (named / "lab.json").exists():
        return named.resolve()
    print(f"ERROR: lab '{name}' not found in {labs_dir()}.", file=sys.stderr)
    sys.exit(1)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def main():
    """Build the ``leanlab`` argument parser, parse the command line, and dispatch.

    Every sub-command stores its handler in ``func`` via ``set_defaults``; after parsing,
    that handler is called with the parsed namespace.
    """
    parser = argparse.ArgumentParser(description="leanlab CLI")
    parser.add_argument("--version", action="version", version=f"leanlab {_version()}")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # init
    init_p = commands.add_parser(
        "init", help="scaffold a lab (Claude drafts it), or --for-agent to install the skill")
    init_p.add_argument("name", nargs="?", default=None)
    init_p.add_argument(
        "--for-agent", action="store_true",
        help="install the leanlab skill into .claude/ so Claude Code can drive leanlab")
    init_p.add_argument("--describe", default=None, help="task description (skips the prompt)")
    init_p.add_argument("--yes", action="store_true",
                        help="auto-approve the drafted evaluator (headless)")
    init_p.set_defaults(func=cmd_init)

    # run
    run_p = commands.add_parser("run", help="run N experiments in a lab")
    run_p.add_argument("lab")
    run_p.add_argument("--n", type=int, default=5)
    run_p.add_argument("--dry-run", action="store_true")
    run_p.add_argument("--skip-checks", action="store_true",
                       help="skip the preflight doctor checks")
    run_p.set_defaults(func=cmd_run)

    # spec
    spec_p = commands.add_parser(
        "spec", help="(coding) turn a task into approved, locked acceptance tests")
    spec_p.add_argument("task", help='the coding task, e.g. "create a /health endpoint"')
    spec_p.add_argument("--yes", action="store_true",
                        help="auto-approve the drafted tests (headless)")
    spec_p.set_defaults(func=cmd_spec)

    # build
    build_p = commands.add_parser(
        "build", help="(coding) engineer the task to a green gate + review, then merge")
    build_p.add_argument("slug", help="the task worktree name under .leanlab/worktrees/")
    build_p.add_argument("--persona-set", default="coding",
                         help="agent persona set (default: coding)")
    build_p.add_argument("--max-attempts", type=int, default=5)
    build_p.add_argument("--test-cmd", default=None, help="test command (default: pytest -q)")
    build_p.add_argument("--lint-cmd", default=None, help="optional lint/typecheck command")
    build_p.add_argument("--no-playbook", action="store_true",
                         help="skip the tech-lead PLAYBOOK update")
    build_p.add_argument("--min-quality", type=float, default=0,
                         help="reject merges below this 0-100 quality score")
    build_p.add_argument(
        "--reviewers", type=int, default=1,
        help="adversarial reviewer panel size; >1 runs that many reviewers with "
             "different lenses (correctness/spec/security/robustness) and merges only "
             "if all approve")
    build_p.add_argument(
        "--no-isolate", action="store_true",
        help="skip the isolated acceptance re-run (which disables engineer conftest)")
    build_p.add_argument(
        "--accept-cmd", default="pytest --noconftest -q",
        help="isolated acceptance command (pristine test paths are appended)")
    build_p.set_defaults(func=cmd_build)

    # clean
    clean_p = commands.add_parser(
        "clean", help="(coding) remove task worktrees + branches (merged only by default)")
    clean_p.add_argument("slug", nargs="?", default=None,
                         help="a specific task to remove (force)")
    clean_p.add_argument("--all", action="store_true",
                         help="remove ALL task worktrees (force)")
    clean_p.set_defaults(func=cmd_clean)

    # board
    board_p = commands.add_parser(
        "board", help="(coding) live dashboard of tasks, status, and the playbook")
    board_p.add_argument("--port", type=int, default=0)
    board_p.add_argument("--no-open", action="store_true")
    board_p.set_defaults(func=cmd_board)

    # gate
    gate_p = commands.add_parser(
        "gate", help="(coding) run the pass/fail gate on a task's worktree")
    gate_p.add_argument("slug", help="the task worktree name under .leanlab/worktrees/")
    gate_p.add_argument("--test-cmd", default=None, help="test command (default: pytest -q)")
    gate_p.add_argument("--lint-cmd", default=None, help="optional lint/typecheck command")
    gate_p.set_defaults(func=cmd_gate)

    # check
    check_p = commands.add_parser("check", help="preflight: verify the lab is wired correctly")
    check_p.add_argument("lab")
    check_p.set_defaults(func=cmd_check)

    # fix
    fix_p = commands.add_parser(
        "fix", help="use Claude to fix lab wiring problems the checks found")
    fix_p.add_argument("lab")
    fix_p.set_defaults(func=cmd_fix)

    # serve
    serve_p = commands.add_parser("serve", help="open the dashboard for a lab")
    serve_p.add_argument("lab")
    serve_p.add_argument("--port", type=int, default=0)
    serve_p.set_defaults(func=cmd_serve)

    # list
    list_p = commands.add_parser("list", help="list labs")
    list_p.set_defaults(func=cmd_list)

    # lock
    lock_p = commands.add_parser("lock", help="make a lab's evaluation.py read-only (frozen)")
    lock_p.add_argument("lab")
    lock_p.set_defaults(func=cmd_lock)

    # unlock
    unlock_p = commands.add_parser(
        "unlock", help="restore write access to a lab's evaluation.py")
    unlock_p.add_argument("lab")
    unlock_p.set_defaults(func=cmd_unlock)

    namespace = parser.parse_args()
    namespace.func(namespace)


if __name__ == "__main__":
    main()
|
leanlab/core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""leanlab engine — the generic loop, dashboard, and agent layer."""
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Agent ports & adapters — the backend-agnostic agent layer the loop depends on."""
|
|
2
|
+
|
|
3
|
+
from .claude import ClaudeAgent
|
|
4
|
+
from .port import AgentResult, AgentRunner, AgentTransport
|
|
5
|
+
from .protocol import StructuredRunner, extract_json
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"AgentResult", "AgentRunner", "AgentTransport",
|
|
9
|
+
"StructuredRunner", "extract_json", "ClaudeAgent",
|
|
10
|
+
]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""ClaudeAgent — an AgentTransport backed by the Claude Code CLI (claude -p).
|
|
2
|
+
|
|
3
|
+
One concrete backend. Hermes / custom backends would be sibling AgentTransports;
|
|
4
|
+
the loop, which depends only on AgentRunner, would not change.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import subprocess
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .port import AgentTransport
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ClaudeAgent(AgentTransport):
    """Runs one agent turn via `claude -p --output-format json`, in a lab's cwd."""

    def __init__(self, cwd, *, max_turns: int = 250, permission_mode: str = "bypassPermissions"):
        """Remember where and how to invoke the CLI.

        Args:
            cwd: Working directory the ``claude`` process is started in.
            max_turns: Value passed to ``--max-turns``.
            permission_mode: Value passed to ``--permission-mode``.
        """
        self._cwd = Path(cwd)
        self._max_turns = max_turns
        self._permission_mode = permission_mode

    def send(self, prompt: str, *, session: str | None = None) -> tuple[str | None, str]:
        """Run one ``claude -p`` turn and return ``(session_id, text)``.

        Args:
            prompt: The prompt passed to ``claude -p``.
            session: When given, the turn resumes that session via ``--resume``.

        Returns:
            The envelope's ``session_id`` and ``result`` text. If stdout is not a JSON
            object, ``(None, <raw stdout>)`` is returned so the caller can treat the reply
            as malformed.

        Raises:
            RuntimeError: The CLI exited non-zero without writing anything to stdout.
        """
        cmd = ["claude", "-p", prompt,
               "--permission-mode", self._permission_mode,
               "--max-turns", str(self._max_turns),
               "--output-format", "json"]
        if session:
            cmd += ["--resume", session]
        # Decode as UTF-8 explicitly: JSON output is UTF-8, while text=True alone would use
        # the locale encoding (e.g. cp1252 on Windows) and corrupt or reject non-ASCII replies.
        proc = subprocess.run(cmd, cwd=self._cwd, capture_output=True, text=True,
                              encoding="utf-8")
        if proc.returncode != 0 and not proc.stdout.strip():
            raise RuntimeError(proc.stderr.strip() or "claude CLI failed")
        try:
            env = json.loads(proc.stdout)
        except json.JSONDecodeError:
            return None, proc.stdout  # let the protocol treat it as malformed
        if not isinstance(env, dict):
            # Valid JSON but not an object (e.g. an array): there is no envelope to read,
            # so hand back the raw text instead of crashing on .get().
            return None, proc.stdout
        # A present-but-null "result" must still come back as text, not None.
        return env.get("session_id"), env.get("result") or ""
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Agent ports — the abstraction the loop depends on (Dependency Inversion).
|
|
2
|
+
|
|
3
|
+
The loop never talks to a concrete agent backend. It depends on `AgentRunner`.
|
|
4
|
+
A backend implements the low-level `AgentTransport` (send one prompt, get text);
|
|
5
|
+
`StructuredRunner` (protocol.py) adapts a transport into an `AgentRunner` by
|
|
6
|
+
adding JSON validation and retry. Swapping in Hermes or a custom backend means a
|
|
7
|
+
new `AgentTransport` — nothing in the loop changes (open/closed).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from abc import ABC, abstractmethod
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class AgentResult:
    """What a single structured agent turn produced."""

    # Validated JSON the agent returned ({} if it never produced valid output).
    data: dict
    # Identifier used to resume the same agent session.
    session_id: str | None = None
    # The agent's last raw reply, kept for diagnostics.
    raw: str = ""

    @property
    def ok(self) -> bool:
        """True when ``data`` is non-empty, i.e. the turn yielded a usable payload."""
        return True if self.data else False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class AgentTransport(ABC):
    """Low-level backend port: deliver one prompt, receive ``(session_id, text)``."""

    @abstractmethod
    def send(self, prompt: str, *, session: str | None = None) -> tuple[str | None, str]:
        """Execute a single turn and return ``(session_id, final_text)``.

        Implementations raise when the transport itself fails.
        """
        raise NotImplementedError()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class AgentRunner(ABC):
    """High-level port the loop depends on: structured turns and fire-and-forget turns."""

    @abstractmethod
    def run_structured(self, prompt: str, required_keys, *, session: str | None = None) -> AgentResult:
        """Execute a turn whose reply must be a JSON object containing every one of `required_keys`."""
        raise NotImplementedError()

    @abstractmethod
    def run_plain(self, prompt: str) -> None:
        """Execute a fire-and-forget turn (e.g. the Director or Critic writing a file)."""
        raise NotImplementedError()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""StructuredRunner — turns any AgentTransport into an AgentRunner.
|
|
2
|
+
|
|
3
|
+
It enforces the structured-output contract: parse the agent's reply as JSON,
|
|
4
|
+
check the required keys, and on malformed output re-prompt the SAME session
|
|
5
|
+
("reply with ONLY that JSON object") up to a retry limit. This is the one place
|
|
6
|
+
the "agent must return valid structured output" rule lives.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from .port import AgentResult, AgentRunner, AgentTransport
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def extract_json(text: str, required_keys) -> dict | None:
    """Return a JSON object in `text` containing all `required_keys`, or None.

    The whole (stripped) text is tried first. If that is not a matching object, every
    embedded JSON object is considered — agents sometimes wrap the object in prose or code
    fences — and the last matching one in the text wins. Embedded objects are found with
    the JSON decoder itself, so nested objects and braces inside string values are handled.
    When an embedded object does not match, the objects nested inside it are considered too.

    Args:
        text: The agent's raw reply; empty or None yields None.
        required_keys: Iterable of keys the returned object must contain.

    Returns:
        The matching dict, or None if no JSON object in `text` has all the keys.
    """
    if not text:
        return None
    text = text.strip()
    # Materialise once so a one-shot iterable can be checked against several candidates.
    required = tuple(required_keys)

    def _matches(obj) -> bool:
        return isinstance(obj, dict) and all(k in obj for k in required)

    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if _matches(whole):
        return whole

    decoder = json.JSONDecoder()
    found = None
    pos = text.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not the start of a valid object; try the next opening brace.
            pos = text.find("{", pos + 1)
            continue
        if _matches(obj):
            found = obj  # keep scanning: a later match takes precedence
            pos = text.find("{", end)
        else:
            # The object lacks the keys; one nested inside it may still have them.
            pos = text.find("{", pos + 1)
    return found
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class StructuredRunner(AgentRunner):
    """Wraps a transport: validate the JSON reply, re-prompt on malformed output."""

    def __init__(self, transport: AgentTransport, max_retries: int = 2):
        """Wrap `transport`; a malformed reply is re-prompted at most `max_retries` times."""
        self._transport = transport
        self._max_retries = max_retries

    def run_structured(self, prompt, required_keys, *, session=None) -> AgentResult:
        """Run a turn and return the JSON object containing all `required_keys`.

        The prompt is sent once, then up to `max_retries` more times with a correction
        message while the reply does not contain a matching JSON object.

        Args:
            prompt: The task prompt.
            required_keys: Keys the returned JSON object must contain.
            session: Optional session id to resume.

        Returns:
            An `AgentResult` holding the parsed object, or empty `data` when no attempt
            produced valid output. `session_id` is the last session id known and `raw` is
            the last reply text.
        """
        # Materialise once: the keys are needed on every attempt and in the retry message.
        required = list(required_keys)
        correction = (
            "Your last reply was NOT a valid JSON object with keys "
            f"{required}. Reply with ONLY that JSON object — no prose, no code fence."
        )
        attempt_prompt, session_id, last_text = prompt, session, ""
        for _attempt in range(self._max_retries + 1):
            new_session, last_text = self._transport.send(attempt_prompt, session=session_id)
            # A transport may return no session id for a malformed reply; keep the last
            # known one so the retry really lands in the SAME session.
            session_id = new_session or session_id
            data = extract_json(last_text, required)
            if data is not None:
                return AgentResult(data=data, session_id=session_id, raw=last_text)
            if session_id:
                # Malformed — correct and retry in the SAME session.
                attempt_prompt = correction
            else:
                # No session to resume: the next turn starts without context, so the
                # correction alone would be meaningless. Restate the task with it.
                attempt_prompt = f"{prompt}\n\n{correction}"
        return AgentResult(data={}, session_id=session_id, raw=last_text)

    def run_plain(self, prompt) -> None:
        """Send `prompt` as a fire-and-forget turn; the reply is discarded."""
        self._transport.send(prompt)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Coding lab — Worker/Director/Critic over a real repo, judged by tests + quality."""
|