director-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
director/plan.py ADDED
@@ -0,0 +1,523 @@
1
+ """`director plan` — brainstorm → spec → decompose → test-gated DAG.
2
+
3
+ Phase 2.5 turns planning into a re-entrant pipeline with two artifact-based
4
+ approval gates. director writes an artifact and then either pauses (interactive)
5
+ or auto-approves (`--auto`); the human and the self-critic are mechanically the
6
+ same gate — both read an artifact, decide, and continue.
7
+
8
+ Stage 0 job branch + synced agents (so `--agent <role>` resolves correctly)
9
+ recon explorer (cheap) reads the repo → .director/recon.md
10
+ Stage A planner-tier brainstorm/spec → .director/spec.md → GATE 1
11
+ Stage B planner decomposes the SPEC → .director/plan.json
12
+ Stage C test-author writes failing tests (committed, hashed) → GATE 2
13
+ READY approved; `director run` may execute
14
+
15
+ Resumption is driven by `.director/plan_stage.json`. `director plan "<task>"`
16
+ starts fresh; `director plan --continue` advances the current gate; `--auto`
17
+ swaps a planner self-critique into the gate so nothing blocks.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ import json
24
+ import re
25
+ import subprocess
26
+ import time
27
+ from dataclasses import asdict, dataclass
28
+ from pathlib import Path
29
+
30
+ from director import gitutil, setup
31
+ from director.config import Config
32
+ from director.cost import CostLedger
33
+ from director.dag import topo_order, validate
34
+ from director.models import Node, Plan
35
+ from director.opencode import run_agent
36
+ from director.setup import sync_agents
37
+
38
+ # Pipeline stages persisted to .director/plan_stage.json. SPEC/DECOMPOSE are
39
+ # transient (executed then advanced in one invocation); GATE_SPEC/GATE_PLAN/READY
40
+ # are the points where an invocation can stop.
41
# Stage identifiers, exactly as they are stored in .director/plan_stage.json.
SPEC = "spec"
GATE_SPEC = "gate_spec"
DECOMPOSE = "decompose"
GATE_PLAN = "gate_plan"
READY = "ready"
48
+
49
+
50
@dataclass
class PlanProgress:
    """Planning-pipeline progress persisted to ``.director/plan_stage.json``.

    The fields are serialized verbatim with ``dataclasses.asdict``; ``stage``
    holds one of the module-level stage identifiers.
    """

    job_id: str
    task: str
    job_branch: str
    stage: str
    auto: bool
    critique: bool

    @staticmethod
    def path(repo: Path) -> Path:
        """Return the location of the progress file inside *repo*."""
        return Path(repo) / ".director" / "plan_stage.json"

    @classmethod
    def load(cls, repo: Path) -> PlanProgress | None:
        """Read the progress file, or return ``None`` when it does not exist.

        Keys that are not declared fields are ignored so that a file carrying
        extra keys still loads instead of failing with ``TypeError``.
        """
        from dataclasses import fields

        p = cls.path(repo)
        if not p.exists():
            return None
        raw = json.loads(p.read_text(encoding="utf-8"))
        known = {f.name for f in fields(cls)}
        return cls(**{key: value for key, value in raw.items() if key in known})

    def save(self, repo: Path) -> None:
        """Write this progress record, creating ``.director`` if needed."""
        p = self.path(repo)
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text(json.dumps(asdict(self), indent=2), encoding="utf-8")
74
+
75
+
76
@dataclass
class PlanResult:
    """Value returned by the planning driver for one invocation."""

    # True = stopped at a human gate; False = reached READY
    paused: bool
    stage: str
    job_id: str
    job_branch: str
    n_nodes: int
    # path the human should review next (when paused)
    artifact: str
    message: str
85
+
86
+
87
+ # --------------------------------------------------------------------------- #
88
+ # prompts
89
+ # --------------------------------------------------------------------------- #
90
+ def _explorer_prompt(task: str) -> str:
91
+ return (
92
+ f"Recon for this task — read-only. Produce the relevant-files summary "
93
+ f"per your instructions.\n\nTASK:\n{task}"
94
+ )
95
+
96
+
97
+ def _brainstorm_prompt(task: str, summary: str) -> str:
98
+ return (
99
+ "Produce the design spec for this task per your instructions. Output ONLY "
100
+ "the spec Markdown.\n\n"
101
+ f"TASK:\n{task}\n\n"
102
+ f"REPO RECON SUMMARY:\n{summary}\n"
103
+ )
104
+
105
+
106
+ def _spec_critique_prompt(task: str, spec: str) -> str:
107
+ return (
108
+ "Self-critique pass. Silently re-read the spec below against the ORIGINAL "
109
+ "request and note anything missing, ambiguous, or contradictory. Then output "
110
+ "the REVISED spec that fixes those issues.\n"
111
+ "Output ONLY the final revised spec, in the same Markdown format and starting "
112
+ "at its `# Spec:` heading. Do NOT include your critique notes, a changelog, "
113
+ "or any preamble — the output replaces the spec file verbatim.\n\n"
114
+ f"ORIGINAL REQUEST:\n{task}\n\n"
115
+ f"CURRENT SPEC:\n{spec}\n"
116
+ )
117
+
118
+
119
+ def _planner_prompt(spec: str, summary: str) -> str:
120
+ return (
121
+ "Decompose the APPROVED SPEC below into a strict-JSON DAG per your "
122
+ "instructions. Build from the spec, not from a raw task. Output ONLY the "
123
+ "JSON object.\n\n"
124
+ f"APPROVED SPEC:\n{spec}\n\n"
125
+ f"REPO RECON SUMMARY:\n{summary}\n"
126
+ )
127
+
128
+
129
+ def _plan_critique_prompt(spec: str, plan_json: str) -> str:
130
+ return (
131
+ "Self-critique pass on your own DAG. Re-read the plan below against the "
132
+ "approved spec: are any acceptance criteria unaddressed? any node "
133
+ "under-specified for a junior engineer? any two independent nodes sharing "
134
+ "a file? \n"
135
+ "Respond with a SINGLE strict-JSON object and nothing else:\n"
136
+ ' {"revised": false} — if the plan already covers the spec, OR\n'
137
+ ' {"revised": true, "nodes": [ ...full revised node list... ]}\n'
138
+ "When revising, emit the COMPLETE node list (same schema as before), not a diff.\n\n"
139
+ f"APPROVED SPEC:\n{spec}\n\n"
140
+ f"CURRENT PLAN:\n{plan_json}\n"
141
+ )
142
+
143
+
144
+ def _testauthor_prompt(node: Node) -> str:
145
+ return (
146
+ "Write acceptance tests for exactly this node, in the listed test file(s), "
147
+ "and nothing else. Confirm they FAIL before implementation exists.\n\n"
148
+ f"NODE: {node.id} — {node.title}\n\n"
149
+ f"SPEC:\n{node.spec}\n\n"
150
+ f"TEST FILE(S) TO CREATE: {', '.join(node.tests)}\n"
151
+ f"IMPLEMENTATION FILES (do NOT create/implement these): {', '.join(node.files)}\n"
152
+ f"The test command will be: {node.test_cmd}\n"
153
+ )
154
+
155
+
156
+ # --------------------------------------------------------------------------- #
157
+ # helpers
158
+ # --------------------------------------------------------------------------- #
159
+ def _job_id() -> str:
160
+ return time.strftime("%Y%m%d-%H%M%S")
161
+
162
+
163
+ def _extract_json(text: str) -> dict:
164
+ """Pull a JSON object out of a reply, tolerating code fences or stray prose."""
165
+ text = text.strip()
166
+ fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
167
+ if fence:
168
+ text = fence.group(1)
169
+ try:
170
+ return json.loads(text)
171
+ except json.JSONDecodeError:
172
+ pass
173
+ start, end = text.find("{"), text.rfind("}")
174
+ if start != -1 and end > start:
175
+ return json.loads(text[start : end + 1])
176
+ raise ValueError("agent did not return parseable JSON")
177
+
178
+
179
+ def _run_shell(cmd: str, cwd: Path) -> int:
180
+ import os
181
+
182
+ env = {**os.environ, "PYTHONDONTWRITEBYTECODE": "1"} # keep the worktree clean
183
+ return subprocess.run(
184
+ cmd, cwd=str(cwd), shell=True, capture_output=True, text=True, env=env
185
+ ).returncode
186
+
187
+
188
+ def _sha256(path: Path) -> str:
189
+ return hashlib.sha256(path.read_bytes()).hexdigest()
190
+
191
+
192
def _build_plan(data: dict, prog: PlanProgress, repo: Path) -> Plan:
    """Assemble a ``Plan`` from the agent's JSON (``data["nodes"]``) and the job metadata in *prog*."""
    parsed_nodes = []
    for raw_node in data["nodes"]:
        parsed_nodes.append(Node.from_dict(raw_node))
    stamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    return Plan(
        job_id=prog.job_id,
        task=prog.task,
        repo=str(repo),
        created_at=stamp,
        job_branch=prog.job_branch,
        nodes=parsed_nodes,
    )
202
+
203
+
204
+ # --------------------------------------------------------------------------- #
205
+ # stages
206
+ # --------------------------------------------------------------------------- #
207
def _recon(prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log) -> str:
    """Run the explorer agent on the task and store its summary in ``.director/recon.md``.

    Returns the summary text ("(no summary)" when the agent produced none).
    Raises ``RuntimeError`` when the agent run is not ok.
    """
    explorer_model = cfg.model_for("explorer")
    log(f"[plan] explorer recon ({explorer_model}) …")
    ex = run_agent(
        agent="explorer",
        model=explorer_model,
        message=_explorer_prompt(prog.task),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-explorer.jsonl",
        timeout=cfg.node_timeout,
    )
    # Cost is recorded before the success check, so failed runs are billed too.
    ledger.record(role="explorer", model=explorer_model, tokens=ex.tokens, cfg=cfg)
    if not ex.ok:
        raise RuntimeError(f"explorer failed: {ex.error or ex.returncode} (see {ex.log_path})")
    summary = ex.text if ex.text else "(no summary)"
    recon_file = repo / ".director" / "recon.md"
    recon_file.write_text(summary)
    return summary
223
+
224
+
225
def _stage_a_spec(
    prog: PlanProgress, repo: Path, cfg: Config, summary: str, ledger: CostLedger, logs: Path, log
) -> None:
    """Stage A: run the brainstorm agent and write ``.director/spec.md``.

    Raises:
        RuntimeError: if the agent run is not ok, or if it succeeded but
            returned no spec text.
    """
    model = cfg.model_for("planner")
    log(f"[plan] Stage A brainstorm/spec ({model}) …")
    bs = run_agent(
        agent="brainstorm",
        model=model,
        message=_brainstorm_prompt(prog.task, summary),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-brainstorm.jsonl",
        timeout=cfg.node_timeout,
    )
    # Cost is recorded before the success check, so failed runs are billed too.
    ledger.record(role="planner", model=model, tokens=bs.tokens, cfg=cfg)
    if not bs.ok:
        raise RuntimeError(f"brainstorm failed: {bs.error or bs.returncode} (see {bs.log_path})")
    # `text` may be empty or None on a nominally successful run; report that
    # explicitly instead of printing a zero return code as the "error".
    spec_text = (bs.text or "").strip()
    if not spec_text:
        raise RuntimeError(f"brainstorm failed: agent returned an empty spec (see {bs.log_path})")
    (repo / ".director" / "spec.md").write_text(spec_text + "\n")
241
+
242
+
243
def _critique_spec(
    prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log
) -> None:
    """--auto gate 1: ask the brainstorm agent to self-critique and rewrite the spec.

    Best-effort: when the run is not ok or returns no text, the existing
    ``.director/spec.md`` is left untouched and that fact is logged.
    """
    spec_path = repo / ".director" / "spec.md"
    spec = spec_path.read_text()
    model = cfg.model_for("planner")
    log(f"[plan] --auto: spec self-critique ({model}) …")
    cr = run_agent(
        agent="brainstorm",
        model=model,
        message=_spec_critique_prompt(prog.task, spec),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-spec-critique.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(role="planner", model=model, tokens=cr.tokens, cfg=cfg)
    revised = (cr.text or "").strip() if cr.ok else ""
    if not revised:
        # Say so explicitly, as the plan self-critique does, rather than
        # silently continuing with the uncritiqued spec.
        log("[plan] spec self-critique failed or returned nothing; keeping the original spec.")
        return
    spec_path.write_text(revised + "\n")
    log("[plan] spec revised by self-critique.")
260
+
261
+
262
def _author_tests(plan: Plan, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log) -> None:
    """Stage C: test-author writes per-node tests, commit, hash, verify red.

    Idempotent — safe to re-run after a plan revision (overwrites test files).

    Steps, in order:
      1. run the test-author agent once per node, in topological order;
      2. commit everything with ``gitutil.commit_all``;
      3. record a SHA-256 per declared test file on each node and rewrite
         ``.director/plan.json``;
      4. run every node's ``test_cmd`` and warn about any that exit 0.

    Raises:
        RuntimeError: if the test-author run for any node is not ok.
    """
    model = cfg.model_for("test_author")
    for node in [plan.node(i) for i in topo_order(plan)]:
        log(f"[plan] test-author: {node.id} → {', '.join(node.tests)} ({model}) …")
        ta = run_agent(
            agent="test-author",
            model=model,
            message=_testauthor_prompt(node),
            cwd=repo,
            log_path=logs / f"{plan.job_id}-tests-{node.id}.jsonl",
            timeout=cfg.node_timeout,
        )
        ledger.record(
            role="test_author",
            model=model,
            tokens=ta.tokens,
            cfg=cfg,
            node=node.id,
        )
        if not ta.ok:
            raise RuntimeError(f"test-author failed on {node.id}: {ta.error or ta.returncode}")
    gitutil.commit_all(f"director: acceptance tests for job {plan.job_id}", repo)

    # Hash the committed test files: the node gate refuses to pass if the executor
    # later edits the contract. Captured by director, not the planner.
    missing: list[str] = []
    for node in plan.nodes:
        node.test_hashes = {}
        for t in node.tests:
            tp = repo / t
            if tp.exists():
                node.test_hashes[t] = _sha256(tp)
            else:
                missing.append(f"{node.id}:{t}")
    (repo / ".director" / "plan.json").write_text(plan.to_json())

    if missing:
        # A declared test file that was never created gets no hash; surface
        # that here instead of leaving the gap unnoticed.
        log(
            f"[plan] WARNING: declared test file(s) were not created and are "
            f"not hashed: {', '.join(missing)}"
        )

    # A test command that already exits 0 was never red.
    not_red = [n.id for n in plan.nodes if _run_shell(n.test_cmd, repo) == 0]
    if not_red:
        log(
            f"[plan] WARNING: tests did NOT fail first (not red) for: "
            f"{', '.join(not_red)} — their contract is suspect."
        )
305
+
306
+
307
def _stage_bc_decompose(
    prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log
) -> Plan:
    """Stages B+C: decompose the spec into a validated plan, then author its tests.

    Reads ``.director/spec.md`` (and ``recon.md`` when present), writes
    ``.director/plan.json`` and returns the plan. Raises ``RuntimeError``
    when the planner run is not ok.
    """
    director_dir = repo / ".director"
    recon_file = director_dir / "recon.md"
    if recon_file.exists():
        summary = recon_file.read_text()
    else:
        summary = "(no recon)"
    spec = (director_dir / "spec.md").read_text()

    planner_model = cfg.model_for("planner")
    log(f"[plan] Stage B decompose ({planner_model}) …")
    pl = run_agent(
        agent="planner",
        model=planner_model,
        message=_planner_prompt(spec, summary),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-planner.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(role="planner", model=planner_model, tokens=pl.tokens, cfg=cfg)
    if not pl.ok:
        raise RuntimeError(f"planner failed: {pl.error or pl.returncode} (see {pl.log_path})")

    plan_data = _extract_json(pl.text)
    plan = _build_plan(plan_data, prog, repo)
    validate(plan)  # plan.json is only written once the DAG passes validation
    (director_dir / "plan.json").write_text(plan.to_json())
    node_ids = ", ".join(n.id for n in plan.nodes)
    log(f"[plan] {len(plan.nodes)} nodes: {node_ids}")

    _author_tests(plan, repo, cfg, ledger, logs, log)
    return plan
337
+
338
+
339
def _critique_plan(
    plan: Plan, prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log
) -> Plan:
    """--auto gate 2: let the planner self-critique the DAG.

    Returns the original *plan* unless the reply is a JSON object with a
    truthy ``revised`` and a non-empty ``nodes`` list. In that case the
    revised plan is validated, written to ``.director/plan.json``, its tests
    are re-authored, and it is returned. Any unusable reply keeps the
    original plan rather than aborting the pipeline.
    """
    spec = (repo / ".director" / "spec.md").read_text()
    model = cfg.model_for("planner")
    log(f"[plan] --auto: plan self-critique ({model}) …")
    cr = run_agent(
        agent="planner",
        model=model,
        message=_plan_critique_prompt(spec, plan.to_json()),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-plan-critique.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(role="planner", model=model, tokens=cr.tokens, cfg=cfg)
    if not cr.ok:
        log("[plan] plan self-critique failed; keeping the original plan.")
        return plan
    try:
        data = _extract_json(cr.text or "")
    except ValueError:
        data = None
    if not isinstance(data, dict):
        # Covers unparseable replies and valid JSON that is not an object.
        log("[plan] plan self-critique returned no JSON; keeping the original plan.")
        return plan
    if not data.get("revised"):
        log("[plan] self-critique: plan already covers the spec.")
        return plan
    nodes = data.get("nodes")
    if not isinstance(nodes, list) or not nodes:
        # "revised": true without a node list cannot be built into a plan;
        # treat it like any other malformed critique.
        log("[plan] self-critique claimed a revision but sent no node list; keeping the original plan.")
        return plan

    log("[plan] self-critique revised the DAG; re-authoring tests for the new plan.")
    revised = _build_plan(data, prog, repo)
    validate(revised)
    (repo / ".director" / "plan.json").write_text(revised.to_json())
    _author_tests(revised, repo, cfg, ledger, logs, log)
    return revised
371
+
372
+
373
+ # --------------------------------------------------------------------------- #
374
+ # the re-entrant driver
375
+ # --------------------------------------------------------------------------- #
376
def run_plan(
    task: str | None,
    repo: str,
    cfg: Config,
    log,
    *,
    auto: bool = False,
    critique: bool = True,
    cont: bool = False,
) -> PlanResult:
    """Drive the planning pipeline until a human gate pauses it or it reaches READY.

    Args:
        task: task description; required when starting a new plan.
        repo: repository path (resolved to an absolute path).
        cfg: configuration used for model selection and timeouts.
        log: callable taking one message string.
        auto: replace each human gate with a self-critique and keep going.
        critique: run the self-critique passes when *auto* is in effect.
        cont: resume the plan recorded in ``.director/plan_stage.json``;
            the current gate is treated as approved.

    Returns:
        A ``PlanResult`` with ``paused=True`` at a gate, ``False`` at READY.

    Raises:
        RuntimeError: nothing to continue; a plan is already in progress;
            no task given; the job branch already exists; or the persisted
            stage is not a known stage.
    """
    repo = Path(repo).resolve()
    fdir = repo / ".director"
    logs = fdir / "logs"
    setup.ensure_director_gitignore(repo)  # never let `git add -A` commit .director runtime files
    ledger = CostLedger(fdir / "costs.jsonl")
    prog = PlanProgress.load(repo)

    if cont:
        if prog is None:
            raise RuntimeError(
                'nothing to continue: no plan in progress (run `director plan "<task>"` first)'
            )
        # human approval advances the current gate
        if prog.stage == GATE_SPEC:
            prog.stage = DECOMPOSE
        elif prog.stage == GATE_PLAN:
            prog.stage = READY
        elif prog.stage == READY:
            log("[plan] already approved and ready — run `director run`.")
        # Carry the flags chosen at start; flags given on --continue may
        # override. --auto can be switched on here, and critique stays off if
        # it was disabled at start or on this invocation. Previously the stored
        # critique flag was discarded whenever auto was in effect.
        auto = auto or prog.auto
        critique = critique and prog.critique
        if gitutil.current_branch(repo) != prog.job_branch:
            gitutil.checkout(prog.job_branch, repo)
    else:
        if prog is not None and prog.stage != READY:
            raise RuntimeError(
                f"a plan is already in progress at stage '{prog.stage}' "
                f"(job {prog.job_id}). Use `director plan --continue`, or remove "
                f"{PlanProgress.path(repo)} to start over."
            )
        if not task:
            raise RuntimeError("a task description is required to start a new plan")
        job_id = _job_id()
        job_branch = f"director/job-{job_id}"
        prog = PlanProgress(
            job_id=job_id,
            task=task,
            job_branch=job_branch,
            stage=SPEC,
            auto=auto,
            critique=critique,
        )
        # Stage 0: job branch + agents BEFORE any agent call, so `--agent <role>`
        # resolves the synced role prompt instead of falling back to the default.
        base = gitutil.current_commit(repo)
        if gitutil.branch_exists(job_branch, repo):
            raise RuntimeError(f"branch {job_branch} already exists")
        gitutil.create_branch(job_branch, repo, base)
        gitutil.checkout(job_branch, repo)
        sync_agents(repo)
        gitutil.commit_all(f"director: scaffold agents for job {job_id}", repo)
        _recon(prog, repo, cfg, ledger, logs, log)

    prog.auto, prog.critique = auto, critique
    plan: Plan | None = None

    # advance through stages until a human gate pauses us or we reach READY
    while True:
        if prog.stage == SPEC:
            _stage_a_spec(
                prog, repo, cfg, (repo / ".director" / "recon.md").read_text(), ledger, logs, log
            )
            prog.stage = GATE_SPEC
            prog.save(repo)
            if not auto:
                return _paused(
                    prog,
                    fdir,
                    "spec.md",
                    ledger,
                    "Stage A complete. Review/edit .director/spec.md, then "
                    "`director plan --continue`.",
                )
            if critique:
                _critique_spec(prog, repo, cfg, ledger, logs, log)
            prog.stage = DECOMPOSE
            prog.save(repo)
            continue

        if prog.stage == DECOMPOSE:
            plan = _stage_bc_decompose(prog, repo, cfg, ledger, logs, log)
            prog.stage = GATE_PLAN
            prog.save(repo)
            if not auto:
                return _paused(
                    prog,
                    fdir,
                    "plan.json",
                    ledger,
                    f"Stages B+C complete: {len(plan.nodes)} nodes, tests "
                    f"committed (red). Review .director/plan.json + the test "
                    f"files, then `director plan --continue` to enable `run`.",
                )
            if critique:
                plan = _critique_plan(plan, prog, repo, cfg, ledger, logs, log)
            prog.stage = READY
            prog.save(repo)
            continue

        if prog.stage == READY:
            prog.save(repo)
            if plan is None:
                plan = Plan.from_json((fdir / "plan.json").read_text())
            log(
                f"[plan] READY. job={prog.job_id} branch={prog.job_branch} "
                f"nodes={len(plan.nodes)} plan-cost=${ledger.total():.4f}"
            )
            return PlanResult(
                False,
                READY,
                prog.job_id,
                prog.job_branch,
                len(plan.nodes),
                str(fdir / "plan.json"),
                "Plan approved. Next: `director run`.",
            )

        # No branch matched: the persisted stage is not one this loop can
        # advance. Fail loudly instead of spinning forever.
        raise RuntimeError(
            f"unknown plan stage '{prog.stage}' in {PlanProgress.path(repo)}"
        )
503
+
504
+
505
def _paused(
    prog: PlanProgress, fdir: Path, artifact: str, ledger: CostLedger, message: str
) -> PlanResult:
    """Build the ``PlanResult`` for a stop at a human gate.

    The node count is read from ``plan.json`` when it exists and parses;
    otherwise it is reported as 0. The running plan cost is appended to
    *message*.
    """
    plan_file = fdir / "plan.json"
    node_count = 0
    if plan_file.exists():
        try:
            node_count = len(Plan.from_json(plan_file.read_text()).nodes)
        except Exception:
            # Best-effort: an unreadable plan only affects the reported count.
            node_count = 0
    note = message + f" (plan-cost so far: ${ledger.total():.4f})"
    return PlanResult(
        paused=True,
        stage=prog.stage,
        job_id=prog.job_id,
        job_branch=prog.job_branch,
        n_nodes=node_count,
        artifact=str(fdir / artifact),
        message=note,
    )
director/report.py ADDED
@@ -0,0 +1,103 @@
1
+ """Human-readable summaries for `director status` and the end of `director run`."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from director.models import Plan
8
+ from director.state import RunState
9
+
10
+ _STATUS_GLYPH = {
11
+ "done": "✅",
12
+ "pending": "·",
13
+ "running": "…",
14
+ "escalated": "⚠️ ",
15
+ "failed": "❌",
16
+ }
17
+
18
+
19
def status_table(repo: str) -> str:
    """Render the per-node status table and summary counters for the plan in *repo*.

    Returns a hint string when ``.director/plan.json`` does not exist.
    """
    repo = Path(repo).resolve()
    plan_path = repo / ".director" / "plan.json"
    if not plan_path.exists():
        return 'No plan found. Run `director plan "<task>"` first.'
    plan = Plan.from_json(plan_path.read_text())
    state = RunState.load_or_init(repo, plan)
    node_states = [state[n.id] for n in plan.nodes]
    total = len(plan.nodes)

    lines = [f"job {plan.job_id} ({plan.job_branch})", f"task: {plan.task}", ""]
    lines.append(f"{'node':24} {'status':10} {'tier':10} {'att':>3} {'cost':>9}")
    lines.append("-" * 60)
    for n, s in zip(plan.nodes, node_states):
        glyph = _STATUS_GLYPH.get(s.status, "?")
        lines.append(
            f"{n.id[:24]:24} {glyph} {s.status:8} "
            f"{(s.tier_used or '-'):10} {s.attempts:>3} ${s.cost_usd:>7.4f}"
        )

    done = sum(1 for s in node_states if s.status == "done")
    esc = sum(1 for s in node_states if s.escalated)
    reviewed = sum(1 for s in node_states if s.review_stage_two)
    blocked = sum(1 for s in node_states if s.review_blocks)
    wif_ok = sum(1 for s in node_states if s.watch_it_fail == "observed")
    flaky = sum(1 for s in node_states if s.flake_failed)
    lines += [
        "",
        f"{done}/{total} done, {esc} escalated, "
        f"{reviewed} stage-two reviewed, {blocked} re-opened by review",
    ]
    if total:
        # Count nodes that are done AND were never escalated. Subtracting the
        # escalated total from the done total is wrong whenever an escalated
        # node is not done, and can even go negative.
        no_esc = sum(1 for s in node_states if s.status == "done" and not s.escalated)
        lines.append(
            f"executor-tier completion (no escalation): "
            f"{no_esc}/{total} = {100 * no_esc / total:.0f}% "
            f"(hypothesis target: >70%)"
        )
        lines.append(
            f"stage-two review trigger rate: "
            f"{reviewed}/{total} = {100 * reviewed / total:.0f}%"
        )
        lines.append(
            f"watch-it-fail observed (red before green): "
            f"{wif_ok}/{total}"
            + (f" ⚠️ {flaky} node(s) hit a flake re-run failure" if flaky else "")
        )
    return "\n".join(lines)
65
+
66
+
67
def run_summary(result: dict) -> str:
    """Format the end-of-run report from the *result* dictionary.

    Sections, in order: header, node outcome lists, integration gate (with
    the last 1500 characters of detail on failure), optional measurement
    block (when ``n_nodes`` is truthy), cost by role, cost by model, total.
    """

    def joined(key: str) -> str:
        return ", ".join(result[key])

    rule = "=" * 60
    out = ["", rule, f"RUN SUMMARY — job {result['job_id']}", rule]

    out.append(f"done: {joined('done') or '(none)'}")
    if result["escalated"]:
        out.append(f"escalated: {joined('escalated')}")
    if result.get("reviewed"):
        out.append(f"stage-two reviewed: {joined('reviewed')}")
    if result.get("review_blocked"):
        out.append(f"review re-opened: {joined('review_blocked')}")
    if result["failed"]:
        out.append(f"FAILED: {joined('failed')}")

    gate = "PASS" if result["integration_ok"] else "FAIL"
    out.append(f"integration gate: {gate}")
    if not result["integration_ok"] and result.get("integration_detail"):
        out.append(result["integration_detail"][-1500:])

    if result.get("n_nodes"):
        out.extend(["", "measurement:"])
        out.append(
            f" executor-tier completion (no escalation): "
            f"{result['executor_tier_completion']}/{result['n_nodes']} = "
            f"{result['executor_tier_pct']:.0f}% (hypothesis target: >70%)"
        )
        out.append(f" escalation rate: {result['escalation_rate']:.0f}%")
        out.append(f" stage-two trigger rate: {result['stage_two_trigger_rate']:.0f}%")
        out.append(f" wall time: {result['wall_secs']:.0f}s")

    out.extend(["", "cost by role:"])
    for role, g in sorted(result["by_role"].items()):
        out.append(
            f" {role:12} {g['calls']:>2} calls "
            f"in={g['input']:>8} out={g['output']:>7} ${g['cost']:.4f}"
        )
    out.extend(["", "cost by resolved model:"])
    for model, g in sorted(result["by_model"].items()):
        out.append(f" {model:48} ${g['cost']:.4f}")
    out.append(f"\nTOTAL: ${result['cost_total']:.4f}")
    return "\n".join(out)