director-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
director/review.py ADDED
@@ -0,0 +1,153 @@
1
+ """Two-stage code review (Phase 2.5) — runs after the deterministic node gate
2
+ passes, before merge.
3
+
4
+ Stage one — spec compliance. The allowlist + test gate (in gates.node_gate) are
5
+ the deterministic core and always run. An optional explorer-tier LLM compliance
6
+ check (`review.stage_one_llm`) can be layered on; it is advisory (logged, never
7
+ merge-blocking) so merge decisions stay deterministic-first.
8
+
9
+ Stage two — code quality (reviewer tier). COST-GATED: runs only when the node
10
+ escalated OR its diff touched more than `review.stage_two_file_threshold` files.
11
+ It never runs on the cheap/local executor tier — it uses the `reviewer` tier,
12
+ which a profile binds to a strong model (review on a weak model is worthless). A
13
+ `critical` finding blocks the merge and re-opens the node.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import subprocess
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+
22
+ from director import gitutil
23
+ from director.config import Config
24
+ from director.gates import _is_ignorable
25
+ from director.models import Node
26
+ from director.opencode import run_agent
27
+
28
+
29
@dataclass
class ReviewResult:
    """What the two-stage review concluded for one attempt of a node."""

    # Set once the cost-gated stage-two (code quality) review was actually run.
    stage_two_ran: bool = False
    # Set when stage two decided the node must be re-opened instead of merged.
    blocking: bool = False
    # Short summary taken from the stage-two verdict.
    summary: str = ""
    # Feedback fed back to the next attempt if blocking.
    detail: str = ""
    # One (role, model, tokens) entry per reviewer call, for the ledger.
    calls: list = field(default_factory=list)
36
+
37
+
38
def _diff(worktree: Path, timeout: int) -> str:
    """Return the unified diff of the node's uncommitted work (incl. new files).

    Everything in the worktree is staged first (`git add -A`) so that brand-new
    files appear in `git diff --cached`. Staging in the worktree's own index is
    side-effect-free w.r.t. the main repo.

    Args:
        worktree: the node's worktree directory; git runs with this as its cwd.
        timeout: seconds allowed for the `git diff` call.

    Returns:
        The diff text, or "" when `git diff` exceeds `timeout`. Review runs on a
        node that already passed its deterministic gate, so a slow diff must
        degrade to an empty one rather than raise out of the review.
    """
    gitutil.git(["add", "-A"], worktree, check=False)
    try:
        p = subprocess.run(
            ["git", "diff", "--cached"],
            cwd=str(worktree),
            capture_output=True,
            text=True,
            # A diff can carry bytes that are not valid UTF-8 (e.g. latin-1
            # sources); replace them instead of raising UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return ""
    return p.stdout
50
+
51
+
52
+ def _reviewer_message(node: Node, diff: str, stage: str) -> str:
53
+ return "\n".join(
54
+ [
55
+ f"Perform {stage} review of node '{node.id}' — {node.title}.",
56
+ "",
57
+ "SPEC:",
58
+ node.spec,
59
+ "",
60
+ f"FILE ALLOWLIST: {', '.join(node.files)}",
61
+ "",
62
+ "UNIFIED DIFF (already passed tests + allowlist gate):",
63
+ diff[:20000] if diff else "(empty diff)",
64
+ "",
65
+ "Emit your strict-JSON verdict per your instructions.",
66
+ ]
67
+ )
68
+
69
+
70
+ def _should_run_stage_two(node: Node, worktree: Path, cfg: Config, escalated: bool) -> bool:
71
+ if not cfg.stage_two_enabled:
72
+ return False
73
+ if escalated:
74
+ return True
75
+ # count only real source changes — ephemeral build noise (__pycache__/*.pyc,
76
+ # created by running the tests) must not inflate the file count past the
77
+ # threshold and trip stage two on a one-file diff.
78
+ n_changed = sum(1 for p in gitutil.changed_paths(worktree) if not _is_ignorable(p))
79
+ return n_changed > cfg.stage_two_file_threshold
80
+
81
+
82
def review_node(
    node: Node, worktree: Path, cfg: Config, logs: Path, log, *, escalated: bool
) -> ReviewResult:
    """Run the two-stage review for a node that already passed its gate.

    Stage one (only when `cfg.stage_one_llm` is set) asks the explorer-tier model
    for a spec-compliance opinion; its output is logged and never blocks.
    Stage two (only when `_should_run_stage_two` says so) asks the reviewer-tier
    model for a JSON verdict; a "block" verdict or any critical finding marks
    the result as blocking and fills `detail` with feedback for the next attempt.

    Args:
        node: the node under review.
        worktree: the node's worktree; also the cwd for the reviewer agent.
        cfg: run configuration (tier models, timeout, review switches).
        logs: directory that receives the per-stage agent logs.
        log: callable taking one message string.
        escalated: whether the node has reached the escalation tier.

    Returns:
        A ReviewResult. Reviewer failures and unusable verdicts are never
        blocking: review infrastructure must not wedge a node that already
        passed its deterministic gate.
    """
    from director.plan import _extract_json  # reuse the tolerant JSON extractor

    result = ReviewResult()

    # --- Stage one (advisory LLM compliance, optional) ----------------------
    if cfg.stage_one_llm:
        diff = _diff(worktree, cfg.node_timeout)
        s1 = run_agent(
            agent="reviewer",
            model=cfg.model_for("explorer"),
            message=_reviewer_message(node, diff, "stage-one spec-compliance"),
            cwd=worktree,
            log_path=logs / f"{node.id}-review-stage1.jsonl",
            timeout=cfg.node_timeout,
        )
        result.calls.append(("reviewer", cfg.model_for("explorer"), s1.tokens))
        if s1.ok and s1.text.strip():
            log(f"[review] {node.id} stage-one (advisory): {s1.text.strip()[:160]}")

    # --- Stage two (code quality, conditional, blocking) --------------------
    if not _should_run_stage_two(node, worktree, cfg, escalated):
        return result

    result.stage_two_ran = True
    model = cfg.model_for("reviewer")
    why = "escalated" if escalated else f">{cfg.stage_two_file_threshold} files changed"
    log(f"[review] {node.id} stage-two code quality ({model}) — triggered: {why}")
    diff = _diff(worktree, cfg.node_timeout)
    rv = run_agent(
        agent="reviewer",
        model=model,
        message=_reviewer_message(node, diff, "stage-two code-quality"),
        cwd=worktree,
        log_path=logs / f"{node.id}-review-stage2.jsonl",
        timeout=cfg.node_timeout,
    )
    result.calls.append(("reviewer", model, rv.tokens))
    if not rv.ok:
        # a failed reviewer call is non-blocking — never let review infra wedge a
        # node that already passed its deterministic gate.
        log(
            f"[review] {node.id} stage-two reviewer call failed "
            f"({rv.error or rv.returncode}); not blocking."
        )
        return result

    try:
        verdict = _extract_json(rv.text)
    except ValueError:
        log(f"[review] {node.id} stage-two returned no JSON verdict; not blocking.")
        return result
    if not isinstance(verdict, dict):
        # The model may emit valid JSON that is not an object (a list, a bare
        # string); that is as unusable as no JSON at all, so do not block on it.
        log(f"[review] {node.id} stage-two returned no JSON verdict; not blocking.")
        return result

    # Model output is untrusted: keep only findings that are JSON objects so the
    # `.get` calls below cannot raise on a stray string or number.
    raw_findings = verdict.get("findings") or []
    if not isinstance(raw_findings, list):
        raw_findings = []
    findings = [f for f in raw_findings if isinstance(f, dict)]
    criticals = [f for f in findings if str(f.get("severity", "")).lower() == "critical"]
    result.summary = str(verdict.get("summary", ""))[:200]
    if str(verdict.get("verdict", "")).lower() == "block" or criticals:
        result.blocking = True
        # Prefer the critical findings; fall back to all findings when the
        # verdict is "block" without any finding marked critical.
        bullet = "\n".join(
            f"- [{f.get('severity')}] {f.get('file', '?')}: {f.get('summary', '')}"
            for f in (criticals or findings)
        )
        result.detail = (
            "Stage-two review BLOCKED this node (critical findings). "
            "Fix these without touching the tests:\n" + bullet
        )
        log(f"[review] {node.id} BLOCKED by stage-two: {len(criticals)} critical finding(s)")
    else:
        log(f"[review] {node.id} stage-two PASS: {result.summary or 'no blocking findings'}")
    return result
director/run.py ADDED
@@ -0,0 +1,444 @@
1
+ """`director run` — execute the DAG.
2
+
3
+ Each node runs in an isolated git worktree on its own task branch. The executor
4
+ tier gets up to `max_attempts`, with the failing gate output fed back each time
5
+ (fresh OpenCode context per attempt — only the worktree's files and the feedback
6
+ carry over). On exhaustion the SAME node is retried once at the escalation tier
7
+ (never escalate the whole job). A passing node merges into the job branch.
8
+
9
+ Independent nodes may run in parallel (`--parallel N`); the DAG guarantees their
10
+ allowlists are disjoint, so their merges never conflict. Git mutations
11
+ (worktree add/remove, merge) are serialized; model calls run concurrently.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import shutil
17
+ import subprocess
18
+ import tempfile
19
+ import threading
20
+ import time
21
+ from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
22
+ from dataclasses import dataclass, field
23
+ from pathlib import Path
24
+
25
+ from director import dag, gitutil, setup
26
+ from director.config import Config
27
+ from director.cost import CostLedger, cost_of
28
+ from director.gates import GateResult, integration_gate, node_gate
29
+ from director.metrics import MetricsWriter
30
+ from director.models import DONE, ESCALATED, FAILED, RUNNING, Node, Plan
31
+ from director.opencode import run_agent, watch_it_fail
32
+ from director.review import review_node
33
+ from director.state import RunState
34
+
35
+
36
class CostCeilingExceeded(RuntimeError):
    """Raised when the ledger total exceeds the configured cost ceiling."""
38
+
39
+
40
@dataclass
class NodeOutcome:
    """Everything one node's attempt/escalation ladder produced."""

    node_id: str
    ok: bool
    # Tier that passed: "executor" | "escalation" (None when nothing passed).
    tier: str | None
    escalated: bool
    attempts: int
    model: str | None
    # Summed across all calls (for state display).
    tokens: dict
    # One (tier, model, tokens) entry per model call, for the ledger.
    calls: list = field(default_factory=list)
    error: str | None = None
    worktree: Path | None = None
    # Did stage-two code review run on any attempt?
    review_stage_two: bool = False
    # Attempts re-opened by a critical review finding.
    review_blocks: int = 0
    review_summary: str | None = None
    # Phase 3 measurement.
    # Wall time for the whole node (worktree → merge-ready).
    wall_secs: float = 0.0
    # {verdict, ran_before_edit, observed_failure}
    watch_it_fail: dict = field(default_factory=dict)
    # A flake re-run failed this node on some attempt.
    flake_failed: bool = False
61
+
62
+
63
+ def _executor_message(node: Node, worktree: Path, feedback: str) -> str:
64
+ parts = [f"Implement node '{node.id}' — {node.title}.", "", "SPEC:", node.spec, ""]
65
+ parts.append("FILES YOU MAY EDIT (allowlist — touch nothing else):")
66
+ for f in node.files:
67
+ fp = worktree / f
68
+ contents = fp.read_text() if fp.exists() else "(does not exist yet — create it)"
69
+ parts += [f"--- {f} ---", contents, ""]
70
+ parts += [
71
+ f"GATE (your tests must pass): {node.test_cmd}",
72
+ "",
73
+ "CURRENT FAILING TEST OUTPUT:",
74
+ feedback,
75
+ "",
76
+ ]
77
+ parts.append(
78
+ "Run the gate, implement in the allowlisted files only, and "
79
+ "re-run until it passes. Do not modify the test files."
80
+ )
81
+ return "\n".join(parts)
82
+
83
+
84
+ def _run_shell(cmd: str, cwd: Path, timeout: int) -> str:
85
+ import os
86
+
87
+ env = {**os.environ, "PYTHONDONTWRITEBYTECODE": "1"} # keep the worktree clean
88
+ p = subprocess.run(
89
+ cmd, cwd=str(cwd), shell=True, capture_output=True, text=True, timeout=timeout, env=env
90
+ )
91
+ return p.stdout + p.stderr
92
+
93
+
94
+ def _attempt_tiers(cfg: Config, max_attempts: int) -> list[tuple[str, str]]:
95
+ """Ordered (tier, model): executor ×max_attempts, then escalation ×1."""
96
+ return [("executor", cfg.model_for("executor"))] * max_attempts + [
97
+ ("escalation", cfg.model_for("escalation"))
98
+ ]
99
+
100
+
101
def _process_node(
    node: Node, worktree: Path, cfg: Config, logs: Path, max_attempts: int, log
) -> NodeOutcome:
    """Walk a node through its executor attempts and the final escalation attempt.

    The worktree must already exist. Each attempt calls the agent, then the
    deterministic node gate, then (if the gate passed) the cost-gated review.
    A failed gate or a blocking review replaces the feedback and moves on to
    the next rung; the first attempt that clears both returns a passing outcome.
    """
    # The first prompt is seeded with the tail of the gate command's current output.
    feedback = _run_shell(node.test_cmd, worktree, cfg.node_timeout)[-3000:]
    token_totals = {"input": 0, "output": 0}
    ledger_calls: list = []
    executor_attempts = 0
    did_escalate = False
    stage_two_seen = False
    blocked_count = 0
    last_summary: str | None = None
    any_flake = False

    def _outcome(ok: bool, tier, model, error, wif: dict) -> NodeOutcome:
        # Snapshot the running counters into the result record.
        return NodeOutcome(
            node_id=node.id,
            ok=ok,
            tier=tier,
            escalated=did_escalate,
            attempts=executor_attempts,
            model=model,
            tokens=token_totals,
            calls=ledger_calls,
            error=error,
            worktree=worktree,
            review_stage_two=stage_two_seen,
            review_blocks=blocked_count,
            review_summary=last_summary,
            watch_it_fail=wif,
            flake_failed=any_flake,
        )

    for index, (tier, model) in enumerate(_attempt_tiers(cfg, max_attempts)):
        if tier == "executor":
            executor_attempts += 1
            step_no = executor_attempts
        else:
            did_escalate = True
            step_no = 1
        log(f"[run] {node.id} [{tier}#{step_no}] {model} …")
        res = run_agent(
            agent="executor",
            model=model,
            message=_executor_message(node, worktree, feedback),
            cwd=worktree,
            log_path=logs / f"{node.id}-{tier}-{index}.jsonl",
            timeout=cfg.node_timeout,
        )
        for kind in ("input", "output"):
            token_totals[kind] += res.tokens.get(kind, 0)
        ledger_calls.append((tier, model, res.tokens))

        # watch-it-fail (Phase 3 §1): did this attempt run the failing tests
        # before its first edit? Advisory metric; only the verdict of the attempt
        # that ends up passing is kept.
        attempt_wif = watch_it_fail(res.tool_events, node.test_cmd)

        gate: GateResult = node_gate(node, worktree, cfg)
        if not gate.ok:
            if "flaky tests" in gate.failures:
                any_flake = True
            if res.error:
                reason = res.error
            elif res.timed_out:
                reason = "timeout"
            else:
                reason = "; ".join(gate.failures)
            log(f"[run] {node.id} fail ({reason}) at {tier}")
            feedback = (gate.detail or reason)[-3000:]
            continue

        # deterministic gate passed → two-stage review (cost-gated) before merge
        review = review_node(node, worktree, cfg, logs, log, escalated=did_escalate)
        ledger_calls.extend(review.calls)
        if review.stage_two_ran:
            stage_two_seen = True
        if review.summary:
            last_summary = review.summary
        if review.blocking:
            # a critical finding re-opens the node (counts against attempts)
            blocked_count += 1
            feedback = review.detail[-3000:]
            continue

        if not attempt_wif.observed:
            log(f"[run] {node.id} watch-it-fail: {attempt_wif.verdict} ({attempt_wif.detail})")
        log(f"[run] {node.id} PASS at {tier} (executor attempts={executor_attempts})")
        return _outcome(True, tier, model, None, vars(attempt_wif))

    # Every rung of the ladder failed its gate or was blocked by review.
    return _outcome(False, None, None, f"exhausted: {feedback[:200]}", {})
199
+
200
+
201
def run_job(repo: str, cfg: Config, parallel: int, max_attempts: int, log) -> dict:
    """Execute the plan's DAG and return a summary of the run.

    Each ready node is launched in its own worktree (under the system temp dir)
    on its own task branch, processed by `_process_node`, and finalized
    (ledger, merge, cleanup, metrics) by `_finalize`. Git mutations are
    serialized through one lock; node processing runs on a thread pool.

    Args:
        repo: path to the repository that holds `.director/plan.json`.
        cfg: run configuration.
        parallel: maximum number of nodes in flight; values below 1 are treated
            as 1.
        max_attempts: executor-tier attempts per node before escalation.
        log: callable taking one message string.

    Returns:
        A dict with per-status node id lists, the integration gate result,
        cost totals and derived rates (the same figures written to the run-level
        metrics record).
    """
    repo = Path(repo).resolve()
    # A non-positive `parallel` would make the scheduling loop below break before
    # submitting anything, after which every node would be marked as blocked.
    parallel = max(1, parallel)
    fdir = repo / ".director"
    setup.ensure_director_gitignore(repo)  # never let `git add -A` commit .director runtime files
    plan = Plan.from_json((fdir / "plan.json").read_text())
    state = RunState.load_or_init(repo, plan)
    ledger = CostLedger(fdir / "costs.jsonl")
    metrics = MetricsWriter(fdir / "metrics.jsonl")
    logs = fdir / "logs"
    run_t0 = time.perf_counter()
    # Worktrees live OUTSIDE the repo tree: a worktree nested inside the repo lets
    # OpenCode resolve the enclosing repo as the project root and leak edits out of
    # the isolated checkout. A sibling temp dir keeps each worktree its own root.
    wt_root = Path(tempfile.gettempdir()) / "director-worktrees" / plan.job_id
    wt_root.mkdir(parents=True, exist_ok=True)
    git_lock = threading.Lock()

    if gitutil.current_branch(repo) != plan.job_branch:
        gitutil.checkout(plan.job_branch, repo)

    dag.validate(plan)
    done = state.done_ids()
    # `finished` = every node in a TERMINAL state (done | failed | escalated). The
    # scheduler keys off this, not `done`: a node that fails must never be
    # re-scheduled, and the loop must end even when not every node succeeded.
    # (Seeded from state so a resumed run doesn't retry already-failed nodes.)
    finished = {nid for nid, ns in state.nodes.items() if ns.status in (DONE, FAILED, ESCALATED)}
    active: set[str] = set()
    log(
        f"[run] job={plan.job_id} branch={plan.job_branch} "
        f"nodes={len(plan.nodes)} done={len(done)} parallel={parallel}"
    )

    def launch(node_id: str) -> NodeOutcome:
        """Create a fresh worktree + task branch for the node, then process it."""
        node = plan.node(node_id)
        with git_lock:
            wt = wt_root / node_id
            gitutil.worktree_remove(wt, repo)  # no-op if not registered
            shutil.rmtree(wt, ignore_errors=True)
            # drop any stale registration left by a killed run (dir gone but git
            # still tracks it) so `worktree add` below can't fail with exit 255.
            gitutil.git(["worktree", "prune"], repo, check=False)
            task_branch = f"director/task-{plan.job_id}-{node_id}"
            if gitutil.branch_exists(task_branch, repo):
                gitutil.git(["branch", "-D", task_branch], repo, check=False)
            gitutil.worktree_add(wt, task_branch, plan.job_branch, repo)
            state[node_id].status = RUNNING
            state[node_id].worktree = str(wt)
            state.save()
        node_t0 = time.perf_counter()
        outcome = _process_node(node, wt, cfg, logs, max_attempts, log)
        outcome.wall_secs = round(time.perf_counter() - node_t0, 1)
        return outcome

    aborted = False
    with ThreadPoolExecutor(max_workers=parallel) as pool:
        futures: dict = {}
        while len(finished) < len(plan.nodes) and not aborted:
            # exclude both running and already-terminal nodes; deps are satisfied
            # only by nodes that actually succeeded (`done`).
            for nid in dag.ready_nodes(plan, done, active | finished):
                if len(futures) >= parallel:
                    break
                active.add(nid)
                futures[pool.submit(launch, nid)] = nid
            if not futures:
                # nothing running and nothing runnable → the rest are blocked by a
                # failed/unsatisfiable dependency. Mark them failed and stop (never
                # spin re-scheduling terminal nodes).
                blocked = [n.id for n in plan.nodes if n.id not in finished]
                if blocked:
                    log(
                        f"[run] cannot proceed — {len(blocked)} node(s) blocked by "
                        f"failed/unsatisfiable deps: {', '.join(blocked)}"
                    )
                    for nid in blocked:
                        ns = state[nid]
                        ns.status = FAILED
                        ns.error = ns.error or "blocked by a failed dependency"
                        finished.add(nid)
                    state.save()
                break
            completed, _ = wait(list(futures), return_when=FIRST_COMPLETED)
            for fut in completed:
                nid = futures.pop(fut)
                active.discard(nid)
                outcome = fut.result()
                try:
                    _finalize(outcome, plan, state, repo, git_lock, ledger, cfg, log, metrics)
                except CostCeilingExceeded as e:
                    log(f"[run] ABORT: {e}")
                    aborted = True
                finished.add(nid)  # terminal regardless of pass/fail → never re-scheduled
                if outcome.ok and state[nid].status == DONE:
                    done.add(nid)

    integ = integration_gate(repo, cfg)
    log(
        f"[run] integration gate: "
        f"{'PASS' if integ.ok else 'FAIL (' + ', '.join(integ.failures) + ')'}"
    )

    state.save()
    n = len(plan.nodes)
    done_l = sorted(done)
    escalated_l = [nd.id for nd in plan.nodes if state[nd.id].escalated]
    reviewed_l = [nd.id for nd in plan.nodes if state[nd.id].review_stage_two]
    # executor-tier completion = done without ever escalating (the hypothesis metric)
    escalated_ids = set(escalated_l)  # built once instead of once per done node
    exec_done = [nid for nid in done_l if nid not in escalated_ids]
    wall = round(time.perf_counter() - run_t0, 1)
    result = {
        "job_id": plan.job_id,
        "done": done_l,
        "failed": [nd.id for nd in plan.nodes if state[nd.id].status == FAILED],
        "escalated": escalated_l,
        "reviewed": reviewed_l,
        "review_blocked": [nd.id for nd in plan.nodes if state[nd.id].review_blocks],
        "integration_ok": integ.ok,
        "integration_detail": integ.detail,
        "cost_total": ledger.total(),
        "by_role": ledger.by_role(),
        "by_model": ledger.by_model(),
        "wall_secs": wall,
        "n_nodes": n,
        "executor_tier_completion": len(exec_done),
        "executor_tier_pct": round(100 * len(exec_done) / n, 1) if n else 0.0,
        "escalation_rate": round(100 * len(escalated_l) / n, 1) if n else 0.0,
        "stage_two_trigger_rate": round(100 * len(reviewed_l) / n, 1) if n else 0.0,
    }
    # run-level metrics record (Phase 3): derived rates + the resolved tier map so a
    # metrics line is self-describing about which models produced it.
    metrics.write(
        {
            "kind": "run",
            "job_id": plan.job_id,
            "tiers": dict(cfg.tiers),
            "n_nodes": n,
            "done": len(done_l),
            "failed": len(result["failed"]),
            "escalated": len(escalated_l),
            "executor_tier_completion": len(exec_done),
            "executor_tier_pct": result["executor_tier_pct"],
            "escalation_rate": result["escalation_rate"],
            "stage_two_trigger_rate": result["stage_two_trigger_rate"],
            "integration_ok": integ.ok,
            "wall_secs": wall,
            "cost_total": ledger.total(),
            "by_role": ledger.by_role(),
            "by_model": ledger.by_model(),
        }
    )
    return result
353
+
354
+
355
def _finalize(
    outcome: NodeOutcome,
    plan: Plan,
    state: RunState,
    repo: Path,
    git_lock,
    ledger: CostLedger,
    cfg: Config,
    log,
    metrics: MetricsWriter,
):
    """Record a finished node: state, ledger, merge, cleanup, metrics.

    Copies the outcome onto the node's state entry, books one ledger entry per
    model call, commits and merges the task branch when the node passed (under
    `git_lock`), removes the worktree and task branch, saves state and writes a
    node-level metrics record.

    Raises:
        CostCeilingExceeded: when `cfg.cost_ceiling` is set and the ledger total
            is above it (raised last, after state and metrics are written).
    """
    node_id = outcome.node_id
    ns = state[node_id]
    ns.attempts = outcome.attempts
    ns.escalated = outcome.escalated
    ns.tier_used = outcome.tier
    ns.model_used = outcome.model
    ns.tokens = outcome.tokens
    ns.review_stage_two = outcome.review_stage_two
    ns.review_blocks = outcome.review_blocks
    ns.review_summary = outcome.review_summary
    ns.wall_secs = outcome.wall_secs
    ns.watch_it_fail = (outcome.watch_it_fail or {}).get("verdict")
    ns.flake_failed = outcome.flake_failed

    # precise cost: one ledger entry per model call, tagged with its tier/role,
    # plus a per-node by-role breakdown for the metrics record.
    spent = 0.0
    per_role: dict[str, dict] = {}
    for role, model, tokens in outcome.calls:
        spent += ledger.record(role=role, model=model, tokens=tokens, cfg=cfg, node=node_id)
        bucket = per_role.setdefault(role, {"input": 0, "output": 0, "cost": 0.0, "calls": 0})
        bucket["input"] += int(tokens.get("input", 0))
        bucket["output"] += int(tokens.get("output", 0))
        bucket["cost"] += cost_of(model, tokens, cfg)
        bucket["calls"] += 1
    ns.cost_usd = spent

    task_branch = f"director/task-{plan.job_id}-{node_id}"
    with git_lock:
        if not (outcome.ok and outcome.worktree):
            ns.status = ESCALATED if outcome.escalated else FAILED
            ns.error = outcome.error
        else:
            gitutil.commit_all(f"director: node {node_id} via {outcome.tier}", outcome.worktree)
            merge = gitutil.merge_branch(
                task_branch, repo, message=f"director: merge node {node_id}"
            )
            if merge.returncode == 0:
                ns.status = DONE
            else:
                # Roll the job branch back and record the node as failed.
                gitutil.git(["merge", "--abort"], repo, check=False)
                ns.status = FAILED
                ns.error = f"merge conflict: {merge.stdout}{merge.stderr}"[:300]
                log(f"[run] {node_id} MERGE FAILED")
        # The worktree and task branch are dropped whatever the result was.
        if outcome.worktree and Path(outcome.worktree).exists():
            gitutil.worktree_remove(outcome.worktree, repo)
            shutil.rmtree(outcome.worktree, ignore_errors=True)
        gitutil.git(["branch", "-D", task_branch], repo, check=False)
        state.save()

    node = plan.node(node_id)
    record = {
        "kind": "node",
        "job_id": plan.job_id,
        "node": node_id,
        "title": node.title,
        "difficulty": node.estimated_difficulty,
        "status": ns.status,
        "tier_used": outcome.tier,
        "model_used": outcome.model,
        "attempts": outcome.attempts,
        "escalated": outcome.escalated,
        "wall_secs": outcome.wall_secs,
        "tokens": outcome.tokens,
        "cost_usd": round(spent, 6),
        "by_role": per_role,
        "review_stage_two": outcome.review_stage_two,
        "review_blocks": outcome.review_blocks,
        "watch_it_fail": outcome.watch_it_fail or {"verdict": "unknown"},
        "flake_failed": outcome.flake_failed,
    }
    metrics.write(record)

    if cfg.cost_ceiling and ledger.total() > cfg.cost_ceiling:
        raise CostCeilingExceeded(f"cost ${ledger.total():.4f} > ${cfg.cost_ceiling:.2f}")