director-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- director/README.md +124 -0
- director/__init__.py +10 -0
- director/__main__.py +4 -0
- director/agent_templates/brainstorm.md +44 -0
- director/agent_templates/executor.md +37 -0
- director/agent_templates/explorer.md +24 -0
- director/agent_templates/opencode.json +39 -0
- director/agent_templates/planner.md +60 -0
- director/agent_templates/reviewer.md +46 -0
- director/agent_templates/test-author.md +29 -0
- director/bench.py +234 -0
- director/cli.py +166 -0
- director/config.example.toml +75 -0
- director/config.py +111 -0
- director/cost.py +84 -0
- director/dag.py +113 -0
- director/gates.py +145 -0
- director/gitutil.py +83 -0
- director/metrics.py +48 -0
- director/models.py +106 -0
- director/opencode.py +231 -0
- director/plan.py +523 -0
- director/report.py +103 -0
- director/review.py +153 -0
- director/run.py +444 -0
- director/setup.py +101 -0
- director/state.py +43 -0
- director_cli-0.3.0.dist-info/METADATA +174 -0
- director_cli-0.3.0.dist-info/RECORD +32 -0
- director_cli-0.3.0.dist-info/WHEEL +4 -0
- director_cli-0.3.0.dist-info/entry_points.txt +2 -0
- director_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
director/review.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Two-stage code review (Phase 2.5) — runs after the deterministic node gate
|
|
2
|
+
passes, before merge.
|
|
3
|
+
|
|
4
|
+
Stage one — spec compliance. The allowlist + test gate (in gates.node_gate) are
|
|
5
|
+
the deterministic core and always run. An optional explorer-tier LLM compliance
|
|
6
|
+
check (`review.stage_one_llm`) can be layered on; it is advisory (logged, never
|
|
7
|
+
merge-blocking) so merge decisions stay deterministic-first.
|
|
8
|
+
|
|
9
|
+
Stage two — code quality (reviewer tier). COST-GATED: runs only when the node
|
|
10
|
+
escalated OR its diff touched more than `review.stage_two_file_threshold` files.
|
|
11
|
+
It never runs on the cheap/local executor tier — it uses the `reviewer` tier,
|
|
12
|
+
which a profile binds to a strong model (review on a weak model is worthless). A
|
|
13
|
+
`critical` finding blocks the merge and re-opens the node.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import subprocess
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from director import gitutil
|
|
23
|
+
from director.config import Config
|
|
24
|
+
from director.gates import _is_ignorable
|
|
25
|
+
from director.models import Node
|
|
26
|
+
from director.opencode import run_agent
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class ReviewResult:
    """What the review pass concluded about one node attempt."""

    # Set once the stage-two (code quality) reviewer has actually been invoked.
    stage_two_ran: bool = False
    # Set when the verdict must stop the merge and re-open the node.
    blocking: bool = False
    # Reviewer's summary line (review_node stores at most 200 characters).
    summary: str = ""
    # Feedback fed back to the next attempt if blocking.
    detail: str = ""
    # One (role, model, tokens) tuple per model call, for the cost ledger.
    calls: list = field(default_factory=list)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _diff(worktree: Path, timeout: int) -> str:
    """Return the unified diff of the node's uncommitted work (incl. new files).

    Everything is staged first (``git add -A``) so that untracked files appear
    in ``git diff --cached``. Staging happens in the worktree's own index.

    This feeds the review prompts only, so it never raises for a slow or
    oddly-encoded diff: a timeout yields an empty string, and bytes that are not
    valid UTF-8 are replaced instead of aborting the node.

    Args:
        worktree: Checkout whose pending changes are diffed.
        timeout: Seconds allowed for the ``git diff`` subprocess.

    Returns:
        The diff text, or ``""`` when git produced nothing or timed out.
    """
    gitutil.git(["add", "-A"], worktree, check=False)
    try:
        proc = subprocess.run(
            ["git", "diff", "--cached"],
            cwd=str(worktree),
            capture_output=True,
            # Diffs can carry bytes that are invalid in the locale encoding;
            # strict decoding (plain text=True) would raise UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return ""
    return proc.stdout or ""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _reviewer_message(node: Node, diff: str, stage: str) -> str:
    """Assemble the prompt sent to the reviewer agent for one review stage.

    The prompt carries the node's spec, its file allowlist and the diff under
    review. Only the first 20000 characters of the diff are included; an empty
    diff is shown as a placeholder.
    """
    headline = f"Perform {stage} review of node '{node.id}' — {node.title}."
    allowlist = ", ".join(node.files)
    shown_diff = diff[:20000] if diff else "(empty diff)"
    sections = (
        headline,
        "",
        "SPEC:",
        node.spec,
        "",
        f"FILE ALLOWLIST: {allowlist}",
        "",
        "UNIFIED DIFF (already passed tests + allowlist gate):",
        shown_diff,
        "",
        "Emit your strict-JSON verdict per your instructions.",
    )
    return "\n".join(sections)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _should_run_stage_two(node: Node, worktree: Path, cfg: Config, escalated: bool) -> bool:
    """Decide whether the cost-gated stage-two review runs for this node.

    It runs only when stage two is enabled and either the node escalated or the
    number of changed, non-ignorable paths exceeds
    ``cfg.stage_two_file_threshold``.
    """
    if cfg.stage_two_enabled:
        if escalated:
            return True
        # Only real source changes count: ephemeral build noise
        # (__pycache__/*.pyc, created by running the tests) must not push a
        # one-file diff over the threshold.
        real_changes = [
            path for path in gitutil.changed_paths(worktree) if not _is_ignorable(path)
        ]
        return len(real_changes) > cfg.stage_two_file_threshold
    return False
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def review_node(
    node: Node, worktree: Path, cfg: Config, logs: Path, log, *, escalated: bool
) -> ReviewResult:
    """Run the two-stage review for a node that already passed its gate.

    Stage one (only when ``cfg.stage_one_llm`` is set) asks the explorer-tier
    model for a spec-compliance opinion; the reply is logged and never blocks.
    Stage two (only when ``_should_run_stage_two`` says so) asks the reviewer
    tier for a JSON verdict. A ``block`` verdict or any ``critical`` finding
    marks the result as blocking and fills ``detail`` with feedback for the
    next attempt.

    Reviewer output is untrusted model text. A failed call, a reply without
    JSON, or JSON of the wrong shape is logged and treated as non-blocking so
    review infrastructure cannot wedge a node that passed its gate.

    Args:
        node: The node under review.
        worktree: Checkout containing the node's uncommitted work.
        cfg: Run configuration (models, timeouts, review switches).
        logs: Directory that receives the per-stage agent logs.
        log: Callable taking one message string.
        escalated: Whether the attempt under review ran at the escalation tier.

    Returns:
        A ``ReviewResult`` including one ``calls`` entry per model call made.
    """
    from director.plan import _extract_json  # reuse the tolerant JSON extractor

    result = ReviewResult()

    # --- Stage one (advisory LLM compliance, optional) ----------------------
    if cfg.stage_one_llm:
        stage_one_model = cfg.model_for("explorer")
        diff = _diff(worktree, cfg.node_timeout)
        s1 = run_agent(
            agent="reviewer",
            model=stage_one_model,
            message=_reviewer_message(node, diff, "stage-one spec-compliance"),
            cwd=worktree,
            log_path=logs / f"{node.id}-review-stage1.jsonl",
            timeout=cfg.node_timeout,
        )
        result.calls.append(("reviewer", stage_one_model, s1.tokens))
        if s1.ok and s1.text.strip():
            log(f"[review] {node.id} stage-one (advisory): {s1.text.strip()[:160]}")

    # --- Stage two (code quality, conditional, blocking) --------------------
    if not _should_run_stage_two(node, worktree, cfg, escalated):
        return result

    result.stage_two_ran = True
    model = cfg.model_for("reviewer")
    why = "escalated" if escalated else f">{cfg.stage_two_file_threshold} files changed"
    log(f"[review] {node.id} stage-two code quality ({model}) — triggered: {why}")
    # Re-read the diff: the stage-one agent ran inside the same worktree.
    diff = _diff(worktree, cfg.node_timeout)
    rv = run_agent(
        agent="reviewer",
        model=model,
        message=_reviewer_message(node, diff, "stage-two code-quality"),
        cwd=worktree,
        log_path=logs / f"{node.id}-review-stage2.jsonl",
        timeout=cfg.node_timeout,
    )
    result.calls.append(("reviewer", model, rv.tokens))
    if not rv.ok:
        # a failed reviewer call is non-blocking — never let review infra wedge a
        # node that already passed its deterministic gate.
        log(
            f"[review] {node.id} stage-two reviewer call failed "
            f"({rv.error or rv.returncode}); not blocking."
        )
        return result

    try:
        verdict = _extract_json(rv.text)
    except ValueError:
        verdict = None
    if not isinstance(verdict, dict):
        # Missing JSON, or JSON that is not an object (a bare list or string):
        # there is no verdict to act on.
        log(f"[review] {node.id} stage-two returned no JSON verdict; not blocking.")
        return result

    raw_findings = verdict.get("findings") or []
    if not isinstance(raw_findings, list):
        raw_findings = []
    # Keep only well-formed (object) findings; anything else has no severity.
    findings = [f for f in raw_findings if isinstance(f, dict)]
    criticals = [f for f in findings if str(f.get("severity", "")).lower() == "critical"]
    result.summary = str(verdict.get("summary", ""))[:200]
    if str(verdict.get("verdict", "")).lower() == "block" or criticals:
        result.blocking = True
        bullet = "\n".join(
            f"- [{f.get('severity')}] {f.get('file', '?')}: {f.get('summary', '')}"
            for f in (criticals or findings)
        )
        result.detail = (
            "Stage-two review BLOCKED this node (critical findings). "
            "Fix these without touching the tests:\n" + bullet
        )
        log(f"[review] {node.id} BLOCKED by stage-two: {len(criticals)} critical finding(s)")
    else:
        log(f"[review] {node.id} stage-two PASS: {result.summary or 'no blocking findings'}")
    return result
|
director/run.py
ADDED
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
"""`director run` — execute the DAG.
|
|
2
|
+
|
|
3
|
+
Each node runs in an isolated git worktree on its own task branch. The executor
|
|
4
|
+
tier gets up to `max_attempts`, with the failing gate output fed back each time
|
|
5
|
+
(fresh OpenCode context per attempt — only the worktree's files and the feedback
|
|
6
|
+
carry over). On exhaustion the SAME node is retried once at the escalation tier
|
|
7
|
+
(never escalate the whole job). A passing node merges into the job branch.
|
|
8
|
+
|
|
9
|
+
Independent nodes may run in parallel (`--parallel N`); the DAG guarantees their
|
|
10
|
+
allowlists are disjoint, so their merges never conflict. Git mutations
|
|
11
|
+
(worktree add/remove, merge) are serialized; model calls run concurrently.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import shutil
|
|
17
|
+
import subprocess
|
|
18
|
+
import tempfile
|
|
19
|
+
import threading
|
|
20
|
+
import time
|
|
21
|
+
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from director import dag, gitutil, setup
|
|
26
|
+
from director.config import Config
|
|
27
|
+
from director.cost import CostLedger, cost_of
|
|
28
|
+
from director.gates import GateResult, integration_gate, node_gate
|
|
29
|
+
from director.metrics import MetricsWriter
|
|
30
|
+
from director.models import DONE, ESCALATED, FAILED, RUNNING, Node, Plan
|
|
31
|
+
from director.opencode import run_agent, watch_it_fail
|
|
32
|
+
from director.review import review_node
|
|
33
|
+
from director.state import RunState
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class CostCeilingExceeded(RuntimeError):
    """Raised when the ledger total goes above the configured cost ceiling."""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class NodeOutcome:
    """Result of running one node through the attempt/escalation ladder."""

    node_id: str
    ok: bool
    # Tier that passed: "executor" | "escalation" (None when nothing passed).
    tier: str | None
    escalated: bool
    attempts: int
    model: str | None
    # Summed across all calls (for state display).
    tokens: dict
    # [(tier, model, tokens)] for the ledger.
    calls: list = field(default_factory=list)
    error: str | None = None
    worktree: Path | None = None
    # Did stage-two code review run on any attempt?
    review_stage_two: bool = False
    # Attempts re-opened by a critical review finding.
    review_blocks: int = 0
    review_summary: str | None = None
    # Phase 3 measurement
    # Wall time for the whole node (worktree → merge-ready).
    wall_secs: float = 0.0
    # {verdict, ran_before_edit, observed_failure}
    watch_it_fail: dict = field(default_factory=dict)
    # A flake re-run failed this node on some attempt.
    flake_failed: bool = False
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _executor_message(node: Node, worktree: Path, feedback: str) -> str:
    """Build the prompt for one executor attempt on ``node``.

    The prompt contains the spec, the current contents of every allowlisted
    file (or a note that it must be created), the gate command and the latest
    failing output.

    Args:
        node: Node being implemented; its ``files`` are paths relative to
            ``worktree``.
        worktree: Checkout the executor works in.
        feedback: Failing gate or review output to show the model.

    Returns:
        The full prompt text.
    """
    parts = [f"Implement node '{node.id}' — {node.title}.", "", "SPEC:", node.spec, ""]
    parts.append("FILES YOU MAY EDIT (allowlist — touch nothing else):")
    for f in node.files:
        fp = worktree / f
        if fp.exists():
            # Read as UTF-8 regardless of the platform default, and replace
            # undecodable bytes rather than letting one odd file abort the node.
            contents = fp.read_text(encoding="utf-8", errors="replace")
        else:
            contents = "(does not exist yet — create it)"
        parts += [f"--- {f} ---", contents, ""]
    parts += [
        f"GATE (your tests must pass): {node.test_cmd}",
        "",
        "CURRENT FAILING TEST OUTPUT:",
        feedback,
        "",
    ]
    parts.append(
        "Run the gate, implement in the allowlisted files only, and "
        "re-run until it passes. Do not modify the test files."
    )
    return "\n".join(parts)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _run_shell(cmd: str, cwd: Path, timeout: int) -> str:
    """Run ``cmd`` through the shell in ``cwd`` and return stdout + stderr.

    The output is used as feedback text for the executor, so a command that
    exceeds ``timeout`` does not raise: whatever was captured is returned with a
    trailing timeout marker. Undecodable output bytes are replaced.

    Args:
        cmd: Shell command line (the node's test command).
        cwd: Directory to run in.
        timeout: Seconds before the command is killed.

    Returns:
        Combined stdout and stderr text.
    """
    import os

    def _as_text(chunk) -> str:
        # TimeoutExpired may carry None or raw bytes even in text mode.
        if chunk is None:
            return ""
        if isinstance(chunk, bytes):
            return chunk.decode("utf-8", errors="replace")
        return chunk

    env = {**os.environ, "PYTHONDONTWRITEBYTECODE": "1"}  # keep the worktree clean
    try:
        p = subprocess.run(
            cmd,
            cwd=str(cwd),
            shell=True,  # cmd is a full shell command line, not an argv list
            capture_output=True,
            text=True,
            errors="replace",
            timeout=timeout,
            env=env,
        )
    except subprocess.TimeoutExpired as exc:
        partial = _as_text(exc.stdout) + _as_text(exc.stderr)
        return f"{partial}\n[timed out after {timeout}s]"
    return p.stdout + p.stderr
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _attempt_tiers(cfg: Config, max_attempts: int) -> list[tuple[str, str]]:
    """Return the attempt ladder as (tier, model) pairs.

    The executor tier appears ``max_attempts`` times, followed by exactly one
    escalation-tier entry.
    """
    executor_rung = ("executor", cfg.model_for("executor"))
    escalation_rung = ("escalation", cfg.model_for("escalation"))
    ladder = [executor_rung for _ in range(max_attempts)]
    ladder.append(escalation_rung)
    return ladder
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _process_node(
    node: Node, worktree: Path, cfg: Config, logs: Path, max_attempts: int, log
) -> NodeOutcome:
    """Drive one node through its attempts inside an existing worktree.

    Each rung of the ladder runs the executor agent, then the node gate, then
    (if the gate passed) the review. A gate failure or a blocking review turns
    into feedback for the next rung. The first rung that clears both returns a
    passing outcome; running out of rungs returns a failing one.
    """
    # Seed the feedback with the current test output (last 3000 chars).
    feedback = _run_shell(node.test_cmd, worktree, cfg.node_timeout)[-3000:]
    tokens_sum = {"input": 0, "output": 0}
    calls: list = []
    attempts = 0
    escalated = False
    review_stage_two = False
    review_blocks = 0
    review_summary: str | None = None
    wif: dict = {}  # watch-it-fail verdict of the implementing attempt
    flake_failed = False

    for i, (tier, model) in enumerate(_attempt_tiers(cfg, max_attempts)):
        if tier == "executor":
            attempts += 1
            n = attempts
        else:
            escalated = True
            n = 1
        log(f"[run] {node.id} [{tier}#{n}] {model} …")
        res = run_agent(
            agent="executor",
            model=model,
            message=_executor_message(node, worktree, feedback),
            cwd=worktree,
            log_path=logs / f"{node.id}-{tier}-{i}.jsonl",
            timeout=cfg.node_timeout,
        )
        for direction in ("input", "output"):
            tokens_sum[direction] += res.tokens.get(direction, 0)
        calls.append((tier, model, res.tokens))

        # watch-it-fail (Phase 3 §1): did this attempt run the failing tests before
        # its first edit? Advisory metric; only the verdict of the attempt that ends
        # up passing is kept.
        attempt_wif = watch_it_fail(res.tool_events, node.test_cmd)

        gate: GateResult = node_gate(node, worktree, cfg)
        if not gate.ok:
            if "flaky tests" in gate.failures:
                flake_failed = True
            if res.error:
                reason = res.error
            elif res.timed_out:
                reason = "timeout"
            else:
                reason = "; ".join(gate.failures)
            log(f"[run] {node.id} fail ({reason}) at {tier}")
            feedback = (gate.detail or reason)[-3000:]
            continue

        # deterministic gate passed → two-stage review (cost-gated) before merge
        review = review_node(node, worktree, cfg, logs, log, escalated=escalated)
        calls.extend(review.calls)
        review_stage_two = review_stage_two or review.stage_two_ran
        if review.summary:
            review_summary = review.summary
        if review.blocking:
            # a critical finding re-opens the node (counts against attempts)
            review_blocks += 1
            feedback = review.detail[-3000:]
            continue

        wif = attempt_wif.__dict__
        if not attempt_wif.observed:
            log(f"[run] {node.id} watch-it-fail: {attempt_wif.verdict} ({attempt_wif.detail})")
        log(f"[run] {node.id} PASS at {tier} (executor attempts={attempts})")
        return NodeOutcome(
            node_id=node.id,
            ok=True,
            tier=tier,
            escalated=escalated,
            attempts=attempts,
            model=model,
            tokens=tokens_sum,
            calls=calls,
            error=None,
            worktree=worktree,
            review_stage_two=review_stage_two,
            review_blocks=review_blocks,
            review_summary=review_summary,
            watch_it_fail=wif,
            flake_failed=flake_failed,
        )

    # Every rung failed its gate or was blocked by review.
    return NodeOutcome(
        node_id=node.id,
        ok=False,
        tier=None,
        escalated=escalated,
        attempts=attempts,
        model=None,
        tokens=tokens_sum,
        calls=calls,
        error=f"exhausted: {feedback[:200]}",
        worktree=worktree,
        review_stage_two=review_stage_two,
        review_blocks=review_blocks,
        review_summary=review_summary,
        watch_it_fail=wif,
        flake_failed=flake_failed,
    )
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def run_job(repo: str, cfg: Config, parallel: int, max_attempts: int, log) -> dict:
    """Execute the plan stored under ``<repo>/.director`` and summarise the run.

    Ready nodes are launched on a thread pool, each in its own worktree under
    the system temp directory. Completed nodes are finalized (cost recorded,
    merged or marked failed) on the calling thread. When nothing is running and
    nothing is runnable, the remaining nodes are marked failed as blocked. The
    integration gate runs once scheduling stops.

    Args:
        repo: Path to the repository containing ``.director/plan.json``.
        cfg: Run configuration.
        parallel: Maximum number of nodes in flight; values below 1 are
            treated as 1.
        max_attempts: Executor-tier attempts per node before escalation.
        log: Callable taking one message string.

    Returns:
        A summary dict with node id lists (``done``, ``failed``, ``escalated``,
        ``reviewed``, ``review_blocked``), the integration result, cost
        breakdowns, wall time and derived percentage rates.
    """
    # A non-positive value would make the in-flight cap below
    # (`len(futures) >= parallel`) true before anything is submitted, so no node
    # would ever launch and every node would be written off as "blocked".
    parallel = max(1, parallel)

    repo = Path(repo).resolve()
    fdir = repo / ".director"
    setup.ensure_director_gitignore(repo)  # never let `git add -A` commit .director runtime files
    plan = Plan.from_json((fdir / "plan.json").read_text())
    state = RunState.load_or_init(repo, plan)
    ledger = CostLedger(fdir / "costs.jsonl")
    metrics = MetricsWriter(fdir / "metrics.jsonl")
    logs = fdir / "logs"
    run_t0 = time.perf_counter()
    # Worktrees live OUTSIDE the repo tree: a worktree nested inside the repo lets
    # OpenCode resolve the enclosing repo as the project root and leak edits out of
    # the isolated checkout. A sibling temp dir keeps each worktree its own root.
    wt_root = Path(tempfile.gettempdir()) / "director-worktrees" / plan.job_id
    wt_root.mkdir(parents=True, exist_ok=True)
    git_lock = threading.Lock()

    if gitutil.current_branch(repo) != plan.job_branch:
        gitutil.checkout(plan.job_branch, repo)

    dag.validate(plan)
    done = state.done_ids()
    # `finished` = every node in a TERMINAL state (done | failed | escalated). The
    # scheduler keys off this, not `done`: a node that fails must never be
    # re-scheduled, and the loop must end even when not every node succeeded.
    # (Seeded from state so a resumed run doesn't retry already-failed nodes.)
    finished = {nid for nid, ns in state.nodes.items() if ns.status in (DONE, FAILED, ESCALATED)}
    active: set[str] = set()
    log(
        f"[run] job={plan.job_id} branch={plan.job_branch} "
        f"nodes={len(plan.nodes)} done={len(done)} parallel={parallel}"
    )

    def launch(node_id: str) -> NodeOutcome:
        """Create a fresh worktree for the node, then run its attempt ladder."""
        node = plan.node(node_id)
        with git_lock:
            wt = wt_root / node_id
            gitutil.worktree_remove(wt, repo)  # no-op if not registered
            shutil.rmtree(wt, ignore_errors=True)
            # drop any stale registration left by a killed run (dir gone but git
            # still tracks it) so `worktree add` below can't fail with exit 255.
            gitutil.git(["worktree", "prune"], repo, check=False)
            task_branch = f"director/task-{plan.job_id}-{node_id}"
            if gitutil.branch_exists(task_branch, repo):
                gitutil.git(["branch", "-D", task_branch], repo, check=False)
            gitutil.worktree_add(wt, task_branch, plan.job_branch, repo)
            state[node_id].status = RUNNING
            state[node_id].worktree = str(wt)
            state.save()
        node_t0 = time.perf_counter()
        outcome = _process_node(node, wt, cfg, logs, max_attempts, log)
        outcome.wall_secs = round(time.perf_counter() - node_t0, 1)
        return outcome

    aborted = False
    with ThreadPoolExecutor(max_workers=parallel) as pool:
        futures: dict = {}
        while len(finished) < len(plan.nodes) and not aborted:
            # exclude both running and already-terminal nodes; deps are satisfied
            # only by nodes that actually succeeded (`done`).
            for nid in dag.ready_nodes(plan, done, active | finished):
                if len(futures) >= parallel:
                    break
                active.add(nid)
                futures[pool.submit(launch, nid)] = nid
            if not futures:
                # nothing running and nothing runnable → the rest are blocked by a
                # failed/unsatisfiable dependency. Mark them failed and stop (never
                # spin re-scheduling terminal nodes).
                blocked = [n.id for n in plan.nodes if n.id not in finished]
                if blocked:
                    log(
                        f"[run] cannot proceed — {len(blocked)} node(s) blocked by "
                        f"failed/unsatisfiable deps: {', '.join(blocked)}"
                    )
                    for nid in blocked:
                        ns = state[nid]
                        ns.status = FAILED
                        ns.error = ns.error or "blocked by a failed dependency"
                        finished.add(nid)
                    state.save()
                break
            completed, _ = wait(list(futures), return_when=FIRST_COMPLETED)
            for fut in completed:
                nid = futures.pop(fut)
                active.discard(nid)
                outcome = fut.result()
                try:
                    _finalize(outcome, plan, state, repo, git_lock, ledger, cfg, log, metrics)
                except CostCeilingExceeded as e:
                    log(f"[run] ABORT: {e}")
                    aborted = True
                finished.add(nid)  # terminal regardless of pass/fail → never re-scheduled
                if outcome.ok and state[nid].status == DONE:
                    done.add(nid)

    integ = integration_gate(repo, cfg)
    log(
        f"[run] integration gate: "
        f"{'PASS' if integ.ok else 'FAIL (' + ', '.join(integ.failures) + ')'}"
    )

    state.save()
    n = len(plan.nodes)
    done_l = sorted(done)
    escalated_l = [nd.id for nd in plan.nodes if state[nd.id].escalated]
    reviewed_l = [nd.id for nd in plan.nodes if state[nd.id].review_stage_two]
    # executor-tier completion = done without ever escalating (the hypothesis metric)
    escalated_ids = set(escalated_l)  # built once, not once per done node
    exec_done = [nid for nid in done_l if nid not in escalated_ids]
    wall = round(time.perf_counter() - run_t0, 1)
    result = {
        "job_id": plan.job_id,
        "done": done_l,
        "failed": [nd.id for nd in plan.nodes if state[nd.id].status == FAILED],
        "escalated": escalated_l,
        "reviewed": reviewed_l,
        "review_blocked": [nd.id for nd in plan.nodes if state[nd.id].review_blocks],
        "integration_ok": integ.ok,
        "integration_detail": integ.detail,
        "cost_total": ledger.total(),
        "by_role": ledger.by_role(),
        "by_model": ledger.by_model(),
        "wall_secs": wall,
        "n_nodes": n,
        "executor_tier_completion": len(exec_done),
        "executor_tier_pct": round(100 * len(exec_done) / n, 1) if n else 0.0,
        "escalation_rate": round(100 * len(escalated_l) / n, 1) if n else 0.0,
        "stage_two_trigger_rate": round(100 * len(reviewed_l) / n, 1) if n else 0.0,
    }
    # run-level metrics record (Phase 3): derived rates + the resolved tier map so a
    # metrics line is self-describing about which models produced it.
    metrics.write(
        {
            "kind": "run",
            "job_id": plan.job_id,
            "tiers": dict(cfg.tiers),
            "n_nodes": n,
            "done": len(done_l),
            "failed": len(result["failed"]),
            "escalated": len(escalated_l),
            "executor_tier_completion": len(exec_done),
            "executor_tier_pct": result["executor_tier_pct"],
            "escalation_rate": result["escalation_rate"],
            "stage_two_trigger_rate": result["stage_two_trigger_rate"],
            "integration_ok": integ.ok,
            "wall_secs": wall,
            "cost_total": ledger.total(),
            "by_role": ledger.by_role(),
            "by_model": ledger.by_model(),
        }
    )
    return result
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def _finalize(
    outcome: NodeOutcome,
    plan: Plan,
    state: RunState,
    repo: Path,
    git_lock,
    ledger: CostLedger,
    cfg: Config,
    log,
    metrics: MetricsWriter,
):
    """Record a finished node: state, cost, merge/cleanup and metrics.

    Copies the outcome onto the node's state entry, writes one ledger entry per
    model call, merges the task branch when the node passed (a failed merge is
    aborted and marks the node failed), removes the worktree and task branch,
    and writes a per-node metrics record.

    Raises:
        CostCeilingExceeded: when a cost ceiling is configured and the ledger
            total is above it after this node's calls were recorded.
    """
    node_id = outcome.node_id
    ns = state[node_id]
    ns.attempts = outcome.attempts
    ns.escalated = outcome.escalated
    ns.tier_used = outcome.tier
    ns.model_used = outcome.model
    ns.tokens = outcome.tokens
    ns.review_stage_two = outcome.review_stage_two
    ns.review_blocks = outcome.review_blocks
    ns.review_summary = outcome.review_summary
    ns.wall_secs = outcome.wall_secs
    ns.watch_it_fail = (outcome.watch_it_fail or {}).get("verdict")
    ns.flake_failed = outcome.flake_failed

    # precise cost: one ledger entry per model call, tagged with its tier/role.
    # A per-node by-role breakdown is accumulated alongside for the metrics record.
    node_cost = 0.0
    node_by_role: dict[str, dict] = {}
    for role, model, tokens in outcome.calls:
        node_cost += ledger.record(role=role, model=model, tokens=tokens, cfg=cfg, node=node_id)
        bucket = node_by_role.setdefault(role, {"input": 0, "output": 0, "cost": 0.0, "calls": 0})
        bucket["input"] += int(tokens.get("input", 0))
        bucket["output"] += int(tokens.get("output", 0))
        bucket["cost"] += cost_of(model, tokens, cfg)
        bucket["calls"] += 1
    ns.cost_usd = node_cost

    task_branch = f"director/task-{plan.job_id}-{node_id}"
    with git_lock:
        if outcome.ok and outcome.worktree:
            gitutil.commit_all(f"director: node {node_id} via {outcome.tier}", outcome.worktree)
            merge = gitutil.merge_branch(
                task_branch, repo, message=f"director: merge node {node_id}"
            )
            if merge.returncode == 0:
                ns.status = DONE
            else:
                gitutil.git(["merge", "--abort"], repo, check=False)
                ns.status = FAILED
                ns.error = f"merge conflict: {merge.stdout}{merge.stderr}"[:300]
                log(f"[run] {node_id} MERGE FAILED")
        else:
            ns.status = ESCALATED if outcome.escalated else FAILED
            ns.error = outcome.error
        if outcome.worktree and Path(outcome.worktree).exists():
            gitutil.worktree_remove(outcome.worktree, repo)
            shutil.rmtree(outcome.worktree, ignore_errors=True)
        gitutil.git(["branch", "-D", task_branch], repo, check=False)
        state.save()

    node = plan.node(node_id)
    node_record = {
        "kind": "node",
        "job_id": plan.job_id,
        "node": node_id,
        "title": node.title,
        "difficulty": node.estimated_difficulty,
        "status": ns.status,
        "tier_used": outcome.tier,
        "model_used": outcome.model,
        "attempts": outcome.attempts,
        "escalated": outcome.escalated,
        "wall_secs": outcome.wall_secs,
        "tokens": outcome.tokens,
        "cost_usd": round(node_cost, 6),
        "by_role": node_by_role,
        "review_stage_two": outcome.review_stage_two,
        "review_blocks": outcome.review_blocks,
        "watch_it_fail": outcome.watch_it_fail or {"verdict": "unknown"},
        "flake_failed": outcome.flake_failed,
    }
    metrics.write(node_record)

    if cfg.cost_ceiling and ledger.total() > cfg.cost_ceiling:
        raise CostCeilingExceeded(f"cost ${ledger.total():.4f} > ${cfg.cost_ceiling:.2f}")
|