director-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- director/README.md +124 -0
- director/__init__.py +10 -0
- director/__main__.py +4 -0
- director/agent_templates/brainstorm.md +44 -0
- director/agent_templates/executor.md +37 -0
- director/agent_templates/explorer.md +24 -0
- director/agent_templates/opencode.json +39 -0
- director/agent_templates/planner.md +60 -0
- director/agent_templates/reviewer.md +46 -0
- director/agent_templates/test-author.md +29 -0
- director/bench.py +234 -0
- director/cli.py +166 -0
- director/config.example.toml +75 -0
- director/config.py +111 -0
- director/cost.py +84 -0
- director/dag.py +113 -0
- director/gates.py +145 -0
- director/gitutil.py +83 -0
- director/metrics.py +48 -0
- director/models.py +106 -0
- director/opencode.py +231 -0
- director/plan.py +523 -0
- director/report.py +103 -0
- director/review.py +153 -0
- director/run.py +444 -0
- director/setup.py +101 -0
- director/state.py +43 -0
- director_cli-0.3.0.dist-info/METADATA +174 -0
- director_cli-0.3.0.dist-info/RECORD +32 -0
- director_cli-0.3.0.dist-info/WHEEL +4 -0
- director_cli-0.3.0.dist-info/entry_points.txt +2 -0
- director_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
director/plan.py
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
"""`director plan` — brainstorm → spec → decompose → test-gated DAG.
|
|
2
|
+
|
|
3
|
+
Phase 2.5 turns planning into a re-entrant pipeline with two artifact-based
|
|
4
|
+
approval gates. director writes an artifact and then either pauses (interactive)
|
|
5
|
+
or auto-approves (`--auto`); the human and the self-critic are mechanically the
|
|
6
|
+
same gate — both read an artifact, decide, and continue.
|
|
7
|
+
|
|
8
|
+
Stage 0 job branch + synced agents (so `--agent <role>` resolves correctly)
|
|
9
|
+
recon explorer (cheap) reads the repo → .director/recon.md
|
|
10
|
+
Stage A planner-tier brainstorm/spec → .director/spec.md → GATE 1
|
|
11
|
+
Stage B planner decomposes the SPEC → .director/plan.json
|
|
12
|
+
Stage C test-author writes failing tests (committed, hashed) → GATE 2
|
|
13
|
+
READY approved; `director run` may execute
|
|
14
|
+
|
|
15
|
+
Resumption is driven by `.director/plan_stage.json`. `director plan "<task>"`
|
|
16
|
+
starts fresh; `director plan --continue` advances the current gate; `--auto`
|
|
17
|
+
swaps a planner self-critique into the gate so nothing blocks.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import hashlib
|
|
23
|
+
import json
|
|
24
|
+
import re
|
|
25
|
+
import subprocess
|
|
26
|
+
import time
|
|
27
|
+
from dataclasses import asdict, dataclass
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
|
|
30
|
+
from director import gitutil, setup
|
|
31
|
+
from director.config import Config
|
|
32
|
+
from director.cost import CostLedger
|
|
33
|
+
from director.dag import topo_order, validate
|
|
34
|
+
from director.models import Node, Plan
|
|
35
|
+
from director.opencode import run_agent
|
|
36
|
+
from director.setup import sync_agents
|
|
37
|
+
|
|
38
|
+
# Stage names persisted to .director/plan_stage.json. SPEC and DECOMPOSE are
# transient: one invocation executes them and then advances. GATE_SPEC,
# GATE_PLAN and READY are the points where an invocation can stop.
SPEC = "spec"
GATE_SPEC = "gate_spec"
DECOMPOSE = "decompose"
GATE_PLAN = "gate_plan"
READY = "ready"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class PlanProgress:
    """Resumable planning state, stored as JSON in `.director/plan_stage.json`."""

    job_id: str
    task: str
    job_branch: str
    stage: str
    auto: bool
    critique: bool

    @staticmethod
    def path(repo: Path) -> Path:
        """Return the location of the progress file inside *repo*."""
        return Path(repo).joinpath(".director", "plan_stage.json")

    @classmethod
    def load(cls, repo: Path) -> PlanProgress | None:
        """Read the saved progress for *repo*; return None when no file exists."""
        stage_file = cls.path(repo)
        if stage_file.exists():
            saved_fields = json.loads(stage_file.read_text())
            return cls(**saved_fields)
        return None

    def save(self, repo: Path) -> None:
        """Write this progress under *repo*, creating `.director/` if needed."""
        stage_file = self.path(repo)
        stage_file.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(asdict(self), indent=2)
        stage_file.write_text(payload)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class PlanResult:
    """Outcome of one `director plan` invocation."""

    # True = stopped at a human gate; False = reached READY.
    paused: bool
    stage: str
    job_id: str
    job_branch: str
    n_nodes: int
    # Path the human should review next (when paused).
    artifact: str
    message: str
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# --------------------------------------------------------------------------- #
|
|
88
|
+
# prompts
|
|
89
|
+
# --------------------------------------------------------------------------- #
|
|
90
|
+
def _explorer_prompt(task: str) -> str:
|
|
91
|
+
return (
|
|
92
|
+
f"Recon for this task — read-only. Produce the relevant-files summary "
|
|
93
|
+
f"per your instructions.\n\nTASK:\n{task}"
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _brainstorm_prompt(task: str, summary: str) -> str:
|
|
98
|
+
return (
|
|
99
|
+
"Produce the design spec for this task per your instructions. Output ONLY "
|
|
100
|
+
"the spec Markdown.\n\n"
|
|
101
|
+
f"TASK:\n{task}\n\n"
|
|
102
|
+
f"REPO RECON SUMMARY:\n{summary}\n"
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _spec_critique_prompt(task: str, spec: str) -> str:
|
|
107
|
+
return (
|
|
108
|
+
"Self-critique pass. Silently re-read the spec below against the ORIGINAL "
|
|
109
|
+
"request and note anything missing, ambiguous, or contradictory. Then output "
|
|
110
|
+
"the REVISED spec that fixes those issues.\n"
|
|
111
|
+
"Output ONLY the final revised spec, in the same Markdown format and starting "
|
|
112
|
+
"at its `# Spec:` heading. Do NOT include your critique notes, a changelog, "
|
|
113
|
+
"or any preamble — the output replaces the spec file verbatim.\n\n"
|
|
114
|
+
f"ORIGINAL REQUEST:\n{task}\n\n"
|
|
115
|
+
f"CURRENT SPEC:\n{spec}\n"
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _planner_prompt(spec: str, summary: str) -> str:
|
|
120
|
+
return (
|
|
121
|
+
"Decompose the APPROVED SPEC below into a strict-JSON DAG per your "
|
|
122
|
+
"instructions. Build from the spec, not from a raw task. Output ONLY the "
|
|
123
|
+
"JSON object.\n\n"
|
|
124
|
+
f"APPROVED SPEC:\n{spec}\n\n"
|
|
125
|
+
f"REPO RECON SUMMARY:\n{summary}\n"
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _plan_critique_prompt(spec: str, plan_json: str) -> str:
|
|
130
|
+
return (
|
|
131
|
+
"Self-critique pass on your own DAG. Re-read the plan below against the "
|
|
132
|
+
"approved spec: are any acceptance criteria unaddressed? any node "
|
|
133
|
+
"under-specified for a junior engineer? any two independent nodes sharing "
|
|
134
|
+
"a file? \n"
|
|
135
|
+
"Respond with a SINGLE strict-JSON object and nothing else:\n"
|
|
136
|
+
' {"revised": false} — if the plan already covers the spec, OR\n'
|
|
137
|
+
' {"revised": true, "nodes": [ ...full revised node list... ]}\n'
|
|
138
|
+
"When revising, emit the COMPLETE node list (same schema as before), not a diff.\n\n"
|
|
139
|
+
f"APPROVED SPEC:\n{spec}\n\n"
|
|
140
|
+
f"CURRENT PLAN:\n{plan_json}\n"
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _testauthor_prompt(node: Node) -> str:
|
|
145
|
+
return (
|
|
146
|
+
"Write acceptance tests for exactly this node, in the listed test file(s), "
|
|
147
|
+
"and nothing else. Confirm they FAIL before implementation exists.\n\n"
|
|
148
|
+
f"NODE: {node.id} — {node.title}\n\n"
|
|
149
|
+
f"SPEC:\n{node.spec}\n\n"
|
|
150
|
+
f"TEST FILE(S) TO CREATE: {', '.join(node.tests)}\n"
|
|
151
|
+
f"IMPLEMENTATION FILES (do NOT create/implement these): {', '.join(node.files)}\n"
|
|
152
|
+
f"The test command will be: {node.test_cmd}\n"
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# --------------------------------------------------------------------------- #
|
|
157
|
+
# helpers
|
|
158
|
+
# --------------------------------------------------------------------------- #
|
|
159
|
+
def _job_id() -> str:
|
|
160
|
+
return time.strftime("%Y%m%d-%H%M%S")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _extract_json(text: str) -> dict:
|
|
164
|
+
"""Pull a JSON object out of a reply, tolerating code fences or stray prose."""
|
|
165
|
+
text = text.strip()
|
|
166
|
+
fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
|
167
|
+
if fence:
|
|
168
|
+
text = fence.group(1)
|
|
169
|
+
try:
|
|
170
|
+
return json.loads(text)
|
|
171
|
+
except json.JSONDecodeError:
|
|
172
|
+
pass
|
|
173
|
+
start, end = text.find("{"), text.rfind("}")
|
|
174
|
+
if start != -1 and end > start:
|
|
175
|
+
return json.loads(text[start : end + 1])
|
|
176
|
+
raise ValueError("agent did not return parseable JSON")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _run_shell(cmd: str, cwd: Path) -> int:
|
|
180
|
+
import os
|
|
181
|
+
|
|
182
|
+
env = {**os.environ, "PYTHONDONTWRITEBYTECODE": "1"} # keep the worktree clean
|
|
183
|
+
return subprocess.run(
|
|
184
|
+
cmd, cwd=str(cwd), shell=True, capture_output=True, text=True, env=env
|
|
185
|
+
).returncode
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _sha256(path: Path) -> str:
|
|
189
|
+
return hashlib.sha256(path.read_bytes()).hexdigest()
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _build_plan(data: dict, prog: PlanProgress, repo: Path) -> Plan:
    """Assemble a Plan from parsed planner JSON plus this job's metadata.

    `data["nodes"]` supplies the node dicts; job id, task and branch come from
    *prog*, and `created_at` is stamped from the local clock.
    """
    parsed_nodes = []
    for raw_node in data["nodes"]:
        parsed_nodes.append(Node.from_dict(raw_node))
    stamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    return Plan(
        job_id=prog.job_id,
        task=prog.task,
        repo=str(repo),
        created_at=stamp,
        job_branch=prog.job_branch,
        nodes=parsed_nodes,
    )
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# --------------------------------------------------------------------------- #
|
|
205
|
+
# stages
|
|
206
|
+
# --------------------------------------------------------------------------- #
|
|
207
|
+
def _recon(prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log) -> str:
    """Run the explorer agent for the task and store its summary in `.director/recon.md`.

    Token usage is recorded on *ledger* before the outcome is checked. Returns the
    summary text ("(no summary)" when the agent produced none); raises
    RuntimeError when the run is not ok.
    """
    log(f"[plan] explorer recon ({cfg.model_for('explorer')}) …")
    outcome = run_agent(
        agent="explorer",
        model=cfg.model_for("explorer"),
        message=_explorer_prompt(prog.task),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-explorer.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(
        role="explorer",
        model=cfg.model_for("explorer"),
        tokens=outcome.tokens,
        cfg=cfg,
    )
    if outcome.ok:
        recon_text = outcome.text if outcome.text else "(no summary)"
        recon_file = repo / ".director" / "recon.md"
        recon_file.write_text(recon_text)
        return recon_text
    raise RuntimeError(
        f"explorer failed: {outcome.error or outcome.returncode} (see {outcome.log_path})"
    )
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _stage_a_spec(
    prog: PlanProgress, repo: Path, cfg: Config, summary: str, ledger: CostLedger, logs: Path, log
) -> None:
    """Stage A: have the brainstorm agent write the spec to `.director/spec.md`.

    Args:
        prog: current plan progress (supplies the task and job id).
        repo: repository root.
        cfg: configuration; the planner-tier model and node timeout are used.
        summary: recon summary passed to the agent as context.
        ledger: cost ledger; the call is recorded before its outcome is checked.
        logs: directory for the agent's log file.
        log: callable taking one message string.

    Raises:
        RuntimeError: if the agent run is not ok, or it succeeded but returned
            no spec text.
    """
    log(f"[plan] Stage A brainstorm/spec ({cfg.model_for('planner')}) …")
    bs = run_agent(
        agent="brainstorm",
        model=cfg.model_for("planner"),
        message=_brainstorm_prompt(prog.task, summary),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-brainstorm.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(role="planner", model=cfg.model_for("planner"), tokens=bs.tokens, cfg=cfg)
    if not bs.ok:
        raise RuntimeError(f"brainstorm failed: {bs.error or bs.returncode} (see {bs.log_path})")
    # Tolerate a missing reply as well as a blank one.
    spec_text = (bs.text or "").strip()
    if not spec_text:
        # The run itself succeeded, so reporting its return code would mislead
        # (it would read "brainstorm failed: 0").
        raise RuntimeError(f"brainstorm returned an empty spec (see {bs.log_path})")
    (repo / ".director" / "spec.md").write_text(spec_text + "\n")
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _critique_spec(
    prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log
) -> None:
    """`--auto` spec gate: ask the brainstorm agent to critique and revise the spec.

    Reads `.director/spec.md`, sends it with the original task, and overwrites the
    file with the reply when the run is ok and non-empty. Otherwise the existing
    spec is kept and the fact is logged; this step never raises on a bad critique.
    The call is recorded on *ledger* either way.
    """
    spec_file = repo / ".director" / "spec.md"
    spec = spec_file.read_text()
    log(f"[plan] --auto: spec self-critique ({cfg.model_for('planner')}) …")
    cr = run_agent(
        agent="brainstorm",
        model=cfg.model_for("planner"),
        message=_spec_critique_prompt(prog.task, spec),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-spec-critique.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(role="planner", model=cfg.model_for("planner"), tokens=cr.tokens, cfg=cfg)
    # Tolerate a missing reply as well as a blank one.
    revised = (cr.text or "").strip()
    if cr.ok and revised:
        spec_file.write_text(revised + "\n")
        log("[plan] spec revised by self-critique.")
    else:
        # Say so explicitly so the operator knows the spec was not re-checked.
        log("[plan] spec self-critique failed or returned nothing; keeping the original spec.")
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _author_tests(plan: Plan, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log) -> None:
    """Stage C: test-author writes per-node tests, then commit, hash, and verify red.

    Steps:
      1. For each node in topological order, run the test-author agent and record
         its cost.
      2. Commit everything in the repo as the job's acceptance tests.
      3. Store a SHA-256 per declared test file on the node and rewrite
         `.director/plan.json`.
      4. Run each node's test command and warn about any that already pass.

    Intended to be safe to re-run after a plan revision (test files are rewritten).

    Raises:
        RuntimeError: if the test-author run for any node is not ok.
    """
    for node in [plan.node(i) for i in topo_order(plan)]:
        log(
            f"[plan] test-author: {node.id} → {', '.join(node.tests)} "
            f"({cfg.model_for('test_author')}) …"
        )
        ta = run_agent(
            agent="test-author",
            model=cfg.model_for("test_author"),
            message=_testauthor_prompt(node),
            cwd=repo,
            log_path=logs / f"{plan.job_id}-tests-{node.id}.jsonl",
            timeout=cfg.node_timeout,
        )
        ledger.record(
            role="test_author",
            model=cfg.model_for("test_author"),
            tokens=ta.tokens,
            cfg=cfg,
            node=node.id,
        )
        if not ta.ok:
            raise RuntimeError(f"test-author failed on {node.id}: {ta.error or ta.returncode}")
    gitutil.commit_all(f"director: acceptance tests for job {plan.job_id}", repo)

    # Hash the committed test files: the node gate refuses to pass if the executor
    # later edits the contract. Captured by director, not the planner.
    unhashed = []
    for node in plan.nodes:
        node.test_hashes = {}
        for t in node.tests:
            tp = repo / t
            # is_file(), not exists(): hashing a directory would raise.
            if tp.is_file():
                node.test_hashes[t] = _sha256(tp)
            else:
                unhashed.append(f"{node.id}:{t}")
    (repo / ".director" / "plan.json").write_text(plan.to_json())
    if unhashed:
        # Without a hash there is nothing to compare against later, so surface it.
        log(
            f"[plan] WARNING: declared test file(s) missing after test-author, "
            f"no hash recorded: {', '.join(unhashed)}"
        )

    not_red = [n.id for n in plan.nodes if _run_shell(n.test_cmd, repo) == 0]
    if not_red:
        log(
            f"[plan] WARNING: tests did NOT fail first (not red) for: "
            f"{', '.join(not_red)} — their contract is suspect."
        )
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _stage_bc_decompose(
    prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log
) -> Plan:
    """Stages B+C: decompose the spec into a validated plan, then author its tests.

    Reads `.director/spec.md` (and `.director/recon.md` when present), asks the
    planner agent for a JSON DAG, validates it, writes `.director/plan.json`, and
    hands the plan to `_author_tests`. Raises RuntimeError when the planner run is
    not ok.
    """
    director_dir = repo / ".director"
    recon_file = director_dir / "recon.md"
    if recon_file.exists():
        summary = recon_file.read_text()
    else:
        summary = "(no recon)"
    spec = (director_dir / "spec.md").read_text()

    log(f"[plan] Stage B decompose ({cfg.model_for('planner')}) …")
    reply = run_agent(
        agent="planner",
        model=cfg.model_for("planner"),
        message=_planner_prompt(spec, summary),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-planner.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(
        role="planner",
        model=cfg.model_for("planner"),
        tokens=reply.tokens,
        cfg=cfg,
    )
    if not reply.ok:
        raise RuntimeError(
            f"planner failed: {reply.error or reply.returncode} (see {reply.log_path})"
        )

    plan = _build_plan(_extract_json(reply.text), prog, repo)
    validate(plan)
    (director_dir / "plan.json").write_text(plan.to_json())
    node_ids = ", ".join(n.id for n in plan.nodes)
    log(f"[plan] {len(plan.nodes)} nodes: {node_ids}")

    _author_tests(plan, repo, cfg, ledger, logs, log)
    return plan
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _critique_plan(
    plan: Plan, prog: PlanProgress, repo: Path, cfg: Config, ledger: CostLedger, logs: Path, log
) -> Plan:
    """`--auto` plan gate: ask the planner to critique its own DAG.

    The critique is best-effort. The original *plan* is returned unchanged when
    the agent run is not ok, the reply holds no JSON object, the reply says
    `"revised": false`, or the revised node list cannot be built and validated.
    Only a valid revision replaces `.director/plan.json` and triggers test
    re-authoring.

    Returns:
        The revised plan when one was accepted, otherwise *plan*.
    """
    spec = (repo / ".director" / "spec.md").read_text()
    log(f"[plan] --auto: plan self-critique ({cfg.model_for('planner')}) …")
    cr = run_agent(
        agent="planner",
        model=cfg.model_for("planner"),
        message=_plan_critique_prompt(spec, plan.to_json()),
        cwd=repo,
        log_path=logs / f"{prog.job_id}-plan-critique.jsonl",
        timeout=cfg.node_timeout,
    )
    ledger.record(role="planner", model=cfg.model_for("planner"), tokens=cr.tokens, cfg=cfg)
    if not cr.ok:
        log("[plan] plan self-critique failed; keeping the original plan.")
        return plan
    try:
        data = _extract_json(cr.text or "")
    except ValueError:
        data = None
    if not isinstance(data, dict):
        # Covers unparseable replies and valid JSON that is not an object.
        log("[plan] plan self-critique returned no JSON; keeping the original plan.")
        return plan
    if not data.get("revised"):
        log("[plan] self-critique: plan already covers the spec.")
        return plan

    # Build and validate BEFORE touching plan.json, so a malformed revision (for
    # example a missing "nodes" list or a broken DAG) cannot abort the run or
    # replace a plan that was already valid.
    try:
        revised = _build_plan(data, prog, repo)
        validate(revised)
    except Exception as err:  # exception types of Node.from_dict/validate are not visible here
        log(
            f"[plan] self-critique produced an invalid revision ({err!r}); "
            "keeping the original plan."
        )
        return plan

    log("[plan] self-critique revised the DAG; re-authoring tests for the new plan.")
    (repo / ".director" / "plan.json").write_text(revised.to_json())
    _author_tests(revised, repo, cfg, ledger, logs, log)
    return revised
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# --------------------------------------------------------------------------- #
|
|
374
|
+
# the re-entrant driver
|
|
375
|
+
# --------------------------------------------------------------------------- #
|
|
376
|
+
def run_plan(
    task: str | None,
    repo: str,
    cfg: Config,
    log,
    *,
    auto: bool = False,
    critique: bool = True,
    cont: bool = False,
) -> PlanResult:
    """Start or resume the planning pipeline and run it to its next stopping point.

    Args:
        task: task description; required to start a new plan.
        repo: path of the target repository.
        cfg: configuration passed through to the stages.
        log: callable taking one message string.
        auto: do not pause at the gates; approve them automatically.
        critique: in auto mode, run the self-critique pass at each gate.
        cont: resume the plan recorded in `.director/plan_stage.json`.

    Returns:
        A PlanResult with `paused=True` when stopped at a human gate, or
        `paused=False` once READY is reached.

    Raises:
        RuntimeError: nothing to continue, a plan already in progress, a missing
            task, an already existing job branch, or an unrecognised saved stage.
    """
    repo = Path(repo).resolve()
    fdir = repo / ".director"
    logs = fdir / "logs"
    setup.ensure_director_gitignore(repo)  # never let `git add -A` commit .director runtime files
    ledger = CostLedger(fdir / "costs.jsonl")
    prog = PlanProgress.load(repo)

    if cont:
        if prog is None:
            raise RuntimeError(
                'nothing to continue: no plan in progress (run `director plan "<task>"` first)'
            )
        # human approval advances the current gate
        if prog.stage == GATE_SPEC:
            prog.stage = DECOMPOSE
        elif prog.stage == GATE_PLAN:
            prog.stage = READY
        elif prog.stage == READY:
            log("[plan] already approved and ready — run `director run`.")
        # Carry the flags chosen at start. --auto on --continue switches auto on.
        # --no-critique is sticky: critique runs only if neither the original
        # invocation nor this one disabled it. (`critique` is consulted only in
        # auto mode, so taking the stored value only when NOT auto meant the
        # stored choice was never actually used.)
        auto = auto or prog.auto
        critique = critique and prog.critique
        if gitutil.current_branch(repo) != prog.job_branch:
            gitutil.checkout(prog.job_branch, repo)
    else:
        if prog is not None and prog.stage != READY:
            raise RuntimeError(
                f"a plan is already in progress at stage '{prog.stage}' "
                f"(job {prog.job_id}). Use `director plan --continue`, or remove "
                f"{PlanProgress.path(repo)} to start over."
            )
        if not task:
            raise RuntimeError("a task description is required to start a new plan")
        job_id = _job_id()
        job_branch = f"director/job-{job_id}"
        prog = PlanProgress(
            job_id=job_id,
            task=task,
            job_branch=job_branch,
            stage=SPEC,
            auto=auto,
            critique=critique,
        )
        # Stage 0: job branch + agents BEFORE any agent call, so `--agent <role>`
        # resolves the synced role prompt instead of falling back to the default.
        base = gitutil.current_commit(repo)
        if gitutil.branch_exists(job_branch, repo):
            raise RuntimeError(f"branch {job_branch} already exists")
        gitutil.create_branch(job_branch, repo, base)
        gitutil.checkout(job_branch, repo)
        sync_agents(repo)
        gitutil.commit_all(f"director: scaffold agents for job {job_id}", repo)
        _recon(prog, repo, cfg, ledger, logs, log)

    prog.auto, prog.critique = auto, critique
    plan: Plan | None = None

    # advance through stages until a human gate pauses us or we reach READY
    while True:
        if prog.stage == SPEC:
            _stage_a_spec(
                prog, repo, cfg, (repo / ".director" / "recon.md").read_text(), ledger, logs, log
            )
            prog.stage = GATE_SPEC
            prog.save(repo)
            if not auto:
                return _paused(
                    prog,
                    fdir,
                    "spec.md",
                    ledger,
                    "Stage A complete. Review/edit .director/spec.md, then "
                    "`director plan --continue`.",
                )
            if critique:
                _critique_spec(prog, repo, cfg, ledger, logs, log)
            prog.stage = DECOMPOSE
            prog.save(repo)
            continue

        if prog.stage == DECOMPOSE:
            plan = _stage_bc_decompose(prog, repo, cfg, ledger, logs, log)
            prog.stage = GATE_PLAN
            prog.save(repo)
            if not auto:
                return _paused(
                    prog,
                    fdir,
                    "plan.json",
                    ledger,
                    f"Stages B+C complete: {len(plan.nodes)} nodes, tests "
                    f"committed (red). Review .director/plan.json + the test "
                    f"files, then `director plan --continue` to enable `run`.",
                )
            if critique:
                plan = _critique_plan(plan, prog, repo, cfg, ledger, logs, log)
            prog.stage = READY
            prog.save(repo)
            continue

        if prog.stage == READY:
            prog.save(repo)
            if plan is None:
                plan = Plan.from_json((fdir / "plan.json").read_text())
            log(
                f"[plan] READY. job={prog.job_id} branch={prog.job_branch} "
                f"nodes={len(plan.nodes)} plan-cost=${ledger.total():.4f}"
            )
            return PlanResult(
                False,
                READY,
                prog.job_id,
                prog.job_branch,
                len(plan.nodes),
                str(fdir / "plan.json"),
                "Plan approved. Next: `director run`.",
            )

        # No branch matched: the saved stage is not one this driver handles
        # (for example a hand-edited plan_stage.json). Fail instead of looping forever.
        raise RuntimeError(
            f"unknown plan stage '{prog.stage}' in {PlanProgress.path(repo)}; "
            "remove the file to start over."
        )
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _paused(
    prog: PlanProgress, fdir: Path, artifact: str, ledger: CostLedger, message: str
) -> PlanResult:
    """Build the PlanResult for an invocation that stops at a human gate.

    The node count is read from `plan.json` in *fdir* when that file exists and
    parses; otherwise it is reported as 0. The ledger total is appended to
    *message*.
    """
    plan_file = fdir / "plan.json"
    node_count = 0
    if plan_file.exists():
        try:
            saved_plan = Plan.from_json(plan_file.read_text())
            node_count = len(saved_plan.nodes)
        except Exception:
            # Best effort only: an unreadable plan just reports zero nodes.
            node_count = 0
    return PlanResult(
        paused=True,
        stage=prog.stage,
        job_id=prog.job_id,
        job_branch=prog.job_branch,
        n_nodes=node_count,
        artifact=str(fdir / artifact),
        message=message + f" (plan-cost so far: ${ledger.total():.4f})",
    )
|
director/report.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Human-readable summaries for `director status` and the end of `director run`."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from director.models import Plan
|
|
8
|
+
from director.state import RunState
|
|
9
|
+
|
|
10
|
+
_STATUS_GLYPH = {
|
|
11
|
+
"done": "✅",
|
|
12
|
+
"pending": "·",
|
|
13
|
+
"running": "…",
|
|
14
|
+
"escalated": "⚠️ ",
|
|
15
|
+
"failed": "❌",
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def status_table(repo: str) -> str:
    """Render the per-node status table and summary metrics for `director status`.

    Args:
        repo: path of the repository containing `.director/plan.json`.

    Returns:
        A multi-line string, or a short hint when no plan file exists.
    """
    repo = Path(repo).resolve()
    plan_path = repo / ".director" / "plan.json"
    if not plan_path.exists():
        return 'No plan found. Run `director plan "<task>"` first.'
    plan = Plan.from_json(plan_path.read_text())
    state = RunState.load_or_init(repo, plan)
    # Look each node's state up once and reuse it for the table and the counters.
    node_states = [(n, state[n.id]) for n in plan.nodes]
    total = len(node_states)

    lines = [f"job {plan.job_id} ({plan.job_branch})", f"task: {plan.task}", ""]
    lines.append(f"{'node':24} {'status':10} {'tier':10} {'att':>3} {'cost':>9}")
    lines.append("-" * 60)
    for n, s in node_states:
        glyph = _STATUS_GLYPH.get(s.status, "?")
        lines.append(
            f"{n.id[:24]:24} {glyph} {s.status:8} "
            f"{(s.tier_used or '-'):10} {s.attempts:>3} ${s.cost_usd:>7.4f}"
        )
    done = sum(1 for _, s in node_states if s.status == "done")
    esc = sum(1 for _, s in node_states if s.escalated)
    reviewed = sum(1 for _, s in node_states if s.review_stage_two)
    blocked = sum(1 for _, s in node_states if s.review_blocks)
    wif_ok = sum(1 for _, s in node_states if s.watch_it_fail == "observed")
    flaky = sum(1 for _, s in node_states if s.flake_failed)
    lines += [
        "",
        f"{done}/{total} done, {esc} escalated, "
        f"{reviewed} stage-two reviewed, {blocked} re-opened by review",
    ]
    if total:
        # Count nodes that finished WITHOUT escalating. `done - esc` is only
        # right when every escalated node is also done; an escalated node that
        # is still running or failed would be subtracted from nodes it is not in.
        no_esc = sum(1 for _, s in node_states if s.status == "done" and not s.escalated)
        lines.append(
            f"executor-tier completion (no escalation): "
            f"{no_esc}/{total} = {100 * no_esc / total:.0f}% "
            f"(hypothesis target: >70%)"
        )
        lines.append(
            f"stage-two review trigger rate: "
            f"{reviewed}/{total} = {100 * reviewed / total:.0f}%"
        )
        lines.append(
            f"watch-it-fail observed (red before green): "
            f"{wif_ok}/{total}"
            + (f" ⚠️ {flaky} node(s) hit a flake re-run failure" if flaky else "")
        )
    return "\n".join(lines)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def run_summary(result: dict) -> str:
    """Format the end-of-run report from the run's *result* dict.

    Sections, in order: header, node outcome lists, integration gate (with the
    last 1500 characters of detail on failure), measurements (only when
    `n_nodes` is truthy), cost by role, cost by resolved model, and the total.
    """
    rule = "=" * 60
    out = ["", rule, f"RUN SUMMARY — job {result['job_id']}", rule]

    out.append(f"done: {', '.join(result['done']) or '(none)'}")
    if result["escalated"]:
        out.append(f"escalated: {', '.join(result['escalated'])}")
    if result.get("reviewed"):
        out.append(f"stage-two reviewed: {', '.join(result['reviewed'])}")
    if result.get("review_blocked"):
        out.append(f"review re-opened: {', '.join(result['review_blocked'])}")
    if result["failed"]:
        out.append(f"FAILED: {', '.join(result['failed'])}")

    gate = "PASS" if result["integration_ok"] else "FAIL"
    out.append(f"integration gate: {gate}")
    if not result["integration_ok"] and result.get("integration_detail"):
        # Only the tail of the detail is shown.
        out.append(result["integration_detail"][-1500:])

    if result.get("n_nodes"):
        out.extend(["", "measurement:"])
        out.append(
            f" executor-tier completion (no escalation): "
            f"{result['executor_tier_completion']}/{result['n_nodes']} = "
            f"{result['executor_tier_pct']:.0f}% (hypothesis target: >70%)"
        )
        out.append(f" escalation rate: {result['escalation_rate']:.0f}%")
        out.append(f" stage-two trigger rate: {result['stage_two_trigger_rate']:.0f}%")
        out.append(f" wall time: {result['wall_secs']:.0f}s")

    out.extend(["", "cost by role:"])
    out.extend(
        f" {role:12} {g['calls']:>2} calls "
        f"in={g['input']:>8} out={g['output']:>7} ${g['cost']:.4f}"
        for role, g in sorted(result["by_role"].items())
    )
    out.extend(["", "cost by resolved model:"])
    out.extend(
        f" {model:48} ${g['cost']:.4f}"
        for model, g in sorted(result["by_model"].items())
    )
    out.append(f"\nTOTAL: ${result['cost_total']:.4f}")
    return "\n".join(out)
|