director-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- director/README.md +124 -0
- director/__init__.py +10 -0
- director/__main__.py +4 -0
- director/agent_templates/brainstorm.md +44 -0
- director/agent_templates/executor.md +37 -0
- director/agent_templates/explorer.md +24 -0
- director/agent_templates/opencode.json +39 -0
- director/agent_templates/planner.md +60 -0
- director/agent_templates/reviewer.md +46 -0
- director/agent_templates/test-author.md +29 -0
- director/bench.py +234 -0
- director/cli.py +166 -0
- director/config.example.toml +75 -0
- director/config.py +111 -0
- director/cost.py +84 -0
- director/dag.py +113 -0
- director/gates.py +145 -0
- director/gitutil.py +83 -0
- director/metrics.py +48 -0
- director/models.py +106 -0
- director/opencode.py +231 -0
- director/plan.py +523 -0
- director/report.py +103 -0
- director/review.py +153 -0
- director/run.py +444 -0
- director/setup.py +101 -0
- director/state.py +43 -0
- director_cli-0.3.0.dist-info/METADATA +174 -0
- director_cli-0.3.0.dist-info/RECORD +32 -0
- director_cli-0.3.0.dist-info/WHEEL +4 -0
- director_cli-0.3.0.dist-info/entry_points.txt +2 -0
- director_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
director/README.md
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# `director` — the orchestrator (Phase 2 + 2.5 + 3)
|
|
2
|
+
|
|
3
|
+
A thin CLI that drives OpenCode headlessly to run the decomposition harness.
|
|
4
|
+
Stdlib-only (Python ≥ 3.11). The harness consumes configured OpenAI-compatible
|
|
5
|
+
endpoints; it never manages providers.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
director plan "<task>" [--repo .] # interactive: stops at each approval gate
|
|
9
|
+
director plan --continue # resume after editing/approving the gate artifact
|
|
10
|
+
director plan "<task>" --auto # planner self-critiques at each gate; no pause
|
|
11
|
+
director plan "<task>" --auto --no-critique # gates auto-pass, fully hands-off
|
|
12
|
+
director run [--repo .] [--parallel N] [--max-attempts K]
|
|
13
|
+
director status [--repo .]
|
|
14
|
+
director bench "<task>" --profiles all-frontier,cheap-cloud,local-first [--plan-profile P]
|
|
15
|
+
director sync-agents [--repo .] # (re)install role agents into <repo>/.opencode
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Flow
|
|
19
|
+
|
|
20
|
+
**plan** — a re-entrant pipeline with two artifact-based approval gates (Phase 2.5).
|
|
21
|
+
A job branch `director/job-<id>` is created and the role agents synced onto it first.
|
|
22
|
+
1. `explorer` (cheap tier) does read-only recon → `.director/recon.md`.
|
|
23
|
+
2. **Stage A — brainstorm/spec.** `brainstorm` (planner tier) does a Socratic
|
|
24
|
+
refinement pass and writes a readable design spec → `.director/spec.md`.
|
|
25
|
+
→ **Gate 1.**
|
|
26
|
+
3. **Stage B — decompose.** `planner` (planner tier) turns the *approved spec*
|
|
27
|
+
into a strict-JSON DAG → `.director/plan.json`. Each node: `id, title, spec
|
|
28
|
+
(junior-engineer standard), files (allowlist), depends_on, test_cmd, tests,
|
|
29
|
+
estimated_difficulty`. Validated: acyclic, deps resolve, **concurrent nodes
|
|
30
|
+
have disjoint allowlists**.
|
|
31
|
+
4. **Stage C — test authoring.** `test-author` (frontier tier) writes each node's
|
|
32
|
+
tests, committed to the job branch; director verifies they **fail first** (red)
|
|
33
|
+
and **hashes** each test file (the contract is then immutable). → **Gate 2.**
|
|
34
|
+
|
|
35
|
+
Gates are **artifact-based, not process-blocking**: director writes the artifact and
|
|
36
|
+
exits; the human edits/approves on disk and resumes with `--continue`. `--auto`
|
|
37
|
+
swaps a one-call planner **self-critique** into the same gate (re-read artifact vs.
|
|
38
|
+
the request, revise once); `--no-critique` makes gates auto-pass. Human and
|
|
39
|
+
self-critic are mechanically the same gate — only the approver differs.
|
|
40
|
+
|
|
41
|
+
**run** — for each node in dependency order (up to `--parallel` at once):
|
|
42
|
+
1. `git worktree add` an isolated task branch off the job branch.
|
|
43
|
+
2. Invoke `executor` (executor tier) with spec + allowlist file contents + the
|
|
44
|
+
failing test output. (Executor mandate: **watch it fail first**.)
|
|
45
|
+
3. **Deterministic gate** (exit codes only): test files byte-for-byte intact (hash),
|
|
46
|
+
`node.test_cmd` passes, AND the diff touches only the allowlist. On the pass
|
|
47
|
+
path, **flake control** (Phase 3) runs the tests `flake_runs` times in total (default
|
|
48
|
+
2); any mismatch fails the node as flaky.
|
|
49
|
+
4. **Two-stage review** (Phase 2.5), after the deterministic gate, before merge:
|
|
50
|
+
- *Stage one — spec compliance:* the deterministic gate above, plus an optional
|
|
51
|
+
advisory explorer-tier check (`review.stage_one_llm`, off by default).
|
|
52
|
+
- *Stage two — code quality (`reviewer` tier):* **cost-gated** — runs only when
|
|
53
|
+
the node escalated OR its diff touched > `review.stage_two_file_threshold`
|
|
54
|
+
files (default 3). Never runs on the cheap/local tier. A `critical` finding
|
|
55
|
+
blocks the merge and **re-opens the node** (counts against `max_attempts`).
|
|
56
|
+
5. Fail/blocked → feed the gate or review output back, retry up to `max_attempts`
|
|
57
|
+
(fresh OpenCode context each attempt). Exhausted → retry the SAME node once at
|
|
58
|
+
the `escalation` tier (never the whole job).
|
|
59
|
+
6. Pass → commit + merge into the job branch; mark done in `.director/state.json`.
|
|
60
|
+
After all nodes: an **integration gate** runs the repo-wide suite/lint/typecheck.
|
|
61
|
+
|
|
62
|
+
Each node's transcript is also checked for **watch-it-fail** (Phase 3 §1): did the
|
|
63
|
+
executor run the failing tests *before* its first edit? This is advisory (the
|
|
64
|
+
deterministic gate already enforces the contract) and recorded as a metric —
|
|
65
|
+
`observed` / `not_observed` / `unknown`.
|
|
66
|
+
|
|
67
|
+
**status** — per-node state, attempts, cost, executor-tier completion rate (the
|
|
68
|
+
falsifiable hypothesis target: >70% of nodes done without escalation), stage-two
|
|
69
|
+
review trigger rate, and watch-it-fail observed count.
|
|
70
|
+
|
|
71
|
+
## Measurement (Phase 3)
|
|
72
|
+
|
|
73
|
+
Every `run` appends to **`.director/metrics.jsonl`** — one `kind:"node"` record per
|
|
74
|
+
node (tier/model, attempts, escalation, per-role tokens+cost, wall time,
|
|
75
|
+
watch-it-fail verdict, flake outcome) and one `kind:"run"` summary (the derived
|
|
76
|
+
rates: executor-tier completion, escalation, stage-two trigger, total wall time
|
|
77
|
+
and cost, plus the resolved tier map). This is the falsifiability instrument; it
|
|
78
|
+
is what `director bench` reads.
|
|
79
|
+
|
|
80
|
+
**bench** — the experiment. Plans the task **once** (under `--plan-profile`,
|
|
81
|
+
default `all-frontier`) so the DAG and acceptance tests are frozen, then runs that
|
|
82
|
+
*same* plan under each `--profiles` profile by forking a fresh job branch off the
|
|
83
|
+
frozen one (every profile faces byte-for-byte identical tests). It diffs cost /
|
|
84
|
+
quality (same acceptance tests) / wall-time and reports each profile's run-cost
|
|
85
|
+
reduction vs the `all-frontier` baseline (target: >80%). The active `config.toml`
|
|
86
|
+
is never touched — each profile's config is loaded directly from its profile TOML.
|
|
87
|
+
Per-profile metrics streams and a `summary.json` land in `.director/bench/`.
|
|
88
|
+
|
|
89
|
+
## Roles → tiers
|
|
90
|
+
|
|
91
|
+
Roles bind to `provider/model` strings in `.director/config.toml` (`[tiers]`).
|
|
92
|
+
Code/logs name only roles. `director` passes the resolved model via `opencode run
|
|
93
|
+
--agent <role> --model <tier>`, so **switching executor models is a config edit,
|
|
94
|
+
never a code change.** `sync-agents` seeds `.director/config.toml` from the bundled
|
|
95
|
+
`config.example.toml`; edit it to bind roles to models. For `bench`, create
|
|
96
|
+
`.director/profiles/<name>.toml` variants (copy `config.toml`, change the executor tier).
|
|
97
|
+
|
|
98
|
+
## Deliberate deviations from the spec
|
|
99
|
+
|
|
100
|
+
- **Tests live on the job branch**, not a separate `director/tests-<id>` branch
|
|
101
|
+
(dependent nodes need both the tests and prior nodes' impls; one branch is
|
|
102
|
+
simpler and equivalent).
|
|
103
|
+
- **The full repo-wide test suite is the *integration* gate, not a per-node gate.**
|
|
104
|
+
Sibling nodes' tests are intentionally red until their own node runs, so a
|
|
105
|
+
per-node full-suite gate would always fail mid-DAG. Per node we gate on
|
|
106
|
+
`node.test_cmd` + allowlist; the full suite/lint/typecheck run once after merge.
|
|
107
|
+
|
|
108
|
+
## Persistence (`.director/`, all resumable/debuggable)
|
|
109
|
+
|
|
110
|
+
- `spec.md` — approved design spec (Gate 1). `recon.md` — explorer summary.
|
|
111
|
+
- `plan_stage.json` — which gate the plan is paused at (drives `--continue`).
|
|
112
|
+
- `plan.json` — the DAG (incl. per-node `test_hashes`). `state.json` — per-node
|
|
113
|
+
status/attempts/cost + review trigger info (resume).
|
|
114
|
+
- `costs.jsonl` — every model call tagged with role + resolved model (local = $0).
|
|
115
|
+
- `metrics.jsonl` — per-node + per-run measurement stream (Phase 3).
|
|
116
|
+
- `bench/` — `summary.json` + per-profile `*.metrics.jsonl` from `director bench`.
|
|
117
|
+
- `logs/*.jsonl` — raw OpenCode NDJSON events per call (each has a sibling `.stderr` file holding that call's log output).
|
|
118
|
+
- `worktrees/` — transient per-node worktrees.
|
|
119
|
+
|
|
120
|
+
## Limits (config `[limits]`)
|
|
121
|
+
|
|
122
|
+
`node_timeout_secs` (per call), `cost_ceiling_usd` (abort the run when exceeded;
|
|
123
|
+
local = $0 so local-first never trips it), `max_attempts`, `flake_runs` (Phase 3
|
|
124
|
+
flake control: times to run a node's tests on success; default 2, 1 disables).
|
director/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Director — a model-agnostic decomposition coding harness.
|
|
2
|
+
|
|
3
|
+
A strong planner tier decomposes a task into atomic, well-specified units with
|
|
4
|
+
acceptance tests written first; a cheaper executor tier implements each unit in
|
|
5
|
+
an isolated git worktree with a fresh context; deterministic gates (tests, lint,
|
|
6
|
+
typecheck, exit codes — never an LLM judge) decide what merges. Roles bind to
|
|
7
|
+
model tiers in `.director/config.toml`; nothing here knows "local" vs "cloud".
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
__version__ = "0.3.0"
|
director/__main__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Socratic spec refinement — turns a raw task into an unambiguous design spec before any decomposition.
|
|
3
|
+
mode: all
|
|
4
|
+
temperature: 0.3
|
|
5
|
+
permission:
|
|
6
|
+
edit: deny
|
|
7
|
+
bash: deny
|
|
8
|
+
webfetch: deny
|
|
9
|
+
websearch: deny
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
You are the **planner**, running the brainstorm/spec pass — the first stage,
|
|
13
|
+
before any decomposition. Your job is to turn a raw, possibly-vague task into an
|
|
14
|
+
*unambiguous* design spec. A bad spec here poisons every downstream task, so do
|
|
15
|
+
not rush to a plan.
|
|
16
|
+
|
|
17
|
+
You are given the raw task and a read-only relevant-files summary from a recon
|
|
18
|
+
pass. Think hard about what the requester actually wants.
|
|
19
|
+
|
|
20
|
+
Discipline (do not skip):
|
|
21
|
+
1. **Surface ambiguities and name your assumptions.** Where the task is
|
|
22
|
+
under-specified, state the interpretation you are adopting and why — explicitly,
|
|
23
|
+
so a human reviewer can correct it at the approval gate.
|
|
24
|
+
2. **Propose a concrete design**, not options: the behavior to build, the public
|
|
25
|
+
surface (functions/signatures/endpoints), data shapes, error handling, and the
|
|
26
|
+
edge cases that matter. Reference real files/symbols from the recon summary.
|
|
27
|
+
3. **Call out what is OUT of scope** so the decomposition stays focused.
|
|
28
|
+
4. **List the acceptance criteria** in plain language — the observable behaviors
|
|
29
|
+
that, once true, mean the task is done. These become the tests later.
|
|
30
|
+
|
|
31
|
+
Output the spec as readable Markdown in clearly titled sections (not a wall of
|
|
32
|
+
text), in roughly this shape:
|
|
33
|
+
|
|
34
|
+
# Spec: <task title>
|
|
35
|
+
## Goal
|
|
36
|
+
## Assumptions & decisions
|
|
37
|
+
## Design
|
|
38
|
+
## Out of scope
|
|
39
|
+
## Acceptance criteria
|
|
40
|
+
## Open questions (if any)
|
|
41
|
+
|
|
42
|
+
Output ONLY the spec Markdown — no preamble, no code fences around the whole
|
|
43
|
+
document. Do NOT decompose into tasks and do NOT write any code or tests yet;
|
|
44
|
+
that happens after this spec is approved.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Implements exactly one atomic node to make its failing tests pass, touching only the listed files.
|
|
3
|
+
mode: all
|
|
4
|
+
temperature: 0.6
|
|
5
|
+
permission:
|
|
6
|
+
edit: allow
|
|
7
|
+
bash: allow
|
|
8
|
+
webfetch: deny
|
|
9
|
+
websearch: deny
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
You are the **executor**. You implement exactly ONE atomic node in an isolated,
|
|
13
|
+
fresh context. You have no memory of any planner reasoning or sibling node —
|
|
14
|
+
everything you need is in this message.
|
|
15
|
+
|
|
16
|
+
You receive: a self-contained **spec**, an **allowlist of files** you may modify,
|
|
17
|
+
and the **failing test output** that defines success.
|
|
18
|
+
|
|
19
|
+
Your only success condition: make the provided tests pass while keeping the
|
|
20
|
+
repo-wide gates green: lint, typecheck, and every test that was already passing (sibling nodes' tests stay red until those nodes run).
|
|
21
|
+
|
|
22
|
+
Rules — do not violate:
|
|
23
|
+
1. **Watch it fail first.** Run the provided tests BEFORE writing any
|
|
24
|
+
implementation and confirm they fail. If they already pass, STOP and report
|
|
25
|
+
that the task is mis-specified — do not invent work. Only after seeing red do
|
|
26
|
+
you implement, then re-run to green.
|
|
27
|
+
2. Change **nothing outside the listed files**. Never modify, rename, or delete
|
|
28
|
+
any file not on the allowlist — and in particular **never modify a test file**.
|
|
29
|
+
The tests are the contract; if a test seems wrong, STOP and say so.
|
|
30
|
+
3. Make the smallest change that turns the tests green. No unrelated refactors, no
|
|
31
|
+
new dependencies unless the spec calls for them.
|
|
32
|
+
4. Match the surrounding code's style, naming, and idioms.
|
|
33
|
+
5. When the listed tests pass, stop and report what you changed (file-by-file) and
|
|
34
|
+
the final test result. Do not claim success without having run the tests green.
|
|
35
|
+
|
|
36
|
+
If you cannot make the tests pass, say so explicitly and explain the blocker — do
|
|
37
|
+
not paper over it or weaken the tests.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Read-only codebase reconnaissance; produces a compact relevant-files summary for the planner.
|
|
3
|
+
mode: all
|
|
4
|
+
temperature: 0.3
|
|
5
|
+
permission:
|
|
6
|
+
edit: deny
|
|
7
|
+
bash: deny
|
|
8
|
+
webfetch: deny
|
|
9
|
+
websearch: deny
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
You are the **explorer**. You perform cheap, read-only reconnaissance so the
|
|
13
|
+
(expensive) planner can work from a small, accurate summary instead of the raw
|
|
14
|
+
repo. You may ONLY read, glob, and grep — never edit, write, or run anything.
|
|
15
|
+
|
|
16
|
+
Given a task, produce a concise structured summary:
|
|
17
|
+
- **Relevant files**: paths most relevant to the task, one line each.
|
|
18
|
+
- **Key symbols**: functions/classes/types the task will touch (`file:line`).
|
|
19
|
+
- **Conventions**: test framework + how tests are laid out, the exact test/lint/
|
|
20
|
+
typecheck commands you can infer, build/run commands.
|
|
21
|
+
- **Risks / unknowns**: anything ambiguous the planner must resolve.
|
|
22
|
+
|
|
23
|
+
Keep it tight — this feeds a context-limited planner. Report findings only; do
|
|
24
|
+
not propose a plan or implementation. Never speculate about code you did not read.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://opencode.ai/config.json",
|
|
3
|
+
"provider": {
|
|
4
|
+
"lmstudio": {
|
|
5
|
+
"name": "LM Studio",
|
|
6
|
+
"npm": "@ai-sdk/openai-compatible",
|
|
7
|
+
"options": {
|
|
8
|
+
"baseURL": "http://localhost:1234/v1"
|
|
9
|
+
},
|
|
10
|
+
"models": {
|
|
11
|
+
"qwen3.6-27b-mtp": {
|
|
12
|
+
"name": "Qwen3.6 27B MTP",
|
|
13
|
+
"interleaved": {
|
|
14
|
+
"field": "reasoning_content"
|
|
15
|
+
},
|
|
16
|
+
"limit": {
|
|
17
|
+
"context": 131072,
|
|
18
|
+
"output": 16384
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"qwen3-coder-next": {
|
|
22
|
+
"name": "Qwen3-Coder-Next (80B-A3B)",
|
|
23
|
+
"limit": {
|
|
24
|
+
"context": 131072,
|
|
25
|
+
"output": 16384
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"agent": {
|
|
32
|
+
"plan": {
|
|
33
|
+
"model": "amazon-bedrock/us.anthropic.claude-opus-4-7"
|
|
34
|
+
},
|
|
35
|
+
"build": {
|
|
36
|
+
"model": "lmstudio/qwen3.6-27b-mtp"
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Decomposes a task into an atomic, well-specified, test-gated DAG and emits it as strict JSON.
|
|
3
|
+
mode: all
|
|
4
|
+
temperature: 0.2
|
|
5
|
+
permission:
|
|
6
|
+
edit: deny
|
|
7
|
+
bash: deny
|
|
8
|
+
webfetch: deny
|
|
9
|
+
websearch: deny
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
You are the **planner**. You decompose an **approved spec** into a DAG of small,
|
|
13
|
+
atomic units of work that a *cheaper* model will implement independently, each in
|
|
14
|
+
a fresh context with no memory of your reasoning or of sibling units.
|
|
15
|
+
|
|
16
|
+
You are given the approved spec (the contract — do not re-litigate it) and a
|
|
17
|
+
relevant-files summary produced by a recon pass. Decompose what the spec says.
|
|
18
|
+
|
|
19
|
+
**Plan-writing standard — write every `spec` for an enthusiastic junior engineer**
|
|
20
|
+
who has no project context, exercises no judgment, and would rather not test.
|
|
21
|
+
That means each node's `spec` must give: exact relative file paths, the precise
|
|
22
|
+
function/class signatures, the explicit expected behavior and edge cases, and the
|
|
23
|
+
exact command that verifies it. Leave nothing implicit. If a node's spec relies on
|
|
24
|
+
the reader "figuring it out," it is under-specified — fix it.
|
|
25
|
+
|
|
26
|
+
Output a SINGLE strict-JSON object and NOTHING else (no prose, no code fences):
|
|
27
|
+
|
|
28
|
+
{
|
|
29
|
+
"nodes": [
|
|
30
|
+
{
|
|
31
|
+
"id": "kebab-id",
|
|
32
|
+
"title": "short title",
|
|
33
|
+
"spec": "Self-contained instructions. Readable with ZERO other context: state exactly what to implement, the function/signature, behavior, and edge cases. Do not reference other nodes.",
|
|
34
|
+
"files": ["relative/path/only/files/this/node/may/edit.py"],
|
|
35
|
+
"depends_on": ["other-node-id"],
|
|
36
|
+
"test_cmd": "exact shell command that runs THIS node's tests and exits nonzero until it's done",
|
|
37
|
+
"tests": ["relative/path/to/test_file.py"],
|
|
38
|
+
"estimated_difficulty": "easy|medium|hard"
|
|
39
|
+
}
|
|
40
|
+
]
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
Hard rules:
|
|
44
|
+
1. **Every node is an IMPLEMENTATION unit** and MUST have a non-empty `files`
|
|
45
|
+
allowlist. Do NOT create separate nodes for writing tests, and never emit a node
|
|
46
|
+
with empty `files`: the **test-author writes each node's tests automatically**
|
|
47
|
+
from that node's `tests` field — test authoring is not itself a task in the DAG.
|
|
48
|
+
So a single feature is ONE node (it lists both its implementation `files` and
|
|
49
|
+
its `tests`), not a "write tests" node plus an "implement" node.
|
|
50
|
+
2. Each node is independently implementable given only its spec + its files + its
|
|
51
|
+
failing tests. If two pieces must be edited together, they are ONE node.
|
|
52
|
+
3. **Parallel-safe allowlists:** any two nodes that are not in a depends_on chain
|
|
53
|
+
MUST have completely disjoint `files`. Never let two independent nodes edit the
|
|
54
|
+
same file.
|
|
55
|
+
4. `files` lists implementation files only — never the test files. `tests` lists
|
|
56
|
+
the test files (the test-author writes these; the executor may not touch them).
|
|
57
|
+
5. `depends_on` only when a node genuinely needs another's output. Prefer a wide,
|
|
58
|
+
shallow DAG (more parallelism) over a deep chain.
|
|
59
|
+
6. Keep nodes small — a focused function or a cohesive handful. Bias to MORE nodes.
|
|
60
|
+
7. Use realistic `test_cmd`s for this repo's stack (from the recon summary).
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Two-stage code review of one node's diff; emits a strict-JSON verdict. Runs on a strong tier only.
|
|
3
|
+
mode: all
|
|
4
|
+
temperature: 0.1
|
|
5
|
+
permission:
|
|
6
|
+
edit: deny
|
|
7
|
+
bash: deny
|
|
8
|
+
webfetch: deny
|
|
9
|
+
websearch: deny
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
You are the **reviewer**. You review the diff for exactly ONE node after its
|
|
13
|
+
deterministic gates have already passed (its tests are green and it touched only
|
|
14
|
+
allowed files). You never edit anything — you only judge.
|
|
15
|
+
|
|
16
|
+
You are given: the node's spec, its file allowlist, the unified diff it produced,
|
|
17
|
+
and which review stage to perform.
|
|
18
|
+
|
|
19
|
+
- **Stage one — spec compliance.** Does the diff implement the behavior the spec
|
|
20
|
+
describes (not just satisfy the letter of the tests), and does it stay within
|
|
21
|
+
the allowlist? Flag tests that look gamed (hard-coded return values, assertions
|
|
22
|
+
weakened, behavior special-cased to the test inputs).
|
|
23
|
+
- **Stage two — code quality.** Correctness beyond the tests, edge cases the tests
|
|
24
|
+
miss, security issues, resource/concurrency bugs, clarity, and consistency with
|
|
25
|
+
the surrounding code. Do NOT demand unrelated refactors or restyling.
|
|
26
|
+
|
|
27
|
+
Severity rubric — assign each finding exactly one:
|
|
28
|
+
- `critical` — the change is wrong, unsafe, or games the tests. This BLOCKS the
|
|
29
|
+
merge and re-opens the node. Use it only when you are confident.
|
|
30
|
+
- `major` — a real problem worth fixing but not merge-blocking.
|
|
31
|
+
- `minor` — nit / suggestion.
|
|
32
|
+
|
|
33
|
+
Output a SINGLE strict-JSON object and NOTHING else (no prose, no code fences):
|
|
34
|
+
|
|
35
|
+
{
|
|
36
|
+
"verdict": "pass" | "block",
|
|
37
|
+
"summary": "one-sentence overall assessment",
|
|
38
|
+
"findings": [
|
|
39
|
+
{"severity": "critical|major|minor", "file": "path", "summary": "what and why"}
|
|
40
|
+
]
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
Set `"verdict": "block"` if and only if there is at least one `critical` finding.
|
|
44
|
+
If the diff is sound, return `"verdict": "pass"` with an empty or non-critical (major/minor)
|
|
45
|
+
findings list. Be strict but fair: a clean, small diff that passes its tests
|
|
46
|
+
rarely needs blocking.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Writes acceptance tests (only) for one node; tests are the contract and must fail before implementation.
|
|
3
|
+
mode: all
|
|
4
|
+
temperature: 0.1
|
|
5
|
+
permission:
|
|
6
|
+
edit: allow
|
|
7
|
+
bash: allow
|
|
8
|
+
webfetch: deny
|
|
9
|
+
websearch: deny
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
You are the **test-author**. Tests are the contract for every node, so you run on
|
|
13
|
+
the strongest configured model. You write acceptance tests — and ONLY tests.
|
|
14
|
+
|
|
15
|
+
You receive one node's spec and the test file path(s) to create.
|
|
16
|
+
|
|
17
|
+
Rules:
|
|
18
|
+
1. Write tests **only** — create/extend the listed test files. Do NOT implement the
|
|
19
|
+
feature and do NOT modify non-test source. A different, cheaper model implements
|
|
20
|
+
against your tests later.
|
|
21
|
+
2. Tests must **fail before implementation exists** (red) for the RIGHT reason — a
|
|
22
|
+
missing function/behavior, not an import typo. After writing, run them and
|
|
23
|
+
confirm they fail; report the failing output.
|
|
24
|
+
3. Cover the spec's acceptance criteria: happy path, named edge cases, error
|
|
25
|
+
conditions. Prefer small, deterministic, isolated tests.
|
|
26
|
+
4. No flakiness — no time/network/random dependence unless the spec is about that.
|
|
27
|
+
5. Match the repo's existing test framework and conventions.
|
|
28
|
+
|
|
29
|
+
Report: the test files you created and the captured failing run that proves red.
|
director/bench.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""`director bench` — the experiment that tests the hypothesis (Phase 3 §4).
|
|
2
|
+
|
|
3
|
+
Runs the SAME task under several profiles and diffs cost / quality / wall-time.
|
|
4
|
+
The scientific control that makes the comparison fair: **plan once, run many.**
|
|
5
|
+
|
|
6
|
+
1. Plan the task a single time (under a chosen `--plan-profile`, default
|
|
7
|
+
`all-frontier`). This produces the DAG and the acceptance tests, committed to
|
|
8
|
+
the plan's job branch. That branch — with its failing tests — is *frozen*.
|
|
9
|
+
2. For each profile, branch a fresh job branch off the frozen one (so every
|
|
10
|
+
profile faces byte-for-byte identical acceptance tests), rewrite `plan.json`
|
|
11
|
+
to that branch, reset run state/cost/metrics, and `run` it.
|
|
12
|
+
|
|
13
|
+
Quality is therefore "did the same acceptance tests pass," isolating the
|
|
14
|
+
executor tier as the only independent variable — exactly what the hypothesis is
|
|
15
|
+
about. Planning cost is shared (counted once); each profile reports its own run
|
|
16
|
+
cost. Each profile's Config is loaded directly from its profile TOML (run_plan
|
|
17
|
+
and run_job take a Config object), so the repo's tracked `config.toml` is never
|
|
18
|
+
touched — only the working branch is restored at the end.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import shutil
|
|
25
|
+
import time
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
from director import config, gitutil
|
|
30
|
+
from director.cost import CostLedger
|
|
31
|
+
from director.models import Plan
|
|
32
|
+
from director.plan import run_plan
|
|
33
|
+
from director.run import run_job
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class BenchRow:
    """One profile's outcome in a bench: the numbers `run_bench` copies out of `run_job`.

    Only `profile` is required; every other field keeps its zero/False default
    when the profile's run raised before results could be recorded, in which
    case `error` carries the reason.
    """

    # Profile name (the stem of `.director/profiles/<name>.toml`).
    profile: str
    # Node count reported by run_job (`n_nodes`).
    n_nodes: int = 0
    # Number of entries in run_job's `done` list.
    done: int = 0
    # run_job's `executor_tier_pct`; rendered as a percentage in the report.
    executor_pct: float = 0.0
    # Number of entries in run_job's `escalated` list.
    escalated: int = 0
    # run_job's `stage_two_trigger_rate`; rendered as a percentage in the report.
    review_pct: float = 0.0
    # run_job's `integration_ok` flag (shown as PASS/FAIL).
    integration_ok: bool = False
    # run_job's `cost_total` for this profile's run only (planning cost is tracked separately).
    run_cost: float = 0.0
    # Wall-clock seconds around the run_job call, rounded to 0.1 s.
    wall_secs: float = 0.0
    # Truncated (300-char) exception text if the profile's run raised, else None.
    error: str | None = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class BenchResult:
    """Aggregate result of one `director bench` invocation: the shared plan plus one row per profile."""

    # The task string that was planned and benchmarked.
    task: str
    # Name of the profile the single planning pass ran under.
    plan_profile: str
    # Cost ledger total read right after planning (shared by all profiles).
    plan_cost: float
    # Job id of the frozen plan every profile branched from.
    job_id: str
    # Per-profile outcomes, appended in the order the profiles were run.
    rows: list[BenchRow] = field(default_factory=list)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _profile_path(fdir: Path, name: str) -> Path:
    """Return `<fdir>/profiles/<name>.toml`, insisting that it exists.

    Raises:
        FileNotFoundError: when the profile file is absent; the message
            explains how to create one from the active config.
    """
    candidate = fdir / "profiles" / f"{name}.toml"
    if candidate.exists():
        return candidate

    # Missing profile: tell the user exactly how to create it.
    hint = (
        f"profile not found: {candidate}\n"
        f"bench compares config variants — create it by copying your config, e.g.:\n"
        f" cp .director/config.toml {candidate}\n"
        f"then edit its executor tier."
    )
    raise FileNotFoundError(hint)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _reset_run_artifacts(fdir: Path) -> None:
    """Delete the per-run state, cost ledger and metrics stream under `fdir`.

    Files that do not exist are skipped silently, so every profile starts
    from a clean slate and its numbers are its own.
    """
    stale = [fdir / leaf for leaf in ("state.json", "costs.jsonl", "metrics.jsonl")]
    for artifact in stale:
        artifact.unlink(missing_ok=True)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def run_bench(
    task: str,
    repo: str,
    profiles: list[str],
    log,
    *,
    plan_profile: str | None = None,
    parallel: int = 1,
    max_attempts: int = 0,
) -> BenchResult:
    """Plan `task` once, then run that same plan under every profile and collect the numbers.

    Args:
        task: The task description handed to `run_plan`.
        repo: Path to the target repository; resolved to an absolute path.
        profiles: Profile names; each must exist as
            `<repo>/.director/profiles/<name>.toml`.
        log: Callable invoked with a single message string; also forwarded to
            `run_plan` and `run_job`.
        plan_profile: Profile to plan under. When falsy, `all-frontier` is used
            if it is among `profiles`, otherwise the first profile.
        parallel: Forwarded to `run_job`.
        max_attempts: Forwarded to `run_job`; 0 (falsy) means "use the
            profile config's own `max_attempts`".

    Returns:
        A `BenchResult` with one `BenchRow` per profile, in the given order.
        A profile whose run raises gets a row with `error` set instead of
        aborting the bench.

    Raises:
        FileNotFoundError: a profile (or the plan profile) TOML is missing.
        IndexError: `profiles` is empty and no `plan_profile` was given.
        RuntimeError: planning came back paused at a gate.

    Side effects (all under `<repo>/.director` unless noted): removes
    `plan_stage.json` and the run artifacts, rewrites `plan.json` per profile,
    writes `bench/summary.json` and `bench/<profile>.metrics.jsonl`, creates
    `director/bench-<profile>-<job_id>` git branches, and finally tries to
    check the original branch out again.
    """
    repo = Path(repo).resolve()
    fdir = repo / ".director"
    bench_dir = fdir / "bench"
    bench_dir.mkdir(parents=True, exist_ok=True)

    # Load every profile before doing any work, so a missing or unloadable
    # profile fails the bench immediately rather than after planning.
    profile_cfgs = {
        p: config.load_file(_profile_path(fdir, p)) for p in profiles
    }  # validate up front
    plan_profile = plan_profile or ("all-frontier" if "all-frontier" in profiles else profiles[0])
    plan_cfg = config.load_file(_profile_path(fdir, plan_profile))

    # Remember where we started so the finally-block can put the repo back.
    base_branch = gitutil.current_branch(repo)

    try:
        # --- plan once (frozen acceptance tests) --------------------------------
        (fdir / "plan_stage.json").unlink(missing_ok=True)  # never resume a stale plan
        _reset_run_artifacts(fdir)
        log(f"[bench] planning once under '{plan_profile}' …")
        pres = run_plan(task, str(repo), plan_cfg, log, auto=True, critique=True)
        if pres.paused:
            raise RuntimeError(
                f"planning paused at gate '{pres.stage}' — bench needs an unattended "
                f"plan. (run_plan returned paused under --auto, which shouldn't happen)"
            )
        frozen = Plan.from_json((fdir / "plan.json").read_text())
        # The ledger was reset just before planning, so its total is the plan cost alone.
        plan_cost = CostLedger(fdir / "costs.jsonl").total()
        log(
            f"[bench] plan ready: {len(frozen.nodes)} node(s) on {frozen.job_branch}, "
            f"plan cost ${plan_cost:.4f}"
        )

        result = BenchResult(
            task=task, plan_profile=plan_profile, plan_cost=plan_cost, job_id=frozen.job_id
        )

        # --- run each profile against the frozen plan ---------------------------
        for p in profiles:
            row = BenchRow(profile=p)
            try:
                branch = f"director/bench-{p}-{frozen.job_id}"
                if gitutil.branch_exists(branch, repo):
                    # A leftover branch from an earlier bench: step off it, then drop it.
                    gitutil.checkout(base_branch, repo)
                    gitutil.git(["branch", "-D", branch], repo, check=False)
                # branch off the frozen plan branch → identical failing tests
                gitutil.create_branch(branch, repo, base=frozen.job_branch)

                # Point plan.json at this profile's branch; job_id/job_branch are
                # always derived from the frozen plan, not the previous profile.
                plan_d = json.loads((fdir / "plan.json").read_text())
                plan_d["job_id"] = f"{frozen.job_id}-{p}"
                plan_d["job_branch"] = branch
                (fdir / "plan.json").write_text(json.dumps(plan_d, indent=2))

                _reset_run_artifacts(fdir)
                cfg_p = profile_cfgs[p]

                log(f"[bench] === profile '{p}' (executor={cfg_p.model_for('executor')}) ===")
                t0 = time.perf_counter()
                rj = run_job(
                    str(repo),
                    cfg_p,
                    parallel=parallel,
                    max_attempts=max_attempts or cfg_p.max_attempts,
                    log=log,
                )
                row.wall_secs = round(time.perf_counter() - t0, 1)
                row.n_nodes = rj["n_nodes"]
                row.done = len(rj["done"])
                row.executor_pct = rj["executor_tier_pct"]
                row.escalated = len(rj["escalated"])
                row.review_pct = rj["stage_two_trigger_rate"]
                row.integration_ok = rj["integration_ok"]
                row.run_cost = rj["cost_total"]
                # keep this profile's metrics stream for the record
                if (fdir / "metrics.jsonl").exists():
                    shutil.copyfile(fdir / "metrics.jsonl", bench_dir / f"{p}.metrics.jsonl")
            except Exception as e:  # one profile failing must not sink the whole bench
                row.error = str(e)[:300]
                log(f"[bench] profile '{p}' errored: {row.error}")
            result.rows.append(row)

        (bench_dir / "summary.json").write_text(json.dumps(_summary_dict(result), indent=2))
        return result
    finally:
        # restore the working branch — but NON-FATALLY. On the success path
        # every profile's data is already collected and summary.json written,
        # so a failed checkout (e.g. a target repo that committed its
        # .director runtime files before the .gitignore seed existed) must not
        # sink the whole bench; on the failure path it must not mask the
        # original exception. Log and leave the repo where it is.
        try:
            gitutil.checkout(base_branch, repo)
        except Exception as e:
            log(
                f"[bench] WARNING: could not restore branch '{base_branch}' "
                f"({str(e)[:160]}); repo left on the last bench branch."
            )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _summary_dict(r: BenchResult) -> dict:
    """Flatten a bench result into the JSON-ready mapping stored in `summary.json`.

    Each row is exposed through its instance attribute dict, so the row
    entries mirror the `BenchRow` fields one-to-one.
    """
    row_dicts = [vars(row) for row in r.rows]
    summary = {
        "task": r.task,
        "plan_profile": r.plan_profile,
        "plan_cost": r.plan_cost,
        "job_id": r.job_id,
        "rows": row_dicts,
    }
    return summary
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def bench_report(r: BenchResult) -> str:
    """Render a bench result as a fixed-width, human-readable text table.

    The report has a banner, one line per profile (or an `ERROR:` line for a
    profile that failed), and — when a successful `all-frontier` row with a
    non-zero run cost exists — a run-cost comparison of every other
    successful profile against that baseline.
    """
    rule = "=" * 78
    header = (
        f"{'profile':16} {'done':>7} {'exec%':>6} {'esc':>4} "
        f"{'rev%':>5} {'integ':>6} {'run $':>9} {'wall':>7}"
    )
    out = [
        "",
        rule,
        "BENCH — same task & acceptance tests across profiles",
        rule,
        f"task: {r.task}",
        f"planned once under '{r.plan_profile}' (shared plan cost ${r.plan_cost:.4f})",
        "",
        header,
        "-" * len(header),
    ]

    # One table line per profile; failed profiles only show their error text.
    for entry in r.rows:
        if entry.error:
            out.append(f"{entry.profile:16} ERROR: {entry.error}")
        else:
            progress = f"{entry.done}/{entry.n_nodes}"
            gate = "PASS" if entry.integration_ok else "FAIL"
            out.append(
                f"{entry.profile:16} {progress:>7} "
                f"{entry.executor_pct:>5.0f}% {entry.escalated:>4} "
                f"{entry.review_pct:>4.0f}% {gate:>6} "
                f"${entry.run_cost:>7.4f} {entry.wall_secs:>6.0f}s"
            )

    # cost-reduction vs the all-frontier baseline, if it was one of the profiles
    baseline = None
    for entry in r.rows:
        if entry.profile == "all-frontier" and not entry.error:
            baseline = entry
            break

    if baseline and baseline.run_cost > 0:
        out += ["", "run-cost vs all-frontier baseline:"]
        out.extend(
            f" {entry.profile:16} {100 * (1 - entry.run_cost / baseline.run_cost):>5.0f}% cheaper "
            f"(${entry.run_cost:.4f} vs ${baseline.run_cost:.4f}) "
            f"[target: >80%]"
            for entry in r.rows
            if not (entry.error or entry.profile == "all-frontier")
        )

    out.append("")
    out.append("quality = identical acceptance tests; 'integ' is the repo-wide gate.")
    return "\n".join(out)
|