director-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
director/cli.py ADDED
@@ -0,0 +1,166 @@
1
+ """director CLI — plan | run | status | bench | sync-agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from director import __version__, config
9
+ from director.report import run_summary, status_table
10
+ from director.setup import sync_agents
11
+
12
+
13
+ def _log(msg: str) -> None:
14
+ print(msg, file=sys.stderr, flush=True)
15
+
16
+
17
def cmd_plan(args) -> int:
    """Handle `director plan`.

    Loads the config for `args.repo`, forwards the task and the plan flags to
    `run_plan`, prints the returned result's `message` on stdout, and returns 0.
    """
    # Imported inside the handler, so the planner module is loaded only when
    # this subcommand actually runs.
    from director.plan import run_plan

    cfg = config.load(args.repo)
    options = {
        "auto": args.auto,
        "critique": not args.no_critique,
        # `continue` is a Python keyword, so the attribute must be fetched by name.
        "cont": getattr(args, "continue"),
    }
    outcome = run_plan(args.task, args.repo, cfg, _log, **options)
    print(outcome.message)
    return 0
32
+
33
+
34
def cmd_run(args) -> int:
    """Handle `director run`.

    Executes the job via `run_job`, prints `run_summary(result)` on stdout, and
    returns 1 when any node failed or the integration gate did not pass,
    otherwise 0.
    """
    from director.run import run_job

    cfg = config.load(args.repo)
    # The flag defaults to 0, which is falsy, so an unset flag falls back to
    # the configured limit.
    attempts = args.max_attempts or cfg.max_attempts
    outcome = run_job(
        args.repo,
        cfg,
        parallel=args.parallel,
        max_attempts=attempts,
        log=_log,
    )
    print(run_summary(outcome))
    if outcome["failed"]:
        return 1
    return 0 if outcome["integration_ok"] else 1
48
+
49
+
50
def cmd_status(args) -> int:
    """Handle `director status`: print the status table for `args.repo`; return 0."""
    table = status_table(args.repo)
    print(table)
    return 0
53
+
54
+
55
def cmd_bench(args) -> int:
    """Handle `director bench`.

    Parses the comma-separated `--profiles` value, runs the benchmark via
    `run_bench`, and prints `bench_report(result)` on stdout. Returns 2 (after
    logging an error) when no profile name remains after parsing, otherwise 0.
    """
    from director.bench import bench_report, run_bench

    # Trim each comma-separated item and drop the empty ones ("a, ,b" -> a, b).
    profiles = []
    for raw in args.profiles.split(","):
        name = raw.strip()
        if name:
            profiles.append(name)
    if len(profiles) == 0:
        _log("director: error: --profiles is empty")
        return 2

    outcome = run_bench(
        args.task,
        args.repo,
        profiles,
        _log,
        plan_profile=args.plan_profile,
        parallel=args.parallel,
        max_attempts=args.max_attempts or 0,
    )
    print(bench_report(outcome))
    return 0
73
+
74
+
75
def cmd_sync_agents(args) -> int:
    """Handle `director sync-agents`: list what `sync_agents` returned; return 0."""
    written = sync_agents(args.repo)
    listing = "\n ".join(written)
    print(f"Synced:\n {listing}")
    return 0
79
+
80
+
81
def build_parser() -> argparse.ArgumentParser:
    """Build the `director` argument parser and its five subcommands.

    A subcommand is required. Each subparser stores its handler under `func`
    (via `set_defaults`), so the caller dispatches with `args.func(args)`.
    """
    parser = argparse.ArgumentParser(prog="director", description=__doc__)
    parser.add_argument("--version", action="version", version=f"director {__version__}")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # --- plan ----------------------------------------------------------------
    plan_cmd = commands.add_parser(
        "plan", help="brainstorm → spec → test-gated DAG, with approval gates"
    )
    plan_cmd.add_argument(
        "task",
        nargs="?",
        default=None,
        help="the task description (omit with --continue)",
    )
    plan_cmd.add_argument("--repo", default=".", help="target repo (default: .)")
    plan_cmd.add_argument(
        "--continue",
        action="store_true",
        dest="continue",
        help="resume after approving/editing the current gate artifact",
    )
    plan_cmd.add_argument(
        "--auto",
        action="store_true",
        help="planner self-critiques at each gate; no human pause",
    )
    plan_cmd.add_argument(
        "--no-critique",
        action="store_true",
        help="with --auto: gates auto-pass without even self-critique",
    )
    plan_cmd.set_defaults(func=cmd_plan)

    # --- run -----------------------------------------------------------------
    run_cmd = commands.add_parser("run", help="execute the planned DAG")
    run_cmd.add_argument("--repo", default=".")
    run_cmd.add_argument(
        "--parallel",
        type=int,
        default=1,
        help="max concurrent nodes (default 1; local single-model endpoints want 1)",
    )
    run_cmd.add_argument(
        "--max-attempts",
        type=int,
        default=0,
        help="executor attempts before escalation (default: config)",
    )
    run_cmd.set_defaults(func=cmd_run)

    # --- status --------------------------------------------------------------
    status_cmd = commands.add_parser("status", help="show per-node progress + cost")
    status_cmd.add_argument("--repo", default=".")
    status_cmd.set_defaults(func=cmd_status)

    # --- bench ---------------------------------------------------------------
    bench_cmd = commands.add_parser(
        "bench", help="run one task across profiles; diff cost/quality/wall-time"
    )
    bench_cmd.add_argument(
        "task", help="the task description (planned once, run per profile)"
    )
    bench_cmd.add_argument("--repo", default=".")
    bench_cmd.add_argument(
        "--profiles",
        required=True,
        help="comma-separated profile names, e.g. all-frontier,cheap-cloud,local-first",
    )
    bench_cmd.add_argument(
        "--plan-profile",
        default=None,
        help="profile used for the single shared planning pass "
        "(default: all-frontier if listed, else the first profile)",
    )
    bench_cmd.add_argument(
        "--parallel", type=int, default=1, help="max concurrent nodes per run"
    )
    bench_cmd.add_argument(
        "--max-attempts",
        type=int,
        default=0,
        help="executor attempts before escalation (default: per-profile config)",
    )
    bench_cmd.set_defaults(func=cmd_bench)

    # --- sync-agents ---------------------------------------------------------
    sync_cmd = commands.add_parser(
        "sync-agents", help="(re)install role agents into <repo>/.opencode"
    )
    sync_cmd.add_argument("--repo", default=".")
    sync_cmd.set_defaults(func=cmd_sync_agents)

    return parser
154
+
155
+
156
def main(argv=None) -> int:
    """Parse `argv`, dispatch to the chosen subcommand, and return its exit code.

    FileNotFoundError, ValueError and RuntimeError raised by a handler are
    reported as a one-line `director: error: ...` message on stderr and mapped
    to exit code 2. Any other exception propagates.
    """
    args = build_parser().parse_args(argv)
    try:
        code = args.func(args)
    except (FileNotFoundError, ValueError, RuntimeError) as exc:
        _log(f"director: error: {exc}")
        code = 2
    return code
163
+
164
+
165
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
@@ -0,0 +1,75 @@
1
+ # director configuration — copy to .director/config.toml and edit.
2
+ #
3
+ # `director sync-agents` drops this file at .director/config.toml for you if none
4
+ # exists. Everything director knows about a "model" comes from here; switching
5
+ # models is a config edit, never a code change.
6
+ #
7
+ # The example below is a **local-first** setup (executor on a local LM Studio
8
+ # endpoint, planning/tests on a cloud frontier model). Inline comments show how to
9
+ # turn it into a zero-local-infra "cheap-cloud" setup or an "all-frontier" baseline.
10
+
11
+ [tiers]
12
+ # Roles bound to resolved OpenCode model strings ("provider/model"). Code, prompts,
13
+ # and logs refer ONLY to these role names — never to a specific model.
14
+ planner = "amazon-bedrock/us.anthropic.claude-opus-4-7" # decomposition + DAG (use your strongest model)
15
+ test_author = "amazon-bedrock/us.anthropic.claude-opus-4-7" # tests are the contract → strongest
16
+ executor = "lmstudio/qwen3.6-27b-mtp" # implements each node. The cheap tier.
17
+ explorer = "lmstudio/qwen3.6-27b-mtp" # read-only recon (reuse the cheap tier)
18
+ reviewer = "amazon-bedrock/anthropic.claude-sonnet-4-6" # stage-two code review (NEVER the cheap/local tier)
19
+ escalation = "amazon-bedrock/anthropic.claude-sonnet-4-6" # per-task fallback when the executor exhausts attempts
20
+ #
21
+ # cheap-cloud (no local GPU): set executor/explorer to a low-cost cloud model and
22
+ # delete the [providers.local] block, e.g.
23
+ # executor = "openrouter/deepseek/deepseek-v4-pro"
24
+ # all-frontier baseline (expensive control): set executor = reviewer = escalation
25
+ # to the same frontier model as the planner.
26
+
27
+ # Only needed if a tier above points at a local OpenAI-compatible endpoint.
28
+ [providers.local]
29
+ base_url = "http://localhost:1234/v1" # e.g. LM Studio; load a model there first
30
+ api_key = "sk-noauth"
31
+
32
+ # Sampling overrides per role (optional; match the model card). Example for a
33
+ # Qwen3.6 "thinking" executor:
34
+ [sampling.executor]
35
+ temperature = 0.6
36
+ top_p = 0.95
37
+ top_k = 20
38
+
39
+ # Deterministic repo-wide gates. The exit code is the verdict — set these to your
40
+ # project's real commands. Leave a value empty ("") to skip that gate.
41
+ [gates]
42
+ test = "pytest -q"
43
+ lint = "ruff check ."
44
+ typecheck = "mypy ."
45
+
46
+ # Per-resolved-model pricing for cost accounting ($ / 1M tokens). Local = $0.
47
+ # Fill these from your provider's current price sheet; any model not listed is
48
+ # counted at $0.
49
+ [pricing."lmstudio/qwen3.6-27b-mtp"]
50
+ input = 0.0
51
+ output = 0.0
52
+ [pricing."amazon-bedrock/us.anthropic.claude-opus-4-7"]
53
+ input = 15.0
54
+ output = 75.0
55
+ [pricing."amazon-bedrock/anthropic.claude-sonnet-4-6"]
56
+ input = 3.0
57
+ output = 15.0
58
+
59
+ [limits]
60
+ node_timeout_secs = 900 # per model/gate call (15 min)
61
+ cost_ceiling_usd = 10.0 # abort the run if exceeded; 0 = no ceiling (local = $0 never trips it)
62
+ max_attempts = 3 # executor attempts before escalating a node one tier up
63
+ flake_runs = 2 # re-run a node's tests this many times on success (1 disables)
64
+
65
+ # Two-stage review. Stage one (spec compliance) is the always-on deterministic node
66
+ # gate. Stage two (code quality, reviewer tier) is cost-gated:
67
+ [review]
68
+ stage_two = true
69
+ stage_two_file_threshold = 3 # stage two fires when a node escalated OR its diff touched > N files
70
+ stage_one_llm = false # optional advisory explorer-tier compliance check (off: the gate already enforces it)
71
+
72
+ # --- director bench ---------------------------------------------------------
73
+ # To compare profiles with `director bench "<task>" --profiles a,b,c`, create
74
+ # .director/profiles/<name>.toml variants (copy this file and change the executor
75
+ # tier in each). bench plans once and runs the same frozen tests under each.
director/config.py ADDED
@@ -0,0 +1,111 @@
1
+ """Load and resolve `.director/config.toml` (the active profile).
2
+
3
+ Roles → tier model strings, deterministic gate commands, per-model pricing, and
4
+ run limits. Everything the orchestrator knows about a "model" comes from here;
5
+ switching executor models is a config edit, never a code change.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import tomllib
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+
14
# Every role that [tiers] must bind; load_file() raises ValueError for a config
# that is missing any of them.
ROLES = ("planner", "test_author", "executor", "explorer", "reviewer", "escalation")
15
+
16
+
17
@dataclass
class Config:
    """One parsed director TOML profile.

    Each field holds the matching TOML table as a plain dict. The methods and
    properties below read single settings out of those dicts and supply the
    defaults used when a key is absent.
    """

    path: Path  # file this config was loaded from
    tiers: dict[str, str]  # role -> "provider/model"
    gates: dict[str, str]  # "test"|"lint"|"typecheck" -> command ("" = skip)
    pricing: dict[str, dict]  # "provider/model" -> {"input": $/Mtok, "output": $/Mtok}
    limits: dict  # node_timeout_secs, cost_ceiling_usd, max_attempts, flake_runs
    sampling: dict  # role -> {temperature, top_p, top_k}
    local: dict  # providers.local: base_url, api_key
    review: dict  # two-stage review knobs (Phase 2.5)

    # --- convenience resolvers ----------------------------------------------
    def model_for(self, role: str) -> str:
        """Return the model string bound to `role`; KeyError if it is unbound."""
        if role in self.tiers:
            return self.tiers[role]
        raise KeyError(f"role '{role}' not bound in [tiers] of {self.path}")

    def price(self, model: str) -> dict:
        """Return the pricing table for `model`; unlisted models cost $0."""
        if model in self.pricing:
            return self.pricing[model]
        return {"input": 0.0, "output": 0.0}

    @property
    def node_timeout(self) -> int:
        """Value of `node_timeout_secs` as an int (default 900)."""
        seconds = self.limits.get("node_timeout_secs", 900)
        return int(seconds)

    @property
    def cost_ceiling(self) -> float:
        """Value of `cost_ceiling_usd` as a float; 0 means no ceiling."""
        ceiling = self.limits.get("cost_ceiling_usd", 0.0)
        return float(ceiling)

    @property
    def max_attempts(self) -> int:
        """Value of `max_attempts` as an int (default 3)."""
        attempts = self.limits.get("max_attempts", 3)
        return int(attempts)

    @property
    def flake_runs(self) -> int:
        """Number of times a node's tests run on the success path (Phase 3
        flake control). Defaults to 2; values below 1 are raised to 1, and 1
        disables the extra run."""
        runs = int(self.limits.get("flake_runs", 2))
        return runs if runs > 1 else 1

    # --- two-stage review (Phase 2.5) ---------------------------------------
    @property
    def stage_two_file_threshold(self) -> int:
        """File-count threshold for stage-two (code-quality) review: it fires
        when a node escalated OR its diff touched MORE than this many files.
        Default 3 (configurable)."""
        threshold = self.review.get("stage_two_file_threshold", 3)
        return int(threshold)

    @property
    def stage_one_llm(self) -> bool:
        """Whether stage one also runs the optional explorer-tier
        spec-compliance check. Off by default — the deterministic node gate
        already enforces the contract."""
        flag = self.review.get("stage_one_llm", False)
        return bool(flag)

    @property
    def stage_two_enabled(self) -> bool:
        """Whether stage-two review is switched on (default True)."""
        flag = self.review.get("stage_two", True)
        return bool(flag)
73
+
74
+
75
def load(repo: Path) -> Config:
    """Load the active config from <repo>/.director/config.toml.

    Raises FileNotFoundError, with a hint to run `director sync-agents`, when
    that file does not exist.
    """
    cfg_path = Path(repo).joinpath(".director", "config.toml")
    if cfg_path.exists():
        return load_file(cfg_path)
    raise FileNotFoundError(
        f"{cfg_path} not found. Run `director sync-agents` to seed it from the bundled "
        f"example, then edit it."
    )
84
+
85
+
86
+ def load_file(path: Path) -> Config:
87
+ """Load a Config from a specific TOML path (e.g. a profile). Used by
88
+ `director bench` to load each profile WITHOUT swapping the active
89
+ config.toml — run_plan/run_job take a Config object, so bench never has to
90
+ mutate (and thereby dirty) the tracked config.toml on disk."""
91
+ path = Path(path)
92
+ if not path.exists():
93
+ raise FileNotFoundError(f"{path} not found.")
94
+ with path.open("rb") as f:
95
+ data = tomllib.load(f)
96
+
97
+ tiers = data.get("tiers", {})
98
+ missing = [r for r in ROLES if r not in tiers]
99
+ if missing:
100
+ raise ValueError(f"[tiers] in {path} is missing roles: {', '.join(missing)}")
101
+
102
+ return Config(
103
+ path=path,
104
+ tiers=tiers,
105
+ gates=data.get("gates", {}),
106
+ pricing=data.get("pricing", {}),
107
+ limits=data.get("limits", {}),
108
+ sampling=data.get("sampling", {}),
109
+ local=data.get("providers", {}).get("local", {}),
110
+ review=data.get("review", {}),
111
+ )
director/cost.py ADDED
@@ -0,0 +1,84 @@
1
+ """Cost accounting — every model call is tagged with its role and resolved model.
2
+
3
+ Cost is computed from per-model pricing in config (local endpoints priced at $0
4
+ but still counted). Entries are appended to `.director/costs.jsonl` so accounting
5
+ survives across `plan` and `run` and is resumable.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import time
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
+ from director.config import Config
16
+
17
+
18
def cost_of(model: str, tokens: dict, cfg: Config) -> float:
    """Return the dollar cost of one call.

    `tokens` may carry "input" and "output" counts (a missing key counts as 0).
    Rates come from `cfg.price(model)` and are per one million tokens.
    """
    rates = cfg.price(model)
    input_cost = (tokens.get("input", 0) / 1_000_000) * float(rates.get("input", 0.0))
    output_cost = (tokens.get("output", 0) / 1_000_000) * float(rates.get("output", 0.0))
    return input_cost + output_cost
23
+
24
+
25
@dataclass
class CostEntry:
    """One recorded model call; serialized as one JSON line of the cost ledger."""

    role: str  # role the call was made under
    model: str  # resolved model string the call used
    input: int  # input token count
    output: int  # output token count
    cost: float  # cost computed for this call
    node: str | None = None  # node the call is attributed to, if any
    ts: float = 0.0  # time.time() value set when the entry is recorded
34
+
35
+
36
class CostLedger:
    """Append-only ledger backed by .director/costs.jsonl."""

    def __init__(self, path: Path):
        """Open the ledger at `path`, loading any entries already in the file."""
        self.path = Path(path)
        self.entries: list[CostEntry] = []
        if not self.path.exists():
            return
        # One JSON object per non-blank line.
        rows = (row for row in self.path.read_text().splitlines() if row.strip())
        self.entries.extend(CostEntry(**json.loads(row)) for row in rows)

    def record(
        self, *, role: str, model: str, tokens: dict, cfg: Config, node: str | None = None
    ) -> float:
        """Price one call, keep it in memory, append it to the file, and return its cost."""
        amount = cost_of(model, tokens, cfg)
        entry = CostEntry(
            role=role,
            model=model,
            input=int(tokens.get("input", 0)),
            output=int(tokens.get("output", 0)),
            cost=amount,
            node=node,
            ts=time.time(),
        )
        self.entries.append(entry)
        # The parent directory may not exist yet on a first write.
        self.path.parent.mkdir(parents=True, exist_ok=True)
        with self.path.open("a") as fh:
            fh.write(f"{json.dumps(entry.__dict__)}\n")
        return amount

    def total(self) -> float:
        """Sum of the cost of every entry."""
        return sum(entry.cost for entry in self.entries)

    def by_role(self) -> dict[str, dict]:
        """Totals of input, output, cost and call count, grouped by role."""
        return self._group(lambda entry: entry.role)

    def by_model(self) -> dict[str, dict]:
        """Totals of input, output, cost and call count, grouped by model."""
        return self._group(lambda entry: entry.model)

    def _group(self, key) -> dict[str, dict]:
        """Aggregate entries into one totals dict per value of `key(entry)`."""
        buckets: dict[str, dict] = {}
        for entry in self.entries:
            label = key(entry)
            if label not in buckets:
                buckets[label] = {"input": 0, "output": 0, "cost": 0.0, "calls": 0}
            bucket = buckets[label]
            bucket["input"] += entry.input
            bucket["output"] += entry.output
            bucket["cost"] += entry.cost
            bucket["calls"] += 1
        return buckets
director/dag.py ADDED
@@ -0,0 +1,113 @@
1
+ """Task-DAG validation and scheduling.
2
+
3
+ Guarantees that make cheap parallel execution safe:
4
+ - acyclic, with all depends_on referencing real nodes,
5
+ - any two *concurrent* nodes (neither depends on the other, directly or
6
+ transitively) have disjoint file allowlists — so parallel worktrees can never
7
+ produce conflicting edits to the same file,
8
+ - every node has a test_cmd (its gate).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from director.models import Plan
14
+
15
+
16
class DagError(ValueError):
    """Raised when a plan's task graph fails structural validation."""
18
+
19
+
20
+ def _descendants(nodes) -> dict[str, set[str]]:
21
+ """node id -> set of all transitive descendants (nodes that depend on it)."""
22
+ children: dict[str, set[str]] = {n.id: set() for n in nodes}
23
+ for n in nodes:
24
+ for dep in n.depends_on:
25
+ children[dep].add(n.id)
26
+ out: dict[str, set[str]] = {}
27
+ for n in nodes:
28
+ seen: set[str] = set()
29
+ stack = list(children[n.id])
30
+ while stack:
31
+ c = stack.pop()
32
+ if c in seen:
33
+ continue
34
+ seen.add(c)
35
+ stack.extend(children[c])
36
+ out[n.id] = seen
37
+ return out
38
+
39
+
40
def topo_order(plan: Plan) -> list[str]:
    """Return the plan's node ids in dependency order (Kahn's algorithm).

    The order is deterministic: among the nodes that are ready at any step, the
    one that appears earliest in `plan.nodes` is emitted first.

    Raises:
        DagError: if the graph contains a cycle.
    """
    import heapq

    ids = [n.id for n in plan.nodes]
    # Plan position of each id (first occurrence), used as the heap priority.
    position: dict[str, int] = {}
    for idx, node_id in enumerate(ids):
        position.setdefault(node_id, idx)

    indeg = dict.fromkeys(ids, 0)
    adj: dict[str, list[str]] = {i: [] for i in ids}
    for n in plan.nodes:
        for dep in n.depends_on:
            adj[dep].append(n.id)
            indeg[n.id] += 1

    # A min-heap of plan positions replaces re-sorting the ready list and
    # popping from its front on every step. Built in plan order, the initial
    # list is already sorted, hence already a valid heap.
    ready = [position[i] for i in ids if indeg[i] == 0]
    order: list[str] = []
    while ready:
        cur = ids[heapq.heappop(ready)]
        order.append(cur)
        for nxt in adj[cur]:
            indeg[nxt] -= 1
            if indeg[nxt] == 0:
                heapq.heappush(ready, position[nxt])

    if len(order) != len(ids):
        # Nodes never emitted are on, or downstream of, a cycle.
        emitted = set(order)
        stuck = [i for i in ids if i not in emitted]
        raise DagError(f"cycle detected among nodes: {', '.join(stuck)}")
    return order
64
+
65
+
66
def validate(plan: Plan) -> None:
    """Raise DagError on the first structural problem. Returns None if valid.

    Checks, in this order: the plan has nodes; node ids are unique; every
    `depends_on` entry names a real node; every node has a non-blank `test_cmd`
    and a non-empty file allowlist; the graph is acyclic; and any two nodes that
    are not ordered by a (transitive) dependency have disjoint file allowlists.
    """
    nodes = plan.nodes
    if not nodes:
        raise DagError("plan has no nodes")

    ids = [n.id for n in nodes]
    # Single pass for duplicate detection instead of calling ids.count per id.
    idset: set[str] = set()
    dupes: set[str] = set()
    for node_id in ids:
        if node_id in idset:
            dupes.add(node_id)
        else:
            idset.add(node_id)
    if dupes:
        raise DagError(f"duplicate node ids: {', '.join(sorted(dupes))}")

    for n in nodes:
        bad = [d for d in n.depends_on if d not in idset]
        if bad:
            raise DagError(f"node '{n.id}' depends_on unknown node(s): {', '.join(bad)}")
        if not n.test_cmd.strip():
            raise DagError(f"node '{n.id}' has no test_cmd (every node needs a gate)")
        if not n.files:
            raise DagError(f"node '{n.id}' has an empty file allowlist")

    topo_order(plan)  # raises on cycle

    # Concurrent nodes must have disjoint allowlists. Build each node's file set
    # once rather than once per pair.
    desc = _descendants(nodes)
    file_sets = {n.id: set(n.files) for n in nodes}
    for i, a in enumerate(ids):
        for b in ids[i + 1 :]:
            if b in desc[a] or a in desc[b]:
                # Ordered by a dependency, so the two never run at the same time.
                continue
            overlap = file_sets[a] & file_sets[b]
            if overlap:
                raise DagError(
                    f"concurrent nodes '{a}' and '{b}' share files {sorted(overlap)} "
                    f"— they could conflict in parallel. Add a depends_on or split files."
                )
103
+
104
+
105
def ready_nodes(plan: Plan, done: set[str], active: set[str]) -> list[str]:
    """Return, in plan order, the ids of nodes that can start now: every
    dependency is done and the node itself is neither done nor running."""
    return [
        node.id
        for node in plan.nodes
        if node.id not in done
        and node.id not in active
        and all(dep in done for dep in node.depends_on)
    ]
director/gates.py ADDED
@@ -0,0 +1,145 @@
1
+ """Deterministic merge gates — exit codes decide, never an LLM.
2
+
3
+ Per-node gate (in the node's worktree):
4
+ - `node.test_cmd` must pass (the node's contract), and
5
+ - the diff must touch ONLY the node's file allowlist (rejects out-of-scope edits,
6
+ which by construction also rejects any edit to the committed test files).
7
+
8
+ Integration gate (on the job branch, after all nodes merge):
9
+ - the full repo-wide suite + lint + typecheck from config.
10
+ The full suite is NOT run per node because sibling nodes' tests are intentionally
11
+ red until their own node executes.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import hashlib
17
+ import os
18
+ import subprocess
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+
22
+ from director import gitutil
23
+ from director.config import Config
24
+ from director.models import Node
25
+
26
# Don't let gate test runs write .pyc into the worktree (see opencode._CLEAN_ENV).
# The environment is copied once, at import time; _run() passes this mapping as
# the `env` of every gate command.
_CLEAN_ENV = {**os.environ, "PYTHONDONTWRITEBYTECODE": "1"}
28
+
29
+
30
@dataclass
class GateResult:
    """Verdict of a node gate or the integration gate."""

    ok: bool  # True when the gate passed
    failures: list[str] = field(default_factory=list)  # short labels of the failed checks
    detail: str = ""  # human-readable explanation, including captured command output
35
+
36
+
37
+ # Ephemeral build/test byproducts that are never source and must not count as
38
+ # out-of-scope edits. Running the tests (which the executor is told to do) creates
39
+ # `__pycache__/*.pyc`; without this filter the allowlist gate rejects every node
40
+ # whose repo doesn't already .gitignore them — a node can then never pass.
41
+ _IGNORABLE_DIRS = ("__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache")
42
+ _IGNORABLE_SUFFIXES = (".pyc", ".pyo")
43
+
44
+
45
+ def _is_ignorable(path: str) -> bool:
46
+ p = path.replace("\\", "/")
47
+ if p.endswith(_IGNORABLE_SUFFIXES):
48
+ return True
49
+ return any(d in p.split("/") for d in _IGNORABLE_DIRS)
50
+
51
+
52
def _run(cmd: str, cwd: Path, timeout: int) -> tuple[int, str]:
    """Run one gate command through the shell and return (exit code, output).

    Args:
        cmd: command line, passed to the shell as a single string.
        cwd: directory the command runs in.
        timeout: seconds to wait before giving up.

    Returns:
        The command's exit code with stdout followed by stderr, or
        `(124, <message>)` when the command times out.
    """
    try:
        proc = subprocess.run(
            cmd,
            cwd=str(cwd),
            shell=True,
            capture_output=True,
            text=True,
            # The output is only reported back, never parsed, so undecodable
            # bytes are replaced rather than raising UnicodeDecodeError and
            # aborting the gate.
            errors="replace",
            timeout=timeout,
            env=_CLEAN_ENV,
        )
    except subprocess.TimeoutExpired:
        return 124, f"(gate command timed out after {timeout}s: {cmd})"
    return proc.returncode, (proc.stdout + proc.stderr)
66
+
67
+
68
def test_files_intact(node: Node, worktree: Path) -> list[str]:
    """Return the contract test files that are no longer intact.

    `node.test_hashes` maps a path (relative to `worktree`) to the SHA-256 hex
    digest captured at plan time. A path is reported when the file is missing,
    is not a regular file, or hashes to a different digest. This makes the
    executor's watch-it-fail mandate enforceable, not advisory: a node that
    edited its own tests can never be marked done.

    Returns:
        The offending paths in `test_hashes` order; empty when all are intact
        or when the node has no recorded hashes.
    """
    tampered = []
    for rel_path, expected in (node.test_hashes or {}).items():
        target = worktree / rel_path
        # is_file() rather than exists(): a directory at this path would make
        # read_bytes() raise instead of being reported as tampering.
        actual = hashlib.sha256(target.read_bytes()).hexdigest() if target.is_file() else None
        if actual != expected:
            tampered.append(rel_path)
    return tampered


# The name starts with "test_", so pytest would otherwise collect this helper as
# a test in any module that imports it.
test_files_intact.__test__ = False
80
+
81
+
82
def node_gate(node: Node, worktree: Path, cfg: Config) -> GateResult:
    """Run the per-node merge gate in `worktree`, stopping at the first failure.

    The checks run in this order:
      1. the contract test files are byte-for-byte intact,
      2. `node.test_cmd` exits 0,
      3. every changed path is in `node.files` or is an ignorable byproduct,
      4. the tests pass again on each of the `cfg.flake_runs - 1` re-runs.
    """
    timeout = cfg.node_timeout

    # 1. Red-green hardening: the executor must not have touched the tests.
    tampered = test_files_intact(node, worktree)
    if tampered:
        names = ", ".join(sorted(tampered))
        return GateResult(
            False,
            ["test files modified"],
            f"The executor changed the contract (test files): {names}"
            ". Tests are immutable — implement the source instead.",
        )

    # 2. The node's own tests are its contract.
    rc, out = _run(node.test_cmd, worktree, timeout)
    if rc != 0:
        return GateResult(False, ["node tests"], f"$ {node.test_cmd}\n{out}")

    # 3. Allowlist: only node.files may have changed (tests are committed, so
    #    any edit to them shows up here as out-of-scope).
    allowed = set(node.files)
    changed = gitutil.changed_paths(worktree)
    stray = [p for p in changed if not (p in allowed or _is_ignorable(p))]
    if stray:
        listing = ", ".join(sorted(stray))
        return GateResult(
            False,
            ["out-of-scope edits"],
            f"Modified files outside the allowlist (revert these): {listing}"
            f"\nAllowed: {sorted(allowed)}",
        )

    # 4. Flake control (Phase 3): a suite that passed once must pass again. A
    #    nonzero re-run means it is order-dependent, time/random-sensitive, or
    #    relies on the first run's side effects, so the node is NOT safe to merge.
    total_runs = cfg.flake_runs
    for attempt in range(2, total_runs + 1):
        rerun_rc, rerun_out = _run(node.test_cmd, worktree, timeout)
        if rerun_rc == 0:
            continue
        return GateResult(
            False,
            ["flaky tests"],
            f"Tests passed once but FAILED on re-run {attempt}/{total_runs} — "
            f"the suite is flaky and the node cannot merge.\n$ {node.test_cmd}\n{rerun_out}",
        )

    return GateResult(True)
132
+
133
+
134
def integration_gate(repo: Path, cfg: Config) -> GateResult:
    """Run the configured repo-wide gates ("test", "lint", "typecheck") in `repo`.

    A gate whose command is missing or blank is skipped. Every configured gate
    runs even after an earlier one fails. The result lists the names of the
    failed gates, with the last 2000 characters of each failure's output as
    detail.
    """
    timeout = cfg.node_timeout
    failed: list[str] = []
    reports: list[str] = []
    for gate in ("test", "lint", "typecheck"):
        command = cfg.gates.get(gate, "").strip()
        if command:
            code, output = _run(command, repo, timeout)
            if code != 0:
                failed.append(gate)
                reports.append(f"$ {command}\n{output[-2000:]}")
    return GateResult(not failed, failed, "\n".join(reports))