PyPI - director-cli - Versions diffs - 0.3.0__py3-none-any.whl - Mend

director-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

director/README.md +124 -0
director/__init__.py +10 -0
director/__main__.py +4 -0
director/agent_templates/brainstorm.md +44 -0
director/agent_templates/executor.md +37 -0
director/agent_templates/explorer.md +24 -0
director/agent_templates/opencode.json +39 -0
director/agent_templates/planner.md +60 -0
director/agent_templates/reviewer.md +46 -0
director/agent_templates/test-author.md +29 -0
director/bench.py +234 -0
director/cli.py +166 -0
director/config.example.toml +75 -0
director/config.py +111 -0
director/cost.py +84 -0
director/dag.py +113 -0
director/gates.py +145 -0
director/gitutil.py +83 -0
director/metrics.py +48 -0
director/models.py +106 -0
director/opencode.py +231 -0
director/plan.py +523 -0
director/report.py +103 -0
director/review.py +153 -0
director/run.py +444 -0
director/setup.py +101 -0
director/state.py +43 -0
director_cli-0.3.0.dist-info/METADATA +174 -0
director_cli-0.3.0.dist-info/RECORD +32 -0
director_cli-0.3.0.dist-info/WHEEL +4 -0
director_cli-0.3.0.dist-info/entry_points.txt +2 -0
director_cli-0.3.0.dist-info/licenses/LICENSE +21 -0

director/cli.py ADDED Viewed

@@ -0,0 +1,166 @@
+"""director CLI — plan | run | status | bench | sync-agents."""
+from __future__ import annotations
+import argparse
+import sys
+from director import __version__, config
+from director.report import run_summary, status_table
+from director.setup import sync_agents
+def _log(msg: str) -> None:
+    """Write one progress line to stderr and flush so it shows up immediately."""
+    sys.stderr.write(f"{msg}\n")
+    sys.stderr.flush()
+def cmd_plan(args) -> int:
+    """Handle `director plan`: run the planning pass and print its message.
+
+    Always returns 0; errors propagate as exceptions from the callees.
+    """
+    from director.plan import run_plan
+    repo = args.repo
+    active_cfg = config.load(repo)
+    # "continue" is a Python keyword, so the flag cannot be read as args.continue.
+    resume = getattr(args, "continue")
+    outcome = run_plan(
+        args.task,
+        repo,
+        active_cfg,
+        _log,
+        auto=args.auto,
+        critique=not args.no_critique,
+        cont=resume,
+    )
+    print(outcome.message)
+    return 0
+def cmd_run(args) -> int:
+    """Handle `director run`: execute the planned DAG and print the run summary.
+
+    Returns 1 when the result's "failed" entry is truthy or its
+    "integration_ok" entry is falsy; otherwise 0.
+    """
+    from director.run import run_job
+    cfg = config.load(args.repo)
+    # --max-attempts defaults to 0, which falls through to the config value.
+    attempts = args.max_attempts or cfg.max_attempts
+    outcome = run_job(
+        args.repo,
+        cfg,
+        parallel=args.parallel,
+        max_attempts=attempts,
+        log=_log,
+    )
+    print(run_summary(outcome))
+    if outcome["failed"] or not outcome["integration_ok"]:
+        return 1
+    return 0
+def cmd_status(args) -> int:
+    """Handle `director status`: print the status table for the repo; return 0."""
+    table = status_table(args.repo)
+    print(table)
+    return 0
+def cmd_bench(args) -> int:
+    """Handle `director bench`: run one task under each listed profile.
+
+    Returns 2 (after logging an error) when --profiles yields no names;
+    otherwise prints the bench report and returns 0.
+    """
+    from director.bench import bench_report, run_bench
+    # Split the comma-separated list, dropping blank entries such as "a,,b".
+    names = []
+    for raw in args.profiles.split(","):
+        name = raw.strip()
+        if name:
+            names.append(name)
+    if not names:
+        _log("director: error: --profiles is empty")
+        return 2
+    outcome = run_bench(
+        args.task,
+        args.repo,
+        names,
+        _log,
+        plan_profile=args.plan_profile,
+        parallel=args.parallel,
+        max_attempts=args.max_attempts or 0,
+    )
+    print(bench_report(outcome))
+    return 0
+def cmd_sync_agents(args) -> int:
+    """Handle `director sync-agents`: print what sync_agents returned; return 0."""
+    written = sync_agents(args.repo)
+    indent = "\n  "
+    print("Synced:" + indent + indent.join(written))
+    return 0
+def build_parser() -> argparse.ArgumentParser:
+    """Build the argparse parser for the `director` CLI.
+
+    Registers a --version flag plus the plan, run, status, bench and
+    sync-agents subcommands. Each subparser stores its handler under `func`
+    via set_defaults, which is what the caller dispatches on.
+    """
+    p = argparse.ArgumentParser(prog="director", description=__doc__)
+    p.add_argument("--version", action="version", version=f"director {__version__}")
+    # required=True: invoking `director` without a subcommand is a usage error.
+    sub = p.add_subparsers(dest="cmd", required=True)
+    # --- plan ---
+    pp = sub.add_parser("plan", help="brainstorm → spec → test-gated DAG, with approval gates")
+    pp.add_argument(
+        "task", nargs="?", default=None, help="the task description (omit with --continue)"
+    )
+    pp.add_argument("--repo", default=".", help="target repo (default: .)")
+    # Stored under the attribute name "continue", a Python keyword, so it has to
+    # be read back with getattr(args, "continue").
+    pp.add_argument(
+        "--continue",
+        action="store_true",
+        dest="continue",
+        help="resume after approving/editing the current gate artifact",
+    )
+    pp.add_argument(
+        "--auto", action="store_true", help="planner self-critiques at each gate; no human pause"
+    )
+    pp.add_argument(
+        "--no-critique",
+        action="store_true",
+        help="with --auto: gates auto-pass without even self-critique",
+    )
+    pp.set_defaults(func=cmd_plan)
+    # --- run ---
+    pr = sub.add_parser("run", help="execute the planned DAG")
+    pr.add_argument("--repo", default=".")
+    pr.add_argument(
+        "--parallel",
+        type=int,
+        default=1,
+        help="max concurrent nodes (default 1; local single-model endpoints want 1)",
+    )
+    # Default 0 acts as "not given": cmd_run substitutes the config's max_attempts.
+    pr.add_argument(
+        "--max-attempts",
+        type=int,
+        default=0,
+        help="executor attempts before escalation (default: config)",
+    )
+    pr.set_defaults(func=cmd_run)
+    # --- status ---
+    ps = sub.add_parser("status", help="show per-node progress + cost")
+    ps.add_argument("--repo", default=".")
+    ps.set_defaults(func=cmd_status)
+    # --- bench ---
+    pb = sub.add_parser("bench", help="run one task across profiles; diff cost/quality/wall-time")
+    pb.add_argument("task", help="the task description (planned once, run per profile)")
+    pb.add_argument("--repo", default=".")
+    pb.add_argument(
+        "--profiles",
+        required=True,
+        help="comma-separated profile names, e.g. all-frontier,cheap-cloud,local-first",
+    )
+    pb.add_argument(
+        "--plan-profile",
+        default=None,
+        help="profile used for the single shared planning pass "
+        "(default: all-frontier if listed, else the first profile)",
+    )
+    pb.add_argument("--parallel", type=int, default=1, help="max concurrent nodes per run")
+    # Default 0 is passed through to run_bench unchanged by cmd_bench.
+    pb.add_argument(
+        "--max-attempts",
+        type=int,
+        default=0,
+        help="executor attempts before escalation (default: per-profile config)",
+    )
+    pb.set_defaults(func=cmd_bench)
+    # --- sync-agents ---
+    psa = sub.add_parser("sync-agents", help="(re)install role agents into <repo>/.opencode")
+    psa.add_argument("--repo", default=".")
+    psa.set_defaults(func=cmd_sync_agents)
+    return p
+def main(argv=None) -> int:
+    """CLI entry point: parse `argv`, run the chosen subcommand, return its exit code.
+
+    FileNotFoundError, ValueError and RuntimeError raised by a handler are
+    logged to stderr as `director: error: ...` and mapped to exit code 2.
+    """
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    handler = args.func
+    try:
+        return handler(args)
+    except (FileNotFoundError, ValueError, RuntimeError) as err:
+        _log(f"director: error: {err}")
+        return 2
+# Script entry: exit the process with main()'s return code.
+if __name__ == "__main__":
+    raise SystemExit(main())

director/config.example.toml ADDED Viewed

@@ -0,0 +1,75 @@
+# director configuration — copy to .director/config.toml and edit.
+#
+# `director sync-agents` drops this file at .director/config.toml for you if none
+# exists. Everything director knows about a "model" comes from here; switching
+# models is a config edit, never a code change.
+#
+# The example below is a **local-first** setup (executor on a local LM Studio
+# endpoint, planning/tests on a cloud frontier model). Inline comments show how to
+# turn it into a zero-local-infra "cheap-cloud" setup or an "all-frontier" baseline.
+[tiers]
+# Roles bound to resolved OpenCode model strings ("provider/model"). Code, prompts,
+# and logs refer ONLY to these role names — never to a specific model.
+planner     = "amazon-bedrock/us.anthropic.claude-opus-4-7"   # decomposition + DAG (use your strongest model)
+test_author = "amazon-bedrock/us.anthropic.claude-opus-4-7"   # tests are the contract → strongest
+executor    = "lmstudio/qwen3.6-27b-mtp"                      # implements each node. The cheap tier.
+explorer    = "lmstudio/qwen3.6-27b-mtp"                      # read-only recon (reuse the cheap tier)
+reviewer    = "amazon-bedrock/anthropic.claude-sonnet-4-6"    # stage-two code review (NEVER the cheap/local tier)
+escalation  = "amazon-bedrock/anthropic.claude-sonnet-4-6"    # per-task fallback when the executor exhausts attempts
+#
+# cheap-cloud (no local GPU): set executor/explorer to a low-cost cloud model and
+#   delete the [providers.local] block, e.g.
+#     executor = "openrouter/deepseek/deepseek-v4-pro"
+# all-frontier baseline (expensive control): set executor = reviewer = escalation
+#   to the same frontier model as the planner.
+# Only needed if a tier above points at a local OpenAI-compatible endpoint.
+[providers.local]
+base_url = "http://localhost:1234/v1"   # e.g. LM Studio; load a model there first
+api_key  = "sk-noauth"
+# Sampling overrides per role (optional; match the model card). Example for a
+# Qwen3.6 "thinking" executor:
+[sampling.executor]
+temperature = 0.6
+top_p       = 0.95
+top_k       = 20
+# Deterministic repo-wide gates. The exit code is the verdict — set these to your
+# project's real commands. Leave a value empty ("") to skip that gate.
+[gates]
+test      = "pytest -q"
+lint      = "ruff check ."
+typecheck = "mypy ."
+# Per-resolved-model pricing for cost accounting ($ / 1M tokens). Local = $0.
+# Fill these from your provider's current price sheet; any model not listed is
+# counted at $0.
+[pricing."lmstudio/qwen3.6-27b-mtp"]
+input = 0.0
+output = 0.0
+[pricing."amazon-bedrock/us.anthropic.claude-opus-4-7"]
+input = 15.0
+output = 75.0
+[pricing."amazon-bedrock/anthropic.claude-sonnet-4-6"]
+input = 3.0
+output = 15.0
+[limits]
+node_timeout_secs = 900      # per model/gate call (15 min)
+cost_ceiling_usd  = 10.0     # abort the run if exceeded (local = $0 never trips it)
+max_attempts      = 3        # executor attempts before escalating a node one tier up
+flake_runs        = 2        # total times a node's tests are run on the success path (1 disables the re-run)
+# Two-stage review. Stage one (spec compliance) is the always-on deterministic node
+# gate. Stage two (code quality, reviewer tier) is cost-gated:
+[review]
+stage_two                = true
+stage_two_file_threshold = 3      # stage two fires when a node escalated OR its diff touched > N files
+stage_one_llm            = false  # optional advisory explorer-tier compliance check (off: the gate already enforces it)
+# --- director bench ---------------------------------------------------------
+# To compare profiles with `director bench "<task>" --profiles a,b,c`, create
+# .director/profiles/<name>.toml variants (copy this file and change the executor
+# tier in each). bench plans once and runs the same frozen tests under each.

director/config.py ADDED Viewed

@@ -0,0 +1,111 @@
+"""Load and resolve `.director/config.toml` (the active profile).
+Roles → tier model strings, deterministic gate commands, per-model pricing, and
+run limits. Everything the orchestrator knows about a "model" comes from here;
+switching executor models is a config edit, never a code change.
+"""
+from __future__ import annotations
+import tomllib
+from dataclasses import dataclass
+from pathlib import Path
+ROLES = ("planner", "test_author", "executor", "explorer", "reviewer", "escalation")
+@dataclass
+class Config:
+    """One resolved profile: role bindings, gate commands, pricing and limits."""
+    path: Path  # TOML file this config was loaded from
+    tiers: dict[str, str]  # role -> "provider/model"
+    gates: dict[str, str]  # "test"|"lint"|"typecheck" -> command ("" = skip)
+    pricing: dict[str, dict]  # "provider/model" -> {"input": $/Mtok, "output": $/Mtok}
+    limits: dict  # node_timeout_secs, cost_ceiling_usd, max_attempts, flake_runs
+    sampling: dict  # role -> {temperature, top_p, top_k}
+    local: dict  # providers.local: base_url, api_key
+    review: dict  # two-stage review knobs (Phase 2.5)
+    # --- lookups ------------------------------------------------------------
+    def model_for(self, role: str) -> str:
+        """Return the model string bound to `role`; KeyError if it is unbound."""
+        if role in self.tiers:
+            return self.tiers[role]
+        raise KeyError(f"role '{role}' not bound in [tiers] of {self.path}")
+    def price(self, model: str) -> dict:
+        """Return the pricing entry for `model`, or zero prices when unlisted."""
+        if model in self.pricing:
+            return self.pricing[model]
+        return {"input": 0.0, "output": 0.0}
+    # --- limits -------------------------------------------------------------
+    @property
+    def node_timeout(self) -> int:
+        """`limits.node_timeout_secs` as an int (default 900)."""
+        raw = self.limits.get("node_timeout_secs", 900)
+        return int(raw)
+    @property
+    def cost_ceiling(self) -> float:
+        """`limits.cost_ceiling_usd` as a float; the default 0 means no ceiling."""
+        raw = self.limits.get("cost_ceiling_usd", 0.0)
+        return float(raw)
+    @property
+    def max_attempts(self) -> int:
+        """`limits.max_attempts` as an int (default 3)."""
+        raw = self.limits.get("max_attempts", 3)
+        return int(raw)
+    @property
+    def flake_runs(self) -> int:
+        """Total number of times a node's tests run on the success path (Phase 3
+        flake control). Defaults to 2; values below 1 are clamped to 1, which
+        means no extra run."""
+        runs = int(self.limits.get("flake_runs", 2))
+        return runs if runs > 1 else 1
+    # --- two-stage review (Phase 2.5) ---------------------------------------
+    @property
+    def stage_two_file_threshold(self) -> int:
+        """File-count threshold for stage-two (code-quality) review: it fires
+        when a node escalated OR its diff touched MORE than this many files.
+        Default 3."""
+        raw = self.review.get("stage_two_file_threshold", 3)
+        return int(raw)
+    @property
+    def stage_one_llm(self) -> bool:
+        """Whether to run the optional explorer-tier spec-compliance check in
+        stage one. Off by default — the deterministic node gate already
+        enforces the contract."""
+        raw = self.review.get("stage_one_llm", False)
+        return bool(raw)
+    @property
+    def stage_two_enabled(self) -> bool:
+        """Whether stage-two review is enabled (`review.stage_two`, default on)."""
+        raw = self.review.get("stage_two", True)
+        return bool(raw)
+def load(repo: Path) -> Config:
+    """Load the active profile from <repo>/.director/config.toml.
+
+    Raises FileNotFoundError, pointing at `director sync-agents`, when the
+    file does not exist.
+    """
+    cfg_path = Path(repo, ".director", "config.toml")
+    if cfg_path.exists():
+        return load_file(cfg_path)
+    raise FileNotFoundError(
+        f"{cfg_path} not found. Run `director sync-agents` to seed it from the bundled "
+        f"example, then edit it."
+    )
+def load_file(path: Path) -> Config:
+    """Build a Config from one specific TOML file (e.g. a bench profile).
+
+    `director bench` uses this to load each profile directly, so it never has
+    to swap — and thereby dirty — the tracked config.toml on disk.
+
+    Raises FileNotFoundError if `path` does not exist, and ValueError if the
+    [tiers] table lacks any of the names in ROLES.
+    """
+    toml_path = Path(path)
+    if not toml_path.exists():
+        raise FileNotFoundError(f"{toml_path} not found.")
+    with toml_path.open("rb") as fh:
+        raw = tomllib.load(fh)
+    tiers = raw.get("tiers", {})
+    absent = ", ".join(role for role in ROLES if role not in tiers)
+    if absent:
+        raise ValueError(f"[tiers] in {toml_path} is missing roles: {absent}")
+    providers = raw.get("providers", {})
+    return Config(
+        path=toml_path,
+        tiers=tiers,
+        gates=raw.get("gates", {}),
+        pricing=raw.get("pricing", {}),
+        limits=raw.get("limits", {}),
+        sampling=raw.get("sampling", {}),
+        local=providers.get("local", {}),
+        review=raw.get("review", {}),
+    )

director/cost.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Cost accounting — every model call is tagged with its role and resolved model.
+Cost is computed from per-model pricing in config (local endpoints priced at $0
+but still counted). Entries are appended to `.director/costs.jsonl` so accounting
+survives across `plan` and `run` and is resumable.
+"""
+from __future__ import annotations
+import json
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from director.config import Config
+def cost_of(model: str, tokens: dict, cfg: Config) -> float:
+    """Cost of one call: input/output token counts times the model's per-1M prices.
+
+    Missing token counts and missing price fields both count as 0.
+    """
+    rates = cfg.price(model)
+    input_cost = (tokens.get("input", 0) / 1_000_000) * float(rates.get("input", 0.0))
+    output_cost = (tokens.get("output", 0) / 1_000_000) * float(rates.get("output", 0.0))
+    return input_cost + output_cost
+@dataclass
+class CostEntry:
+    """One recorded model call; serialized as one JSON line of the cost ledger."""
+    role: str  # role the call was made under
+    model: str  # resolved model string (the pricing lookup key)
+    input: int  # input token count
+    output: int  # output token count
+    cost: float  # cost computed by cost_of() when the entry was recorded
+    node: str | None = None  # optional node tag; None when not supplied
+    ts: float = 0.0  # time.time() at record time
+class CostLedger:
+    """Append-only cost ledger persisted as JSON lines (.director/costs.jsonl)."""
+    def __init__(self, path: Path):
+        """Open the ledger at `path`, loading any entries already on disk."""
+        self.path = Path(path)
+        self.entries: list[CostEntry] = []
+        if not self.path.exists():
+            return
+        rows = self.path.read_text().splitlines()
+        # Blank lines are skipped; every other line is one serialized CostEntry.
+        self.entries.extend(CostEntry(**json.loads(row)) for row in rows if row.strip())
+    def record(
+        self, *, role: str, model: str, tokens: dict, cfg: Config, node: str | None = None
+    ) -> float:
+        """Price one call, keep it in memory, append it to the file; return its cost."""
+        amount = cost_of(model, tokens, cfg)
+        entry = CostEntry(
+            role=role,
+            model=model,
+            input=int(tokens.get("input", 0)),
+            output=int(tokens.get("output", 0)),
+            cost=amount,
+            node=node,
+            ts=time.time(),
+        )
+        self.entries.append(entry)
+        # The ledger directory may not exist yet on the first write.
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+        with self.path.open("a") as fh:
+            fh.write(json.dumps(vars(entry)) + "\n")
+        return amount
+    def total(self) -> float:
+        """Sum of the cost of every entry loaded or recorded so far."""
+        return sum(entry.cost for entry in self.entries)
+    def by_role(self) -> dict[str, dict]:
+        """Token, cost and call totals grouped by role."""
+        return self._group(lambda entry: entry.role)
+    def by_model(self) -> dict[str, dict]:
+        """Token, cost and call totals grouped by model string."""
+        return self._group(lambda entry: entry.model)
+    def _group(self, key) -> dict[str, dict]:
+        """Aggregate entries into {key(entry): {"input", "output", "cost", "calls"}}."""
+        groups: dict[str, dict] = {}
+        for entry in self.entries:
+            label = key(entry)
+            if label not in groups:
+                groups[label] = {"input": 0, "output": 0, "cost": 0.0, "calls": 0}
+            bucket = groups[label]
+            bucket["input"] += entry.input
+            bucket["output"] += entry.output
+            bucket["cost"] += entry.cost
+            bucket["calls"] += 1
+        return groups

director/dag.py ADDED Viewed

@@ -0,0 +1,113 @@
+"""Task-DAG validation and scheduling.
+Guarantees that make cheap parallel execution safe:
+  - acyclic, with all depends_on referencing real nodes,
+  - any two *concurrent* nodes (neither depends on the other, directly or
+    transitively) have disjoint file allowlists — so parallel worktrees can never
+    produce conflicting edits to the same file,
+  - every node has a test_cmd (its gate).
+"""
+from __future__ import annotations
+import heapq
+from director.models import Plan
+class DagError(ValueError):
+    """Raised when a plan's task DAG is structurally invalid."""
+    pass
+def _descendants(nodes) -> dict[str, set[str]]:
+    """Map each node id to every node that depends on it, directly or transitively."""
+    dependents: dict[str, set[str]] = {node.id: set() for node in nodes}
+    for node in nodes:
+        for parent in node.depends_on:
+            dependents[parent].add(node.id)
+    result: dict[str, set[str]] = {}
+    for node in nodes:
+        # Iterative depth-first walk over the "is depended on by" edges.
+        reached: set[str] = set()
+        pending = list(dependents[node.id])
+        while pending:
+            current = pending.pop()
+            if current not in reached:
+                reached.add(current)
+                pending.extend(dependents[current])
+        result[node.id] = reached
+    return result
+def topo_order(plan: Plan) -> list[str]:
+    """Return node ids in dependency order (Kahn's algorithm).
+
+    Ties between nodes that are ready at the same time are broken by their
+    position in `plan.nodes`, so the result is deterministic.
+
+    Raises:
+        DagError: on a cycle, or when a depends_on entry names an unknown node.
+    """
+    ids = [n.id for n in plan.nodes]
+    # Plan position of each id (first occurrence) — the tie-break key.
+    pos: dict[str, int] = {}
+    for idx, node_id in enumerate(ids):
+        pos.setdefault(node_id, idx)
+    indeg = dict.fromkeys(ids, 0)
+    adj: dict[str, list[str]] = {i: [] for i in ids}
+    for n in plan.nodes:
+        for dep in n.depends_on:
+            if dep not in adj:
+                # Fail with the module's own error type instead of a bare KeyError.
+                raise DagError(f"node '{n.id}' depends_on unknown node(s): {dep}")
+            adj[dep].append(n.id)
+            indeg[n.id] += 1
+    # Min-heap of plan positions: popping always yields the earliest-listed ready
+    # node without re-sorting the ready list on every iteration.
+    ready = [pos[i] for i in ids if indeg[i] == 0]
+    heapq.heapify(ready)
+    order: list[str] = []
+    while ready:
+        cur = ids[heapq.heappop(ready)]
+        order.append(cur)
+        for nxt in adj[cur]:
+            indeg[nxt] -= 1
+            if indeg[nxt] == 0:
+                heapq.heappush(ready, pos[nxt])
+    if len(order) != len(ids):
+        # Anything never emitted still has an unmet dependency, i.e. sits on a cycle
+        # or downstream of one.
+        placed = set(order)
+        stuck = [i for i in ids if i not in placed]
+        raise DagError(f"cycle detected among nodes: {', '.join(stuck)}")
+    return order
+def validate(plan: Plan) -> None:
+    """Check the plan's DAG and raise DagError on the first problem found.
+
+    Checks run in this order: non-empty plan, unique ids, then per node (known
+    dependencies, non-blank test_cmd, non-empty file allowlist), then
+    acyclicity, then disjoint allowlists for every pair of nodes where neither
+    is a descendant of the other. Returns None when all pass.
+    """
+    nodes = plan.nodes
+    if not nodes:
+        raise DagError("plan has no nodes")
+    ids = [node.id for node in nodes]
+    seen: set[str] = set()
+    repeated: set[str] = set()
+    for node_id in ids:
+        if node_id in seen:
+            repeated.add(node_id)
+        else:
+            seen.add(node_id)
+    if repeated:
+        raise DagError(f"duplicate node ids: {', '.join(sorted(repeated))}")
+    known = set(ids)
+    for node in nodes:
+        unknown = [dep for dep in node.depends_on if dep not in known]
+        if unknown:
+            raise DagError(f"node '{node.id}' depends_on unknown node(s): {', '.join(unknown)}")
+        if not node.test_cmd.strip():
+            raise DagError(f"node '{node.id}' has no test_cmd (every node needs a gate)")
+        if not node.files:
+            raise DagError(f"node '{node.id}' has an empty file allowlist")
+    # Called only for its side effect: it raises DagError on a cycle.
+    topo_order(plan)
+    # Nodes unrelated by dependency may run concurrently, so their allowlists
+    # must not intersect.
+    reach = _descendants(nodes)
+    lookup = {node.id: node for node in nodes}
+    for idx, first in enumerate(ids):
+        for second in ids[idx + 1 :]:
+            if second in reach[first] or first in reach[second]:
+                continue
+            shared = set(lookup[first].files) & set(lookup[second].files)
+            if not shared:
+                continue
+            raise DagError(
+                f"concurrent nodes '{first}' and '{second}' share files {sorted(shared)} "
+                f"— they could conflict in parallel. Add a depends_on or split files."
+            )
+def ready_nodes(plan: Plan, done: set[str], active: set[str]) -> list[str]:
+    """Ids, in plan order, of nodes that are neither done nor active and whose
+    dependencies are all done."""
+    return [
+        node.id
+        for node in plan.nodes
+        if node.id not in done
+        and node.id not in active
+        and all(dep in done for dep in node.depends_on)
+    ]

director/gates.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""Deterministic merge gates — exit codes decide, never an LLM.
+Per-node gate (in the node's worktree):
+  - `node.test_cmd` must pass (the node's contract), and
+  - the diff must touch ONLY the node's file allowlist (rejects out-of-scope edits,
+    which by construction also rejects any edit to the committed test files).
+Integration gate (on the job branch, after all nodes merge):
+  - the full repo-wide suite + lint + typecheck from config.
+The full suite is NOT run per node because sibling nodes' tests are intentionally
+red until their own node executes.
+"""
+from __future__ import annotations
+import hashlib
+import os
+import subprocess
+from dataclasses import dataclass, field
+from pathlib import Path
+from director import gitutil
+from director.config import Config
+from director.models import Node
+# Don't let gate test runs write .pyc into the worktree (see opencode._CLEAN_ENV).
+_CLEAN_ENV = {**os.environ, "PYTHONDONTWRITEBYTECODE": "1"}
+@dataclass
+class GateResult:
+    """Outcome of one gate run."""
+    ok: bool  # True when the gate passed
+    failures: list[str] = field(default_factory=list)  # short labels of what failed, e.g. "node tests"
+    detail: str = ""  # explanation and/or command output for the failures
+# Build/test byproducts that are never source, so they must not be treated as
+# out-of-scope edits. Running the tests (which the executor is told to do)
+# creates `__pycache__/*.pyc`; unless these are filtered out, the allowlist gate
+# rejects every node in a repo that does not already .gitignore them.
+_IGNORABLE_DIRS = ("__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache")
+_IGNORABLE_SUFFIXES = (".pyc", ".pyo")
+def _is_ignorable(path: str) -> bool:
+    """True if `path` ends in a bytecode suffix or has a cache directory as one
+    of its segments. Backslashes are treated as path separators."""
+    normalized = path.replace("\\", "/")
+    if normalized.endswith(_IGNORABLE_SUFFIXES):
+        return True
+    segments = normalized.split("/")
+    for cache_dir in _IGNORABLE_DIRS:
+        if cache_dir in segments:
+            return True
+    return False
+def _run(cmd: str, cwd: Path, timeout: int) -> tuple[int, str]:
+    """Run `cmd` through the shell in `cwd`; return (exit code, stdout + stderr).
+
+    A command that exceeds `timeout` seconds yields exit code 124 and a short
+    message in place of its output.
+    """
+    try:
+        proc = subprocess.run(
+            cmd,
+            cwd=str(cwd),
+            shell=True,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            env=_CLEAN_ENV,
+        )
+    except subprocess.TimeoutExpired:
+        return 124, f"(gate command timed out after {timeout}s: {cmd})"
+    combined = proc.stdout + proc.stderr
+    return proc.returncode, combined
+def test_files_intact(node: Node, worktree: Path) -> list[str]:
+    """Return the test-file paths that no longer match `node.test_hashes`.
+
+    Each recorded path is re-hashed (SHA-256) inside `worktree`; a missing
+    file or a different digest counts as tampering. An empty list means the
+    contract is intact, which is what lets the gate refuse any node that
+    edited its own tests.
+    """
+    recorded = node.test_hashes or {}
+    changed = []
+    for rel_path, expected in recorded.items():
+        target = worktree / rel_path
+        if target.exists():
+            digest = hashlib.sha256(target.read_bytes()).hexdigest()
+        else:
+            digest = None
+        if digest != expected:
+            changed.append(rel_path)
+    return changed
+def node_gate(node: Node, worktree: Path, cfg: Config) -> GateResult:
+    """Per-node merge gate, evaluated in the node's worktree.
+
+    Checks run in order and the first failure is returned:
+      1. the test files still match their recorded hashes,
+      2. `node.test_cmd` exits 0,
+      3. no changed path lies outside `node.files` (cache artifacts excepted),
+      4. the tests pass again on each of the `cfg.flake_runs - 1` re-runs.
+    """
+    limit = cfg.node_timeout
+    # red-green hardening: the contract (test files) must be byte-for-byte intact.
+    modified_tests = test_files_intact(node, worktree)
+    if modified_tests:
+        names = ", ".join(sorted(modified_tests))
+        return GateResult(
+            False,
+            ["test files modified"],
+            "The executor changed the contract (test files): "
+            + names
+            + ". Tests are immutable — implement the source instead.",
+        )
+    code, output = _run(node.test_cmd, worktree, limit)
+    if code != 0:
+        return GateResult(False, ["node tests"], f"$ {node.test_cmd}\n{output}")
+    # allowlist: anything changed outside node.files is an out-of-scope edit.
+    permitted = set(node.files)
+    stray = [
+        p
+        for p in gitutil.changed_paths(worktree)
+        if p not in permitted and not _is_ignorable(p)
+    ]
+    if stray:
+        return GateResult(
+            False,
+            ["out-of-scope edits"],
+            "Modified files outside the allowlist (revert these): "
+            + ", ".join(sorted(stray))
+            + f"\nAllowed: {sorted(permitted)}",
+        )
+    # flake control (Phase 3): the first pass above counts as run 1; any failing
+    # re-run means the suite is flaky and the node is not safe to merge.
+    total_runs = cfg.flake_runs
+    for attempt in range(2, total_runs + 1):
+        rerun_code, rerun_out = _run(node.test_cmd, worktree, limit)
+        if rerun_code != 0:
+            return GateResult(
+                False,
+                ["flaky tests"],
+                f"Tests passed once but FAILED on re-run {attempt}/{total_runs} — "
+                f"the suite is flaky and the node cannot merge.\n$ {node.test_cmd}\n{rerun_out}",
+            )
+    return GateResult(True)
+def integration_gate(repo: Path, cfg: Config) -> GateResult:
+    """Repo-wide gate: run the configured test, lint and typecheck commands in `repo`.
+
+    A gate whose command is missing or blank is skipped. All configured gates
+    run even after one fails; the result lists every failed gate name along
+    with the last 2000 characters of each failing command's output.
+    """
+    limit = cfg.node_timeout
+    failed: list[str] = []
+    logs: list[str] = []
+    for gate in ("test", "lint", "typecheck"):
+        command = cfg.gates.get(gate, "").strip()
+        if command:
+            code, output = _run(command, repo, limit)
+            if code != 0:
+                failed.append(gate)
+                logs.append(f"$ {command}\n{output[-2000:]}")
+    return GateResult(not failed, failed, "\n".join(logs))