director-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- director/README.md +124 -0
- director/__init__.py +10 -0
- director/__main__.py +4 -0
- director/agent_templates/brainstorm.md +44 -0
- director/agent_templates/executor.md +37 -0
- director/agent_templates/explorer.md +24 -0
- director/agent_templates/opencode.json +39 -0
- director/agent_templates/planner.md +60 -0
- director/agent_templates/reviewer.md +46 -0
- director/agent_templates/test-author.md +29 -0
- director/bench.py +234 -0
- director/cli.py +166 -0
- director/config.example.toml +75 -0
- director/config.py +111 -0
- director/cost.py +84 -0
- director/dag.py +113 -0
- director/gates.py +145 -0
- director/gitutil.py +83 -0
- director/metrics.py +48 -0
- director/models.py +106 -0
- director/opencode.py +231 -0
- director/plan.py +523 -0
- director/report.py +103 -0
- director/review.py +153 -0
- director/run.py +444 -0
- director/setup.py +101 -0
- director/state.py +43 -0
- director_cli-0.3.0.dist-info/METADATA +174 -0
- director_cli-0.3.0.dist-info/RECORD +32 -0
- director_cli-0.3.0.dist-info/WHEEL +4 -0
- director_cli-0.3.0.dist-info/entry_points.txt +2 -0
- director_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
director/gitutil.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Thin git helpers. Director uses real git branches + worktrees for isolation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def git(args: list[str], cwd: str | Path, check: bool = True) -> subprocess.CompletedProcess:
    """Run ``git <args>`` inside `cwd` and hand back the completed process.

    stdout and stderr are captured and decoded as text. When `check` is true a
    nonzero exit status raises ``subprocess.CalledProcessError``; when false
    the caller is expected to inspect ``returncode`` itself.
    """
    command = ["git"]
    command.extend(args)
    return subprocess.run(command, cwd=str(cwd), check=check, text=True, capture_output=True)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def current_commit(cwd: str | Path) -> str:
    """Return what ``git rev-parse HEAD`` prints in `cwd`, whitespace-trimmed."""
    result = git(["rev-parse", "HEAD"], cwd)
    return result.stdout.strip()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def current_branch(cwd: str | Path) -> str:
    """Return what ``git rev-parse --abbrev-ref HEAD`` prints in `cwd`, trimmed."""
    result = git(["rev-parse", "--abbrev-ref", "HEAD"], cwd)
    return result.stdout.strip()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def branch_exists(name: str, cwd: str | Path) -> bool:
    """Report whether git can resolve `name` in the repository at `cwd`.

    NOTE(review): the check is ``git rev-parse --verify --quiet <name>``, so
    anything rev-parse can resolve makes this return True — presumably callers
    only ever pass branch names; confirm before relying on it as a strict
    branch test.
    """
    probe = git(["rev-parse", "--verify", "--quiet", name], cwd, check=False)
    return probe.returncode == 0
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def create_branch(name: str, cwd: str | Path, base: str | None = None) -> None:
    """Create branch `name` via ``git branch``, starting at `base` when given.

    A falsy `base` (``None`` or ``""``) is omitted from the command line.
    """
    command = ["branch", name]
    if base:
        command.append(base)
    git(command, cwd)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def checkout(name: str, cwd: str | Path) -> None:
    """Run ``git checkout <name>`` in `cwd`; a git failure raises via `git`."""
    command = ["checkout", name]
    git(command, cwd)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def worktree_add(path: str | Path, branch: str, base: str, cwd: str | Path) -> None:
    """Add a worktree at `path` holding a new branch `branch` cut from `base`."""
    target = str(path)
    git(["worktree", "add", "-b", branch, target, base], cwd)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def worktree_remove(path: str | Path, cwd: str | Path) -> None:
    """Force-remove the worktree at `path`.

    Best-effort: the command runs with ``check=False``, so a failing removal
    is ignored rather than raised.
    """
    target = str(path)
    git(["worktree", "remove", "--force", target], cwd, check=False)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def changed_paths(cwd: str | Path) -> list[str]:
    """All paths modified/added/deleted vs HEAD, including untracked (ignored
    files excluded). Used to enforce the file allowlist.

    Uses the NUL-delimited ``-z`` porcelain format so paths come back verbatim.
    The newline format C-quotes and octal-escapes names containing spaces,
    quotes or non-ASCII characters, which would make such paths impossible to
    match against an allowlist, and it is ambiguous for names that contain
    ``" -> "``.

    For a rename/copy entry only the destination path is reported.
    """
    out = git(["status", "--porcelain", "-z", "--untracked-files=all"], cwd).stdout
    entries = iter(out.split("\0"))
    paths: list[str] = []
    for entry in entries:
        # Every record is "XY <path>"; anything shorter is the empty tail
        # left after the final NUL terminator.
        if len(entry) < 4:
            continue
        status = entry[:2]
        paths.append(entry[3:])
        if "R" in status or "C" in status:
            # In -z mode a rename/copy is emitted as "XY <to>\0<from>\0";
            # consume the trailing source path so it is not parsed as a record.
            next(entries, None)
    return paths
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def commit_all(message: str, cwd: str | Path) -> bool:
    """Stage every change in `cwd` and commit it with `message`.

    Returns True when a commit was created and False when, after staging,
    ``git status --porcelain`` reports nothing to commit. The commit is made
    with ``commit.gpgsign=false`` so headless runs never block on a passphrase.
    """
    git(["add", "-A"], cwd)
    pending = git(["status", "--porcelain"], cwd).stdout.strip()
    if pending:
        git(["-c", "commit.gpgsign=false", "commit", "-q", "-m", message], cwd)
        return True
    return False
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def merge_branch(
    branch: str, cwd: str | Path, message: str | None = None
) -> subprocess.CompletedProcess:
    """Merge `branch` into whatever is checked out in `cwd`.

    Always a ``--no-ff`` merge with signing disabled. `message`, when truthy,
    is passed as the merge commit message. The command runs with
    ``check=False``; the completed process is returned so the caller can
    inspect ``returncode`` and the captured output.
    """
    message_args = ["-m", message] if message else []
    command = ["-c", "commit.gpgsign=false", "merge", "--no-ff", *message_args, branch]
    return git(command, cwd, check=False)
|
director/metrics.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Metrics stream (Phase 3) — `.director/metrics.jsonl`.
|
|
2
|
+
|
|
3
|
+
The hypothesis is falsifiable, so every run must be measurable. This is an
|
|
4
|
+
append-only NDJSON stream: one `kind:"node"` record per finished node and one
|
|
5
|
+
`kind:"run"` summary record at the end. It is written alongside the cost ledger
|
|
6
|
+
(`costs.jsonl`) and run state (`state.json`), and is what `director bench` and any
|
|
7
|
+
external analysis read to compare profiles.
|
|
8
|
+
|
|
9
|
+
Keeping metrics in their own stream (rather than overloading the cost ledger)
|
|
10
|
+
means the cost story stays a pure per-call ledger while metrics carry the derived
|
|
11
|
+
rates (escalation, stage-two trigger, watch-it-fail) and wall time.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import time
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MetricsWriter:
    """Appends metrics records, one JSON object per line, to a file."""

    def __init__(self, path: Path):
        # Accept str or Path; normalise once so `write` can rely on Path API.
        self.path = Path(path)

    def write(self, record: dict) -> None:
        """Append `record` as one JSON line, stamped with the current time.

        A ``"ts"`` key (``time.time()``) is placed first; a ``"ts"`` supplied
        in `record` overrides it. The parent directory is created on demand.
        """
        stamped = {"ts": time.time()}
        stamped.update(record)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        with self.path.open("a") as stream:
            stream.write(json.dumps(stamped) + "\n")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def read_records(path: Path) -> list[dict]:
    """Parse every non-blank line of the JSONL file at `path`, in file order.

    Returns an empty list when the file does not exist. A line that is not
    valid JSON propagates the ``json`` decoding error.
    """
    source = Path(path)
    if not source.exists():
        return []
    return [json.loads(row) for row in source.read_text().splitlines() if row.strip()]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def latest_run(path: Path) -> dict | None:
    """Return the last record in `path` whose ``kind`` is ``"run"``, else None."""
    newest = None
    for record in read_records(path):
        if record.get("kind") == "run":
            newest = record
    return newest
|
director/models.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Core data structures: the task DAG (Plan/Node) and per-node run State."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import asdict, dataclass, field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Node:
    """One atomic unit of work. `spec` must be self-contained — readable by the
    executor with zero other context."""

    id: str
    title: str
    spec: str
    files: list[str]  # allowlist: the ONLY files the executor may modify
    depends_on: list[str] = field(default_factory=list)
    test_cmd: str = ""  # command that gates this node (nonzero = fail)
    tests: list[str] = field(default_factory=list)  # test file paths (test-author writes these)
    estimated_difficulty: str = "medium"  # easy | medium | hard
    # sha256 of each test file, captured by director once tests are authored (NOT
    # emitted by the planner). The node gate refuses to pass if a test file's hash
    # changed — the executor may not edit the contract. See gates.test_files_intact.
    test_hashes: dict = field(default_factory=dict)  # {test_path: sha256}

    @staticmethod
    def _as_list(value) -> list:
        """Coerce a planner-supplied value into a list.

        Falsy values (``None``, ``""``, ``[]``) become ``[]``; a bare string
        becomes a one-element list instead of being split into characters;
        any other iterable is copied with ``list``.
        """
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def from_dict(d: dict) -> Node:
        """Build a Node from a plan dict, tolerating planner field-name drift.

        Accepted aliases: ``spec``/``description``/``desc``;
        ``files``/``files_to_modify``/``file_allowlist``;
        ``tests``/``test_files``, falling back to ``test_file``/``test``.
        List-valued fields may be given as a single string, and explicit
        ``null`` values fall back to the field default.

        Raises:
            KeyError: if no spec/description is present, or ``id`` is missing.
        """
        # Tolerate common field-name drift from different planner models.
        spec = d.get("spec") or d.get("description") or d.get("desc")
        if spec is None:
            raise KeyError(f"node {d.get('id')!r} has no spec/description")

        files = d.get("files") or d.get("files_to_modify") or d.get("file_allowlist")
        tests = d.get("tests") or d.get("test_files")
        if tests is None:
            tests = d.get("test_file") or d.get("test")

        node_id = str(d["id"])
        title = d.get("title")
        if title is None:
            title = node_id

        return Node(
            id=node_id,
            title=title,
            spec=spec,
            files=Node._as_list(files),
            depends_on=[str(x) for x in Node._as_list(d.get("depends_on"))],
            test_cmd=d.get("test_cmd") or "",
            tests=Node._as_list(tests),
            estimated_difficulty=d.get("estimated_difficulty") or "medium",
            test_hashes=dict(d.get("test_hashes") or {}),
        )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class Plan:
    """A job's task DAG: identifying metadata plus the ordered list of nodes."""

    job_id: str
    task: str
    repo: str
    created_at: str
    job_branch: str
    nodes: list[Node] = field(default_factory=list)

    def node(self, node_id: str) -> Node:
        """Return the first node whose ``id`` equals `node_id`.

        Raises:
            KeyError: if no node has that id.
        """
        found = next((candidate for candidate in self.nodes if candidate.id == node_id), None)
        if found is None:
            raise KeyError(node_id)
        return found

    def to_json(self) -> str:
        """Serialise the plan (nodes included) as 2-space-indented JSON."""
        return json.dumps(asdict(self), indent=2)

    @staticmethod
    def from_json(text: str) -> Plan:
        """Rebuild a Plan from JSON text; every node goes through `Node.from_dict`.

        Raises:
            KeyError: if a required top-level key is missing.
        """
        raw = json.loads(text)
        # Look the scalar keys up in declaration order so the first missing
        # one is the key reported by the KeyError.
        scalars = {
            key: raw[key] for key in ("job_id", "task", "repo", "created_at", "job_branch")
        }
        nodes = [Node.from_dict(item) for item in raw["nodes"]]
        return Plan(nodes=nodes, **scalars)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Node lifecycle statuses persisted in .director/state.json (resumable).
PENDING = "pending"
RUNNING = "running"
DONE = "done"
ESCALATED = "escalated"
FAILED = "failed"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class NodeState:
    """Per-node run state: lifecycle status plus usage and review bookkeeping."""

    id: str
    status: str = PENDING
    # executor-tier attempts used
    attempts: int = 0
    # "executor" | "escalation"
    tier_used: str | None = None
    model_used: str | None = None
    escalated: bool = False
    tokens: dict = field(default_factory=lambda: dict(input=0, output=0))
    cost_usd: float = 0.0
    error: str | None = None
    worktree: str | None = None

    # --- Phase 2.5 two-stage review ---
    # did the conditional code-quality review run?
    review_stage_two: bool = False
    # number of attempts re-opened by a critical finding
    review_blocks: int = 0
    # reviewer's last one-line verdict summary
    review_summary: str | None = None

    # --- Phase 3 measurement ---
    # wall time for the node
    wall_secs: float = 0.0
    # "observed" | "not_observed" | "unknown"
    watch_it_fail: str | None = None
    # a flake re-run failed this node on some attempt
    flake_failed: bool = False
|
director/opencode.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Headless OpenCode driver.
|
|
2
|
+
|
|
3
|
+
Wraps `opencode run --agent <role> --model <provider/model> --format json` and
|
|
4
|
+
parses the NDJSON event stream into a structured result (assistant text + token
|
|
5
|
+
usage + tool activity). This is the ONLY place that shells out to OpenCode.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import subprocess
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
# Director-spawned processes (and the test commands they run) must not litter the
|
|
17
|
+
# worktree with Python bytecode: a stray `__pycache__/*.pyc` would otherwise get
|
|
18
|
+
# `git add -A`-ed into a node commit, poison later merges, and inflate the
|
|
19
|
+
# changed-file count. Suppressing it at the source keeps every worktree clean.
|
|
20
|
+
_CLEAN_ENV = {**os.environ, "PYTHONDONTWRITEBYTECODE": "1"}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class RunResult:
    """Structured outcome of one headless OpenCode invocation."""

    returncode: int
    # concatenated assistant text (e.g. planner JSON)
    text: str
    # summed across steps: {input, output, reasoning, total}
    tokens: dict
    # OpenCode's own cost sum (cross-check; often 0 locally)
    cost_reported: float
    n_steps: int
    # (tool, status) pairs
    tool_calls: list[tuple[str, str]] = field(default_factory=list)
    # ordered {name, status, blob} dicts
    tool_events: list[dict] = field(default_factory=list)
    error: str | None = None
    timed_out: bool = False
    log_path: str = ""

    @property
    def ok(self) -> bool:
        """True only for exit status 0 with no error recorded and no timeout."""
        if self.returncode != 0:
            return False
        if self.error is not None:
            return False
        return not self.timed_out
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def run_agent(
    *,
    agent: str,
    model: str,
    message: str,
    cwd: str | Path,
    log_path: str | Path,
    timeout: int,
) -> RunResult:
    """Run one OpenCode agent headlessly in `cwd` and parse its event stream.

    The process's stdout (NDJSON events) is written to `log_path` and its
    stderr (OpenCode logs) to `log_path + '.stderr'`. If the process outlives
    `timeout` seconds it is killed and the result carries ``timed_out=True``
    with return code 124. Never raises on a model/agent failure — inspect
    RunResult.ok / .error / .timed_out.
    """
    log_path = Path(log_path)
    log_path.parent.mkdir(parents=True, exist_ok=True)
    err_path = log_path.with_suffix(log_path.suffix + ".stderr")
    workdir = str(cwd)

    # --dir pins the project/worktree explicitly. Without it, `opencode run`
    # resolves the project root by walking up for a .git and can land on an
    # ENCLOSING repo (so edits leak out of an isolated worktree). Worktrees are
    # also placed outside the repo tree (see run.py) to make this airtight.
    cmd = ["opencode", "run", "--dir", workdir]
    cmd += ["--agent", agent, "--model", model]
    cmd += ["--format", "json", "--print-logs", message]

    with open(log_path, "wb") as out:
        with open(err_path, "wb") as err:
            proc = subprocess.Popen(cmd, cwd=workdir, stdout=out, stderr=err, env=_CLEAN_ENV)
            try:
                rc = proc.wait(timeout=timeout)
            except subprocess.TimeoutExpired:
                # Kill, then reap so no zombie is left behind.
                proc.kill()
                proc.wait()
                rc = 124
                timed_out = True
            else:
                timed_out = False

    return _parse(log_path, rc, timed_out)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _parse(log_path: Path, rc: int, timed_out: bool) -> RunResult:
    """Fold the NDJSON event log at `log_path` into a RunResult.

    Lines that do not start with ``{`` or fail to parse as JSON are skipped.
    Recognised event types:

    * ``text`` — assistant text, concatenated in order;
    * ``step_finish`` / ``step-finish`` — token counts and cost, summed;
    * ``tool`` / ``tool_use`` — recorded as a ``(tool, status)`` pair plus a
      capped, lower-cased JSON blob of the whole part;
    * any type containing ``error`` — the last one seen becomes ``error``.

    Event fields with an unexpected shape (non-dict ``part``/``tokens``/
    ``state``, null ``text``, non-string ``type``) are treated as absent
    rather than raising, so a malformed event cannot crash the driver.
    """
    text_parts: list[str] = []
    tokens = {"input": 0, "output": 0, "reasoning": 0, "total": 0}
    cost_reported = 0.0
    n_steps = 0
    tool_calls: list[tuple[str, str]] = []
    tool_events: list[dict] = []
    error: str | None = None

    # The event stream is JSON, i.e. UTF-8; decode it explicitly instead of
    # relying on the platform's default encoding.
    for line in log_path.read_text(encoding="utf-8", errors="replace").splitlines():
        line = line.strip()
        if not line.startswith("{"):
            continue
        try:
            evt = json.loads(line)
        except json.JSONDecodeError:
            continue
        etype = evt.get("type")
        part = evt.get("part")
        if not isinstance(part, dict):
            part = {}

        if etype == "text":
            # A null/non-string text would otherwise break the final join.
            text_parts.append(str(part.get("text") or ""))
        elif etype in ("step_finish", "step-finish"):
            n_steps += 1
            tk = part.get("tokens")
            if not isinstance(tk, dict):
                tk = {}
            for k in ("input", "output", "reasoning", "total"):
                tokens[k] += int(tk.get(k, 0) or 0)
            cost_reported += float(part.get("cost", 0) or 0)
        elif etype in ("tool", "tool_use"):
            name = part.get("tool", "?")
            state = part.get("state")
            status = state.get("status", "?") if isinstance(state, dict) else "?"
            tool_calls.append((name, status))
            # keep an ordered, capped content blob per tool event so the
            # watch-it-fail analyzer can tell a test run from a file edit and spot
            # a failure in the command output (shape-tolerant: just stringify part).
            blob = json.dumps(part, default=str)[:2000].lower()
            tool_events.append({"name": str(name).lower(), "status": str(status), "blob": blob})
        elif isinstance(etype, str) and "error" in etype:
            # error events may be at top level (evt["error"]) or in part
            msg = evt.get("error") or part.get("error") or part.get("message") or etype
            error = str(msg)

    # Some streams never report a total; derive it from input + output.
    if tokens["total"] == 0:
        tokens["total"] = tokens["input"] + tokens["output"]

    return RunResult(
        returncode=rc,
        text="".join(text_parts).strip(),
        tokens=tokens,
        cost_reported=cost_reported,
        n_steps=n_steps,
        tool_calls=tool_calls,
        tool_events=tool_events,
        error=error,
        timed_out=timed_out,
        log_path=str(log_path),
    )
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# --------------------------------------------------------------------------- #
# watch-it-fail transcript verification (Phase 3)
# --------------------------------------------------------------------------- #
# All of these are substring fragments, not exact names: OpenCode tool names
# vary by provider/model, so matching is done with `in`.

# Fragments that mark a tool as a file-editing tool.
_EDIT_TOOLS = (
    "edit",
    "write",
    "patch",
    "create",
    "apply",
    "str_replace",
)
# Fragments that mark a tool as a shell/command runner.
_SHELL_TOOLS = (
    "bash",
    "shell",
    "run",
    "execute",
    "terminal",
    "command",
)
# Fragments that mark a tool event's blob as a test run.
_TEST_SIGNALS = (
    "pytest",
    "unittest",
    "test",
    "vitest",
    "jest",
    "go test",
    "cargo test",
)
# Fragments that mark a tool event's blob as showing a failure.
_FAIL_SIGNALS = ("failed", "error", "traceback", "assertion", "exit code 1") + (
    "non-zero",
    "fail (",
    " failures=",
    "no tests ran",
)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@dataclass
class WatchItFail:
    """Result of the watch-it-fail transcript check."""

    # "observed" | "not_observed" | "unknown"
    verdict: str
    ran_before_edit: bool
    observed_failure: bool
    detail: str

    @property
    def observed(self) -> bool:
        """Shorthand for ``verdict == "observed"``."""
        verdict = self.verdict
        return verdict == "observed"
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _is_edit(name: str) -> bool:
    """True when `name` contains any of the `_EDIT_TOOLS` fragments."""
    for fragment in _EDIT_TOOLS:
        if fragment in name:
            return True
    return False
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _is_shell(name: str) -> bool:
    """True when `name` contains any of the `_SHELL_TOOLS` fragments."""
    for fragment in _SHELL_TOOLS:
        if fragment in name:
            return True
    return False
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _looks_like_test(blob: str, test_cmd: str) -> bool:
    """Decide whether a tool event's `blob` looks like a test invocation.

    True when `blob` contains any `_TEST_SIGNALS` fragment, or when it
    contains a distinctive word of `test_cmd`: the command is lower-cased and
    split on whitespace, ``/`` and ``.``; words of three characters or fewer
    and generic launcher words are ignored.
    """
    for signal in _TEST_SIGNALS:
        if signal in blob:
            return True
    # also match a distinctive token from the configured test command
    ignored = ("python", "python3", "-m", "discover", "&&")
    words = test_cmd.lower().replace("/", " ").replace(".", " ").split()
    return any(len(word) > 3 and word not in ignored and word in blob for word in words)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def watch_it_fail(events: list[dict], test_cmd: str) -> WatchItFail:
    """Did the executor run the (failing) tests BEFORE its first edit? (Phase 3 §1)

    Walks the ordered tool events and stops at the first decisive one:

    * a shell tool whose blob looks like a test run → ``"observed"``
      (``observed_failure`` records whether a failure signal was in the blob);
    * an edit tool → ``"not_observed"``.

    Returns ``"unknown"`` when `events` is empty or nothing decisive is found,
    so a transcript without usable tool activity can't be held against the
    node. Advisory only — the deterministic node gate already enforces
    tests-pass and contract-intact; this verifies the *discipline* (red before
    green) from the transcript, surfaced as a metric.
    """
    if not events:
        return WatchItFail("unknown", False, False, "no tool activity in transcript")

    for event in events:
        tool = event.get("name", "")
        payload = event.get("blob", "")
        if _is_shell(tool) and _looks_like_test(payload, test_cmd):
            failed = any(signal in payload for signal in _FAIL_SIGNALS)
            detail = "ran tests before first edit"
            if failed:
                detail += " (saw failure)"
            return WatchItFail("observed", True, failed, detail)
        if _is_edit(tool):
            # an edit happened before any test run → watch-it-fail not honored
            return WatchItFail("not_observed", False, False, "edited before running any tests")

    # tools ran but none matched a test command or an edit
    return WatchItFail(
        "unknown", False, False, "no test run or edit detected among tool calls"
    )
|