director-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
director/gitutil.py ADDED
@@ -0,0 +1,83 @@
1
+ """Thin git helpers. Director uses real git branches + worktrees for isolation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+
9
+ def git(args: list[str], cwd: str | Path, check: bool = True) -> subprocess.CompletedProcess:
10
+ return subprocess.run(
11
+ ["git", *args],
12
+ cwd=str(cwd),
13
+ capture_output=True,
14
+ text=True,
15
+ check=check,
16
+ )
17
+
18
+
19
def current_commit(cwd: str | Path) -> str:
    """Return what `git rev-parse HEAD` prints in `cwd`, whitespace-stripped."""
    head = git(["rev-parse", "HEAD"], cwd)
    return head.stdout.strip()
21
+
22
+
23
def current_branch(cwd: str | Path) -> str:
    """Return the abbreviated ref name of HEAD in `cwd`, whitespace-stripped.

    This is the raw output of `git rev-parse --abbrev-ref HEAD`.
    """
    head = git(["rev-parse", "--abbrev-ref", "HEAD"], cwd)
    return head.stdout.strip()
25
+
26
+
27
def branch_exists(name: str, cwd: str | Path) -> bool:
    """Report whether `name` resolves in the repository at `cwd`.

    The probe runs with `check=False`, so a missing name yields False rather
    than an exception.

    NOTE(review): `rev-parse --verify` accepts any revision (tag, SHA, `HEAD`),
    not only branch names -- confirm callers only ever pass branch names.
    """
    probe = git(["rev-parse", "--verify", "--quiet", name], cwd, check=False)
    return probe.returncode == 0
29
+
30
+
31
def create_branch(name: str, cwd: str | Path, base: str | None = None) -> None:
    """Create branch `name` in `cwd` without checking it out.

    A truthy `base` is passed to `git branch` as the start point; otherwise no
    start point is given and git picks its default.
    """
    branch_args = ["branch", name]
    if base:
        branch_args.append(base)
    git(branch_args, cwd)
34
+
35
+
36
def checkout(name: str, cwd: str | Path) -> None:
    """Run `git checkout <name>` in `cwd`; a git failure raises."""
    checkout_args = ["checkout", name]
    git(checkout_args, cwd)
38
+
39
+
40
def worktree_add(path: str | Path, branch: str, base: str, cwd: str | Path) -> None:
    """Add a worktree at `path` holding a new branch `branch` started from `base`.

    Runs `git worktree add -b <branch> <path> <base>` in `cwd`; a git failure
    raises.
    """
    target = str(path)
    git(["worktree", "add", "-b", branch, target, base], cwd)
43
+
44
+
45
def worktree_remove(path: str | Path, cwd: str | Path) -> None:
    """Force-remove the worktree at `path`, best effort.

    The command runs with `check=False`, so a failing removal is ignored
    rather than raised.
    """
    target = str(path)
    git(["worktree", "remove", "--force", target], cwd, check=False)
47
+
48
+
49
+ def changed_paths(cwd: str | Path) -> list[str]:
50
+ """All paths modified/added/deleted vs HEAD, including untracked (ignored
51
+ files excluded). Used to enforce the file allowlist."""
52
+ out = git(["status", "--porcelain", "--untracked-files=all"], cwd).stdout
53
+ paths: list[str] = []
54
+ for line in out.splitlines():
55
+ if not line.strip():
56
+ continue
57
+ # format: "XY <path>" or rename "XY old -> new"
58
+ p = line[3:]
59
+ if " -> " in p:
60
+ p = p.split(" -> ", 1)[1]
61
+ paths.append(p.strip().strip('"'))
62
+ return paths
63
+
64
+
65
def commit_all(message: str, cwd: str | Path) -> bool:
    """Stage every change in `cwd` and commit it with `message`.

    Returns True when a commit was created and False when, after staging,
    `git status --porcelain` reports nothing. The commit is made with
    `commit.gpgsign=false` so a headless run never waits on a signing
    passphrase.
    """
    git(["add", "-A"], cwd)
    pending = git(["status", "--porcelain"], cwd).stdout
    if pending.strip():
        git(["-c", "commit.gpgsign=false", "commit", "-q", "-m", message], cwd)
        return True
    return False
73
+
74
+
75
def merge_branch(
    branch: str, cwd: str | Path, message: str | None = None
) -> subprocess.CompletedProcess:
    """Merge `branch` into whatever is checked out in `cwd`.

    The merge is always `--no-ff` and unsigned; a truthy `message` is passed
    with `-m`. The command runs with `check=False`, so the caller must inspect
    the returned process (`returncode`, `stdout`, `stderr`) to detect a failed
    merge.
    """
    message_args = ["-m", message] if message else []
    merge_args = ["-c", "commit.gpgsign=false", "merge", "--no-ff", *message_args, branch]
    return git(merge_args, cwd, check=False)
director/metrics.py ADDED
@@ -0,0 +1,48 @@
1
+ """Metrics stream (Phase 3) — `.director/metrics.jsonl`.
2
+
3
+ The hypothesis is falsifiable, so every run must be measurable. This is an
4
+ append-only NDJSON stream: one `kind:"node"` record per finished node and one
5
+ `kind:"run"` summary record at the end. It is written alongside the cost ledger
6
+ (`costs.jsonl`) and run state (`state.json`), and is what `director bench` and any
7
+ external analysis read to compare profiles.
8
+
9
+ Keeping metrics in their own stream (rather than overloading the cost ledger)
10
+ means the cost story stays a pure per-call ledger while metrics carry the derived
11
+ rates (escalation, stage-two trigger, watch-it-fail) and wall time.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import time
18
+ from pathlib import Path
19
+
20
+
21
class MetricsWriter:
    """Appends metrics records, one JSON object per line, to a file."""

    def __init__(self, path: Path):
        # Accept str or Path; normalise once so `write` can use Path methods.
        self.path = Path(path)

    def write(self, record: dict) -> None:
        """Append `record` as one JSON line, creating parent directories as needed.

        A `ts` key holding the current `time.time()` is placed first; a `ts`
        key already present in `record` overrides that value.
        """
        stamped = {"ts": time.time()}
        stamped.update(record)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        with self.path.open("a") as stream:
            stream.write(json.dumps(stamped) + "\n")
32
+
33
+
34
def read_records(path: Path) -> list[dict]:
    """Return every record in the metrics file at `path`, in file order.

    A missing file yields an empty list and blank lines are skipped. A line
    that is not valid JSON raises `json.JSONDecodeError`.
    """
    source = Path(path)
    if not source.exists():
        return []
    return [json.loads(raw) for raw in source.read_text().splitlines() if raw.strip()]
43
+
44
+
45
def latest_run(path: Path) -> dict | None:
    """Return the last record in `path` whose `kind` is "run", or None if there is none."""
    newest = None
    for record in read_records(path):
        if record.get("kind") == "run":
            newest = record
    return newest
director/models.py ADDED
@@ -0,0 +1,106 @@
1
+ """Core data structures: the task DAG (Plan/Node) and per-node run State."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import asdict, dataclass, field
7
+
8
+
9
@dataclass
class Node:
    """One atomic unit of work. `spec` must be self-contained — readable by the
    executor with zero other context."""

    id: str
    title: str
    spec: str
    files: list[str]  # allowlist: the ONLY files the executor may modify
    depends_on: list[str] = field(default_factory=list)
    test_cmd: str = ""  # command that gates this node (nonzero = fail)
    tests: list[str] = field(default_factory=list)  # test file paths (test-author writes these)
    estimated_difficulty: str = "medium"  # easy | medium | hard
    # sha256 of each test file, captured by director once tests are authored (NOT
    # emitted by the planner). The node gate refuses to pass if a test file's hash
    # changed — the executor may not edit the contract. See gates.test_files_intact.
    test_hashes: dict = field(default_factory=dict)  # {test_path: sha256}

    @staticmethod
    def _as_list(value) -> list:
        """Coerce a planner value to a list: None -> [], a bare string -> [string]."""
        if value is None:
            return []
        if isinstance(value, str):
            # list("a.py") would explode the string into single characters.
            return [value]
        return list(value)

    @staticmethod
    def from_dict(d: dict) -> "Node":
        """Build a Node from a planner-emitted (or previously serialized) dict.

        Tolerates common field-name drift from different planner models:
        `spec`/`description`/`desc`, `files`/`files_to_modify`/`file_allowlist`,
        and `tests`/`test_files`/`test_file`/`test`. List-valued fields may also
        be given as a single bare string, and `depends_on`, `test_hashes` and
        `test_cmd` may be JSON null; both are normalised instead of producing
        per-character lists or crashing.

        Raises:
            KeyError: if no spec/description is present, or `id` is missing.
        """
        spec = d.get("spec") or d.get("description") or d.get("desc")
        files = d.get("files") or d.get("files_to_modify") or d.get("file_allowlist") or []
        tests = d.get("tests") or d.get("test_files")
        if tests is None:
            # Only fall back to the singular spellings when neither plural key
            # produced a value at all.
            tests = d.get("test_file") or d.get("test")
        if spec is None:
            raise KeyError(f"node {d.get('id')!r} has no spec/description")
        return Node(
            id=str(d["id"]),
            title=d.get("title", str(d["id"])),
            spec=spec,
            files=Node._as_list(files),
            depends_on=[str(x) for x in Node._as_list(d.get("depends_on"))],
            test_cmd=d.get("test_cmd") or "",
            tests=Node._as_list(tests),
            estimated_difficulty=d.get("estimated_difficulty", "medium"),
            test_hashes=dict(d.get("test_hashes") or {}),
        )
49
+
50
+
51
@dataclass
class Plan:
    """A job's task DAG: identifying metadata plus the list of nodes."""

    job_id: str
    task: str
    repo: str
    created_at: str
    job_branch: str
    nodes: "list[Node]" = field(default_factory=list)

    def node(self, node_id: str) -> "Node":
        """Return the first node whose id equals `node_id`; raise KeyError if none does."""
        found = next((candidate for candidate in self.nodes if candidate.id == node_id), None)
        if found is None:
            raise KeyError(node_id)
        return found

    def to_json(self) -> str:
        """Serialize the plan, nodes included, as JSON indented by two spaces."""
        return json.dumps(asdict(self), indent=2)

    @staticmethod
    def from_json(text: str) -> "Plan":
        """Rebuild a Plan from JSON text; every top-level key is required."""
        data = json.loads(text)
        # Scalars are read first, in field order, then the nodes are parsed.
        scalar_keys = ("job_id", "task", "repo", "created_at", "job_branch")
        scalars = {key: data[key] for key in scalar_keys}
        return Plan(**scalars, nodes=[Node.from_dict(raw) for raw in data["nodes"]])
81
+
82
+
83
# Node lifecycle statuses. These exact strings are what gets persisted in
# .director/state.json (resumable).
PENDING = "pending"
RUNNING = "running"
DONE = "done"
ESCALATED = "escalated"
FAILED = "failed"
85
+
86
+
87
@dataclass
class NodeState:
    """Run state tracked for a single node; only `id` is required."""

    id: str
    status: str = PENDING
    # executor-tier attempts used
    attempts: int = 0
    # "executor" | "escalation"
    tier_used: str | None = None
    model_used: str | None = None
    escalated: bool = False
    # a fresh dict per instance so states never share token counters
    tokens: dict = field(default_factory=lambda: {"input": 0, "output": 0})
    cost_usd: float = 0.0
    error: str | None = None
    worktree: str | None = None

    # --- Phase 2.5 two-stage review ---
    # did the conditional code-quality review run?
    review_stage_two: bool = False
    # number of attempts re-opened by a critical finding
    review_blocks: int = 0
    # reviewer's last one-line verdict summary
    review_summary: str | None = None

    # --- Phase 3 measurement ---
    # wall time for the node
    wall_secs: float = 0.0
    # "observed" | "not_observed" | "unknown"
    watch_it_fail: str | None = None
    # a flake re-run failed this node on some attempt
    flake_failed: bool = False
director/opencode.py ADDED
@@ -0,0 +1,231 @@
1
+ """Headless OpenCode driver.
2
+
3
+ Wraps `opencode run --agent <role> --model <provider/model> --format json` and
4
+ parses the NDJSON event stream into a structured result (assistant text + token
5
+ usage + tool activity). This is the ONLY place that shells out to OpenCode.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import subprocess
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+
16
# Environment handed to Director-spawned processes (and so to the test commands
# they run). Python bytecode must not litter the worktree: a stray
# `__pycache__/*.pyc` would otherwise get `git add -A`-ed into a node commit,
# poison later merges, and inflate the changed-file count. Suppressing it at the
# source keeps every worktree clean. This is a snapshot of os.environ taken at
# import time.
_CLEAN_ENV = dict(os.environ)
_CLEAN_ENV["PYTHONDONTWRITEBYTECODE"] = "1"
21
+
22
+
23
+ @dataclass
24
+ class RunResult:
25
+ returncode: int
26
+ text: str # concatenated assistant text (e.g. planner JSON)
27
+ tokens: dict # summed across steps: {input, output, reasoning, total}
28
+ cost_reported: float # OpenCode's own cost sum (cross-check; often 0 locally)
29
+ n_steps: int
30
+ tool_calls: list[tuple[str, str]] = field(default_factory=list) # (tool, status)
31
+ tool_events: list[dict] = field(default_factory=list) # ordered {name,status,blob}
32
+ error: str | None = None
33
+ timed_out: bool = False
34
+ log_path: str = ""
35
+
36
+ @property
37
+ def ok(self) -> bool:
38
+ return self.returncode == 0 and self.error is None and not self.timed_out
39
+
40
+
41
def run_agent(
    *,
    agent: str,
    model: str,
    message: str,
    cwd: str | Path,
    log_path: str | Path,
    timeout: int,
) -> RunResult:
    """Run an OpenCode agent headlessly in `cwd` and parse what it produced.

    The process's stdout is written to `log_path` and its stderr to
    `log_path + '.stderr'`; parent directories are created as needed. When the
    process outlives `timeout` seconds it is killed and the result carries
    return code 124 with `timed_out=True`. A nonzero exit or a timeout is not
    raised -- inspect RunResult.ok / .error / .timed_out.
    """
    events_file = Path(log_path)
    events_file.parent.mkdir(parents=True, exist_ok=True)
    stderr_file = events_file.with_suffix(events_file.suffix + ".stderr")
    workdir = str(cwd)

    # --dir pins the project/worktree explicitly. Without it, `opencode run`
    # resolves the project root by walking up for a .git and can land on an
    # ENCLOSING repo (so edits leak out of an isolated worktree). Worktrees are
    # also placed outside the repo tree (see run.py) to make this airtight.
    argv = ["opencode", "run", "--dir", workdir]
    argv += ["--agent", agent, "--model", model]
    argv += ["--format", "json", "--print-logs", message]

    with open(events_file, "wb") as out, open(stderr_file, "wb") as err:
        child = subprocess.Popen(argv, cwd=workdir, stdout=out, stderr=err, env=_CLEAN_ENV)
        try:
            exit_code = child.wait(timeout=timeout)
            expired = False
        except subprocess.TimeoutExpired:
            # Kill, then reap the child so it does not linger as a zombie.
            child.kill()
            child.wait()
            exit_code = 124
            expired = True

    return _parse(events_file, exit_code, expired)
88
+
89
+
90
def _parse(log_path: Path, rc: int, timed_out: bool) -> RunResult:
    """Fold the NDJSON event log at `log_path` into a RunResult.

    Lines that are not JSON objects are skipped. `text` events are concatenated
    into the assistant text, `step_finish` events are counted and their token
    and cost figures summed, `tool` events are recorded in order, and any event
    whose type contains "error" sets `error` (the last one wins). `rc` and
    `timed_out` are passed through unchanged.
    """
    text_parts: list[str] = []
    tokens = {"input": 0, "output": 0, "reasoning": 0, "total": 0}
    cost_reported = 0.0
    n_steps = 0
    tool_calls: list[tuple[str, str]] = []
    tool_events: list[dict] = []
    error: str | None = None

    # The stream is decoded as UTF-8 explicitly rather than with the locale
    # encoding. It is split on "\n" only: str.splitlines() also breaks on
    # U+0085/U+2028/U+2029 (and a few more), which JSON may carry unescaped
    # inside a string, so an event containing one would be cut in half and lost.
    raw = log_path.read_text(encoding="utf-8", errors="replace")
    for line in raw.split("\n"):
        line = line.strip()
        if not line.startswith("{"):
            continue
        try:
            evt = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Normalise shapes up front so one odd event cannot abort the parse.
        etype = evt.get("type")
        if not isinstance(etype, str):
            etype = ""
        part = evt.get("part")
        if not isinstance(part, dict):
            part = {}

        if etype == "text":
            # A null/absent text contributes nothing instead of breaking join().
            text_parts.append(str(part.get("text") or ""))
        elif etype in ("step_finish", "step-finish"):
            n_steps += 1
            tk = part.get("tokens")
            if not isinstance(tk, dict):
                tk = {}
            for k in ("input", "output", "reasoning", "total"):
                tokens[k] += int(tk.get(k, 0) or 0)
            cost_reported += float(part.get("cost", 0) or 0)
        elif etype in ("tool", "tool_use"):
            name = part.get("tool", "?")
            state = part.get("state")
            status = state.get("status", "?") if isinstance(state, dict) else "?"
            tool_calls.append((name, status))
            # keep an ordered, capped content blob per tool event so the
            # watch-it-fail analyzer can tell a test run from a file edit and spot
            # a failure in the command output (shape-tolerant: just stringify part).
            blob = json.dumps(part, default=str)[:2000].lower()
            tool_events.append({"name": str(name).lower(), "status": str(status), "blob": blob})
        elif "error" in etype:
            # error events may be at top level (evt["error"]) or in part
            msg = evt.get("error") or part.get("error") or part.get("message") or etype
            error = str(msg)

    # No step reported a total: derive it from the input and output counts.
    if tokens["total"] == 0:
        tokens["total"] = tokens["input"] + tokens["output"]

    return RunResult(
        returncode=rc,
        text="".join(text_parts).strip(),
        tokens=tokens,
        cost_reported=cost_reported,
        n_steps=n_steps,
        tool_calls=tool_calls,
        tool_events=tool_events,
        error=error,
        timed_out=timed_out,
        log_path=str(log_path),
    )
147
+
148
+
149
# --------------------------------------------------------------------------- #
# watch-it-fail transcript verification (Phase 3)
# --------------------------------------------------------------------------- #
# All four tuples hold lowercase substrings, not exact names: OpenCode tool
# names vary by provider/model, so tools and command output are classified by
# fragment matching.

# fragments that mark a tool name as a file edit
_EDIT_TOOLS = (
    "edit",
    "write",
    "patch",
    "create",
    "apply",
    "str_replace",
)
# fragments that mark a tool name as a shell/command runner
_SHELL_TOOLS = (
    "bash",
    "shell",
    "run",
    "execute",
    "terminal",
    "command",
)
# fragments that mark a tool event's content as a test run
_TEST_SIGNALS = (
    "pytest",
    "unittest",
    "test",
    "vitest",
    "jest",
    "go test",
    "cargo test",
)
# fragments that mark a tool event's content as showing a failure
_FAIL_SIGNALS = (
    "failed",
    "error",
    "traceback",
    "assertion",
    "exit code 1",
    "non-zero",
    "fail (",
    " failures=",
    "no tests ran",
)
168
+
169
+
170
@dataclass
class WatchItFail:
    """Outcome of the watch-it-fail transcript check."""

    # "observed" | "not_observed" | "unknown"
    verdict: str
    ran_before_edit: bool
    observed_failure: bool
    detail: str

    @property
    def observed(self) -> bool:
        """True exactly when the verdict is "observed"."""
        is_observed = self.verdict == "observed"
        return is_observed
180
+
181
+
182
def _is_edit(name: str) -> bool:
    """Return True when tool name `name` looks like a file-editing tool.

    Matching is by substring against `_EDIT_TOOLS`. Todo-list tools are
    excluded first: OpenCode's `todowrite` contains the fragment "write" but
    only updates the agent's task list, and counting it as an edit would make a
    run that plans before testing look like it edited before testing.
    """
    if "todo" in name:
        return False
    return any(t in name for t in _EDIT_TOOLS)
184
+
185
+
186
def _is_shell(name: str) -> bool:
    """Return True when tool name `name` contains any `_SHELL_TOOLS` fragment."""
    for fragment in _SHELL_TOOLS:
        if fragment in name:
            return True
    return False
188
+
189
+
190
def _looks_like_test(blob: str, test_cmd: str) -> bool:
    """Decide whether a tool event's content `blob` looks like a test run.

    True when `blob` contains any `_TEST_SIGNALS` fragment, or any distinctive
    word of `test_cmd`: the command is lowercased and split on whitespace, "/"
    and ".", and a word counts when it is longer than three characters and is
    not one of the generic launcher words.
    """
    for signal in _TEST_SIGNALS:
        if signal in blob:
            return True
    # generic launcher words that say nothing about which tests are run
    ignored = ("python", "python3", "-m", "discover", "&&")
    words = test_cmd.lower().replace("/", " ").replace(".", " ").split()
    return any(len(word) > 3 and word not in ignored and word in blob for word in words)
199
+
200
+
201
def watch_it_fail(events: list[dict], test_cmd: str) -> WatchItFail:
    """Did the executor run the (failing) tests BEFORE its first edit? (Phase 3 §1)

    Events are scanned in order and the first decisive one settles the verdict:
    a shell event that looks like a test run gives "observed" (recording whether
    its content also carried a failure signal), while an edit event gives
    "not_observed". The verdict is "unknown" when there are no events at all or
    when none of them is decisive.
    """
    if not events:
        return WatchItFail("unknown", False, False, "no tool activity in transcript")

    for event in events:
        name = event.get("name", "")
        blob = event.get("blob", "")
        # The test-run check comes first, so an event matching both a shell and
        # an edit fragment counts as a test run.
        if _is_shell(name) and _looks_like_test(blob, test_cmd):
            failed = any(signal in blob for signal in _FAIL_SIGNALS)
            detail = "ran tests before first edit"
            if failed:
                detail += " (saw failure)"
            return WatchItFail("observed", True, failed, detail)
        if _is_edit(name):
            # an edit happened before any test run → watch-it-fail not honored
            return WatchItFail("not_observed", False, False, "edited before running any tests")

    # tools ran but none matched a test command or an edit
    return WatchItFail(
        "unknown", False, False, "no test run or edit detected among tool calls"
    )