opencode-goal-mode 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Read-only projection of persisted guard state for the TUI sidebar banner.
3
+ *
4
+ * The sidebar plugin runs in OpenCode's TUI process, separate from the server
5
+ * plugin that owns the live store. The two are paired through the same on-disk
6
+ * snapshot the server plugin already writes (persistence.js). This module reads
7
+ * that snapshot and projects the active session's goal into a compact banner
8
+ * model. It is pure and synchronous (a cheap file read), so it is unit-testable
9
+ * without a TUI runtime.
10
+ */
11
+
12
+ import { readFileSync } from "node:fs";
13
+ import { join } from "node:path";
14
+ import { stateBaseDir, projectKey } from "./persistence.js";
15
+ import { DEFAULT_CONFIG } from "./config.js";
16
+ import { sidebarView } from "./summary.js";
17
+
18
+ /** Path of the guard's state file for a given worktree: projectKey(worktree) plus a .json suffix, joined onto stateBaseDir(env). Absolute provided stateBaseDir returns an absolute directory. env defaults to process.env. */
19
+ export function sidebarStateFile(worktree, env = process.env) {
20
+ return join(stateBaseDir(env), `${projectKey(worktree)}.json`);
21
+ }
22
+
23
+ /** Defensive normalisation so a partial/legacy record never throws in projection. A non-object input becomes a fresh empty object; an object input is patched IN PLACE and returned. */
24
+ function normalize(record) {
25
+ const st = record && typeof record === "object" ? record : {};
26
+ if (!Array.isArray(st.stickyGates)) st.stickyGates = []; // missing or non-array: default to an empty list
27
+ if (!Array.isArray(st.changedFiles)) st.changedFiles = []; // missing or non-array: default to an empty list
28
+ if (!st.latestVerdict || typeof st.latestVerdict !== "object") st.latestVerdict = {}; // missing or non-object: default to an empty map
29
+ return st;
30
+ }
31
+
32
+ /**
33
+ * Choose which session's goal to show: the most-recently-touched ACTIVE session
34
+ * (optionally preferring an explicit sessionId when it is present and active). Reads snapshot.sessions as an array of [sessionKey, state] pairs and returns the chosen state object (after normalize), or null when sessions is not an array or no session is active.
35
+ */
36
+ export function pickSession(snapshot, sessionId) {
37
+ if (!snapshot || !Array.isArray(snapshot.sessions)) return null;
38
+ const records = snapshot.sessions
39
+ .filter((e) => Array.isArray(e) && e.length === 2) // drop anything that is not a two-element pair
40
+ .map(([key, st]) => [key, normalize(st)]);
41
+ if (sessionId) {
42
+ const direct = records.find(([key, st]) => key === sessionId && st.active); // an inactive match falls through to the recency pick below
43
+ if (direct) return direct[1];
44
+ }
45
+ const active = records.filter(([, st]) => st.active);
46
+ if (active.length === 0) return null;
47
+ active.sort((a, b) => (b[1].touchedAt || 0) - (a[1].touchedAt || 0)); // newest touchedAt first; a missing touchedAt counts as 0
48
+ return active[0][1];
49
+ }
50
+
51
+ /**
52
+ * Build the sidebar banner model for a worktree, or null if there is nothing to
53
+ * show. Returns { goal, status, allowed, … } (see summary.sidebarView). Null covers: the state file is missing, unreadable or not valid JSON; pickSession finds no active session; or sidebarView itself returns null.
54
+ *
55
+ * @param {object} opts
56
+ * @param {string} opts.worktree Project worktree root (same key the guard uses).
57
+ * @param {string} [opts.sessionId] Preferred session; honoured only when that session is active.
58
+ * @param {object} [opts.config] Passed through to sidebarView; defaults to DEFAULT_CONFIG.
59
+ * @param {Record<string,string|undefined>} [opts.env] Passed to sidebarStateFile; defaults to process.env.
60
+ */
61
+ export function readSidebarModel({ worktree, sessionId, config = DEFAULT_CONFIG, env = process.env } = {}) {
62
+ let snapshot;
63
+ try {
64
+ snapshot = JSON.parse(readFileSync(sidebarStateFile(worktree, env), "utf8")); // read and parse failures both land in the catch below
65
+ } catch {
66
+ return null; // no state yet, or unreadable — show nothing.
67
+ }
68
+ const record = pickSession(snapshot, sessionId);
69
+ if (!record) return null;
70
+ return sidebarView(record, config);
71
+ }
@@ -5,6 +5,39 @@
5
5
 
6
6
  import { requiredGates, missingGates, gatePassedFresh } from "./gates.js";
7
7
 
8
+ /**
9
+ * A short, single-line human label for the current goal — preferring the
10
+ * recorded Goal Contract's original request, falling back to the captured goal
11
+ * text. Collapses whitespace and truncates to `max` chars for compact display
12
+ * (status reports, the TUI sidebar banner). Returns an empty string when neither field yields text. A truncated label ends in an ellipsis and, for max >= 1, is at most max UTF-16 code units long; max defaults to 80.
13
+ */
14
+ export function shortGoalLabel(state, max = 80) {
15
+ const raw = String(state?.contract?.original || state?.goalText || "").replace(/\s+/g, " ").trim();
16
+ if (!raw) return "";
17
+ // Prefer the first sentence/clause if it is reasonably short (no longer than max); otherwise work from the whole text.
18
+ const firstSentence = raw.split(/(?<=[.!?])\s/)[0]; // split after . ! or ? when followed by whitespace
19
+ const base = firstSentence.length > 0 && firstSentence.length <= max ? firstSentence : raw;
20
+ if (base.length <= max) return base;
21
+ return `${base.slice(0, max - 1).trimEnd()}…`; // keep max - 1 units so the ellipsis fits within max
22
+ }
23
+
24
+ /**
25
+ * Compact projection for the TUI sidebar banner: the short goal label, a
26
+ * one-line gate/dirty status, and whether completion is currently allowed.
27
+ * Returns null when there is no active goal worth showing (state missing or inactive, or the goal label is empty).
28
+ */
29
+ export function sidebarView(state, config) {
30
+ if (!state || !state.active) return null;
31
+ const goal = shortGoalLabel(state);
32
+ if (!goal) return null;
33
+ const required = requiredGates(state, config);
34
+ const missing = missingGates(state, config);
35
+ const passing = required.length - missing.length; // assumes missingGates returns a subset of requiredGates — TODO confirm
36
+ const allowed = required.length > 0 && missing.length === 0 && !state.dirty; // needs at least one required gate, none missing, and a non-dirty state
37
+ const status = `${passing}/${required.length} gates` + (state.dirty ? " · dirty" : "") + (allowed ? " · ready" : ""); // dirty and ready suffixes are mutually exclusive because allowed requires a non-dirty state
38
+ return { goal, status, allowed, reviewCycles: state.reviewCycles, passing, required: required.length, dirty: Boolean(state.dirty) }; // reviewCycles is copied from state unchanged
39
+ }
40
+
8
41
  export function summarizeState(state, config) {
9
42
  const verdictSummary =
10
43
  state.verdicts
@@ -50,6 +83,7 @@ export function statusReport(state, config) {
50
83
  const missing = missingGates(state, config);
51
84
  return {
52
85
  active: Boolean(state.active),
86
+ goal: shortGoalLabel(state),
53
87
  dirty: Boolean(state.dirty),
54
88
  reviewCycles: state.reviewCycles,
55
89
  requiredGates: required,
@@ -38,10 +38,16 @@ export function createGoalTools({ store, config, persist }) {
38
38
  async execute(_args, ctx) {
39
39
  const state = store.stateFor(ctx.sessionID);
40
40
  const report = statusReport(state, config);
41
+ const goal = report.goal ? `“${report.goal}” — ` : "";
41
42
  return {
42
- title: `Goal status: completion ${report.completionAllowed ? "allowed" : "blocked"}`,
43
+ title: `Goal status: ${goal}completion ${report.completionAllowed ? "allowed" : "blocked"}`,
43
44
  output: JSON.stringify(report, null, 2),
44
- metadata: { completionAllowed: report.completionAllowed, reviewCycles: report.reviewCycles },
45
+ metadata: {
46
+ goal: report.goal,
47
+ completionAllowed: report.completionAllowed,
48
+ reviewCycles: report.reviewCycles,
49
+ missingGates: report.missingGates,
50
+ },
45
51
  };
46
52
  },
47
53
  }),
@@ -179,7 +179,9 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
179
179
  // records against that same session (never another), so it can neither
180
180
  // mis-credit a sibling session nor break the parent goal, which the task
181
181
  // path already covers. Split by tool type so the two never double-count.
182
+ const wasAllowed = completionAllowed(state, config);
182
183
  let recordedAgent = null;
184
+ let recordedVerdict = null;
183
185
  if (tool === "task") {
184
186
  const sub = normalizedSubagent(inp);
185
187
  if (isReviewAgent(sub)) {
@@ -188,6 +190,7 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
188
190
  if (verdict) {
189
191
  recordVerdict(store, state, sub, verdict, text);
190
192
  recordedAgent = sub;
193
+ recordedVerdict = verdict;
191
194
  }
192
195
  }
193
196
  } else if (isReviewAgent(state.currentAgent)) {
@@ -196,12 +199,22 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
196
199
  if (verdict) {
197
200
  recordVerdict(store, state, state.currentAgent, verdict, text);
198
201
  recordedAgent = state.currentAgent;
202
+ recordedVerdict = verdict;
199
203
  }
200
204
  }
201
205
 
202
206
  if (recordedAgent === CYCLE_CLOSING_AGENT) {
203
207
  maybeClearDirtyOnFinalPass(state, config);
204
208
  }
209
+
210
+ // Surface review progress in the TUI: a toast per recorded verdict, and a
211
+ // single celebratory toast the moment the last required gate clears.
212
+ if (recordedAgent && recordedVerdict && config.toastOnReview) {
213
+ logger.toast(`Goal Guard: ${recordedAgent} → ${recordedVerdict}`, recordedVerdict === "PASS" ? "success" : "warning");
214
+ if (!wasAllowed && completionAllowed(state, config)) {
215
+ logger.toast("Goal Guard: all required gates passed — completion unlocked", "success");
216
+ }
217
+ }
205
218
  persist();
206
219
  } catch {
207
220
  /* never break a turn */
@@ -0,0 +1,141 @@
1
+ /** @jsxImportSource @opentui/solid */
2
+ /**
3
+ * Goal Mode — experimental TUI sidebar banner.
4
+ *
5
+ * EXPERIMENTAL. This is a TUI plugin module (the companion to the server-side
6
+ * goal-guard plugin). It renders the current goal as a short, shining-yellow
7
+ * banner in the OpenCode sidebar, with a compact `passing/total gates ·
8
+ * dirty/ready` status line, and updates as reviews land.
9
+ *
10
+ * It only does anything inside a TUI-plugin-capable OpenCode (one exposing
11
+ * `api.slots.register`). On any older runtime, missing API, or render error it
12
+ * silently no-ops — it can never break your TUI.
13
+ *
14
+ * Pairing: it reads the SAME on-disk snapshot the goal-guard server plugin
15
+ * writes (see goal-guard/persistence.js), so the two stay in sync with no extra
16
+ * IPC. The pure projection (`summary.sidebarView`) is shared with the server
17
+ * plugin and unit-tested via goal-guard/sidebar-data.js; only the file read and
18
+ * state-path computation — plus session selection (pickSession) — are reimplemented here.
19
+ *
20
+ * Runtime constraints (mirrored from working OpenCode TUI plugins):
21
+ * - TUI plugin modules export `export default { id, tui }`.
22
+ * - The Bun TUI plugin runtime does NOT support top-level ESM imports of Node
23
+ * built-ins, so `node:fs`/`node:path`/`node:os`/`node:crypto` are `require()`d
24
+ * lazily inside functions. Top-level imports of regular packages (solid-js)
25
+ * and of our Node-built-in-free local modules are fine.
26
+ * - This file uses Solid/opentui JSX and is loaded only by OpenCode's (Bun) TUI
27
+ * runtime, which transpiles it; it is never imported by the Node test suite.
28
+ */
29
+
30
+ import { createSignal, onCleanup, Show } from "solid-js";
31
+ import { sidebarView } from "./goal-guard/summary.js";
32
+ import { DEFAULT_CONFIG } from "./goal-guard/config.js";
33
+
34
+ const DEFAULT_COLOR = "#FFD700"; // shining yellow
35
+ const POLL_MS = 1500;
36
+
37
+ function resolveOptions(options, env) { // Resolve banner settings from plugin options and environment; returns { enabled, color }.
38
+ const e = env || {};
39
+ const enabledOpt = options?.sidebarBanner;
40
+ const enabledEnv = e.GOAL_GUARD_SIDEBAR_BANNER;
41
+ const disabled = // the banner is on unless explicitly switched off by either source
42
+ enabledOpt === false || enabledEnv === "0" || enabledEnv === "false" || enabledEnv === "off";
43
+ const color = options?.sidebarColor || e.GOAL_GUARD_SIDEBAR_COLOR || DEFAULT_COLOR; // precedence: option, then env var, then the default colour
44
+ return { enabled: !disabled, color };
45
+ }
46
+
47
+ /**
48
+ * Read the guard's persisted snapshot for a worktree. The state-path logic is
49
+ * kept identical to goal-guard/persistence.js (stateBaseDir + projectKey); node
50
+ * built-ins are required lazily to satisfy the TUI runtime. Returns the parsed JSON, or null on any failure (a require that throws, a missing or unreadable file, invalid JSON).
51
+ */
52
+ function readSnapshot(worktree) {
53
+ try {
54
+ const fs = require("node:fs");
55
+ const path = require("node:path");
56
+ const os = require("node:os");
57
+ const crypto = require("node:crypto");
58
+ const xdg = process.env.XDG_STATE_HOME && process.env.XDG_STATE_HOME.trim(); // a blank or whitespace-only value is treated as unset
59
+ const base = xdg || path.join(os.homedir(), ".local", "state");
60
+ const key = crypto.createHash("sha256").update(String(worktree || "default")).digest("hex").slice(0, 16); // first 16 hex chars of the SHA-256 of the worktree string
61
+ const file = path.join(base, "opencode", "goal-guard", `${key}.json`);
62
+ return JSON.parse(fs.readFileSync(file, "utf8"));
63
+ } catch {
64
+ return null;
65
+ }
66
+ }
67
+
68
+ /** Most-recently-touched active session, preferring an explicit active sessionId. Returns the session state object or null. NOTE(review): unlike pickSession in goal-guard/sidebar-data.js, records here are not passed through normalize(), so array/object defaults are not filled in — confirm sidebarView tolerates that. */
69
+ function pickSession(snapshot, sessionId) {
70
+ if (!snapshot || !Array.isArray(snapshot.sessions)) return null;
71
+ const records = snapshot.sessions
72
+ .filter((e) => Array.isArray(e) && e.length === 2) // drop anything that is not a two-element pair
73
+ .map(([key, st]) => [key, st && typeof st === "object" ? st : {}]); // a non-object state becomes an empty object
74
+ if (sessionId) {
75
+ const direct = records.find(([key, st]) => key === sessionId && st.active);
76
+ if (direct) return direct[1];
77
+ }
78
+ const active = records.filter(([, st]) => st.active);
79
+ if (active.length === 0) return null;
80
+ active.sort((a, b) => (b[1].touchedAt || 0) - (a[1].touchedAt || 0)); // newest touchedAt first; a missing touchedAt counts as 0
81
+ return active[0][1];
82
+ }
83
+
84
+ function readModel(worktree, sessionId) { // Snapshot -> picked session -> banner model; null at any step that yields nothing.
85
+ const snapshot = readSnapshot(worktree);
86
+ if (!snapshot) return null;
87
+ const record = pickSession(snapshot, sessionId);
88
+ if (!record) return null;
89
+ try {
90
+ return sidebarView(record, DEFAULT_CONFIG); // always projects with DEFAULT_CONFIG; no user config is passed in here
91
+ } catch {
92
+ return null; // a projection error hides the banner instead of propagating
93
+ }
94
+ }
95
+
96
+ export const id = "goal-mode-sidebar"; // plugin id, also exposed through the default export below
97
+
98
+ /** @type {import("@opencode-ai/plugin/tui").TuiPlugin} Registers a sidebar_content slot that renders the goal banner; returns without registering when the banner is disabled or api.slots.register is absent. */
99
+ export const tui = async (api, options) => {
100
+ try {
101
+ const { enabled, color } = resolveOptions(options, typeof process !== "undefined" ? process.env : {});
102
+ if (!enabled) return;
103
+ if (!api?.slots?.register) return; // runtime without the slot API → no-op.
104
+
105
+ const worktree = api.state?.path?.worktree || api.state?.path?.directory; // resolved once at registration; falls back to the directory path
106
+
107
+ api.slots.register({
108
+ order: 50,
109
+ slots: {
110
+ sidebar_content(_ctx, props) {
111
+ const read = () => {
112
+ try {
113
+ return readModel(worktree, props?.session_id);
114
+ } catch {
115
+ return null;
116
+ }
117
+ };
118
+ const [model, setModel] = createSignal(read()); // initial value comes from a synchronous read
119
+ const timer = setInterval(() => setModel(read()), POLL_MS); // re-read every POLL_MS. NOTE(review): read() builds a new object each tick, so subscribers are presumably notified on every poll even when nothing changed — confirm this is acceptable
120
+ onCleanup(() => clearInterval(timer)); // stop polling when the owning scope is cleaned up
121
+ return (
122
+ <Show when={model()}>
123
+ <box flexDirection="column">
124
+ <text fg={color}>
125
+ {"◆ "}
126
+ <b>GOAL</b>
127
+ {` ${model().goal}`}
128
+ </text>
129
+ <text fg={color}>{model().status}</text>
130
+ </box>
131
+ </Show>
132
+ );
133
+ },
134
+ },
135
+ });
136
+ } catch {
137
+ /* TUI runtime missing or API drift — render nothing rather than crash. */
138
+ }
139
+ };
140
+
141
+ export default { id, tui };
@@ -4,81 +4,87 @@ Reproducible measurement of the destructive-command guard from a repository
4
4
  checkout. Run:
5
5
 
6
6
  ```bash
7
- npm run bench # detection / false-positive / latency benchmark
8
- npm run bench:truthfulness # print the completion truthfulness benchmark JSON
9
- npm run bench:compare # regenerate the capability-comparison chart
7
+ npm run bench # external + fixture benchmarks → results.json + charts
8
+ node benchmarks/external.mjs # external benchmark only (add --json for full detail)
9
+ npm run bench:truthfulness # print the completion-enforcement fixture JSON
10
+ npm run bench:compare # regenerate the capability-comparison chart
10
11
  ```
11
12
 
12
13
  `npm run bench` writes `docs/benchmarks/results.json` and the SVG charts the
13
14
  README embeds.
14
15
 
15
- ## Methodology
16
-
17
- - **Corpus** (`benchmarks/corpus.mjs`): 71 real shell commands a coding agent
18
- might emit, each labeled `destructive` (a guard must block) or `safe` (a guard
19
- must not block). Split into families: *classic* (plain `rm -rf`, `git reset
20
- --hard`), *obfuscated* (the bypass corpus — substitutions, wrappers, `bash -c`,
21
- interpreters, weaponized git), *remote-exec* (`curl | sh`), and *safe*
22
- (read-only and quoted-text commands, including ones the old guard
23
- false-positived).
24
- - **Baseline** (`benchmarks/legacy-analyzer.mjs`): the original regex classifier,
25
- preserved **verbatim** from the first published release (commit `130956d`), so
26
- the comparison is apples-to-apples against the same code that shipped.
27
- - **A command counts as "blocked"** when the analyzer flags it `destructive` or
28
- `networkExec` (the two signals `tool.execute.before` throws on). `mutating`
29
- marks the session dirty but does not block, so it is not counted here.
30
- - **Metrics**: detection rate (recall over destructive commands),
31
- false-positive rate (safe commands wrongly blocked), and per-command latency.
32
- - **False Completion Dataset** (`benchmarks/completion-corpus.mjs`): labeled final
33
- answer scenarios for premature and valid completion claims. It checks whether
34
- `completion.js` blocks missing review-cycle lines, zero cycles, stale reviews,
35
- mismatched cycle counts, missing contextual gates, and allows inactive or valid
36
- completions.
37
- - **Truthfulness Score** (`benchmarks/truthfulness.mjs`): weighted score over the
38
- dataset: 65% decision accuracy (blocked vs allowed) and 35% reason accuracy for
39
- blocked false-completion claims.
40
-
41
- ## Results
42
-
43
- Representative run (Node 22, single-threaded; latency varies by machine, the
44
- accuracy figures do not):
45
-
46
- | Metric | Legacy regex guard | Goal Mode analyzer |
16
+ ## Why this was rewritten
17
+
18
+ The previous benchmark reported "20.8% → 100% detection, 21.7% → 0% false
19
+ positives" on a **71-command corpus the analyzer's author wrote**. The analyzer
20
+ was, in effect, the specification of that corpus, so 100%/0% mostly restated
21
+ "my code passes my own examples." Those numbers are still produced but they are
22
+ now labeled as *regression fixtures*, and the headline figure comes from an
23
+ **external corpus the analyzer was never fitted to**.
24
+
25
+ ## Headline: external corpus
26
+
27
+ - **Source**: real example commands from
28
+ [tldr-pages](https://github.com/tldr-pages/tldr) (`common`, `linux`, `osx`
29
+ English pages), pinned by commit in `benchmarks/external-corpus.json`. These
30
+ are written by hundreds of contributors with no knowledge of this analyzer, so
31
+ it cannot have been tuned to them. `tldr` `{{placeholder}}` tokens and
32
+ `[-f|--force]` alternative-flag notation are canonicalized into literal
33
+ commands by `benchmarks/build-external-corpus.mjs`.
34
+ - **Ground-truth labels** come from `labelDestructive()` in that builder: a
35
+ deliberately simple, transparent rule (primary utility a fixed irreversible
36
+ set; specific destructive `git` subcommands; `curl|wget | sh`). It is
37
+ intentionally **independent of the analyzer's own logic**. No automatic labeler
38
+ is perfect, so the benchmark prints every disagreement for audit rather than
39
+ hiding them.
40
+ - **Sampling**: all destructive examples found are kept (they are rare in real
41
+ docs); safe examples are stride-sampled to a cap. This imbalance is recorded in
42
+ the corpus `totals` and disclosed here so it is not mistaken for a base rate.
43
+
44
+ Representative run (sample of 704 commands: 104 destructive, 600 safe):
45
+
46
+ | On real third-party commands | Legacy regex guard | Goal Mode analyzer |
47
47
  | --- | --- | --- |
48
- | Detection rate | **20.8%** (10/48) | **100%** (48/48) |
49
- | False-positive rate | **21.7%** (5/23) | **0%** (0/23) |
50
- | Detection — classic | 100% | 100% |
51
- | Detection — obfuscated | 0% (0/35) | 100% (35/35) |
52
- | Detection — remote-exec | 0% (0/3) | 100% (3/3) |
53
- | Latency per command | ~2.3 µs | ~3.8 µs |
54
-
55
- False Completion Dataset run:
56
-
57
- | Metric | Goal Mode |
58
- | --- | --- |
59
- | Truthfulness score | **100.0%** |
60
- | Decision accuracy | **100.0%** |
61
- | Reason accuracy | **100.0%** |
62
- | False-completion block rate | **100.0%** |
63
- | Valid-completion allow rate | **100.0%** |
64
-
65
- The legacy guard catches only the *classic* family and misses every obfuscated
66
- and remote-execution command, while wrongly blocking 1-in-5 benign commands. The
67
- tokenizer catches the entire corpus with zero false positives, for an extra
68
- ~1.5 µs per command on this run — negligible for a per-tool-call guard (still
69
- hundreds of thousands of classifications per second).
48
+ | Detection rate | 53.8% | **93.3%** |
49
+ | False-positive rate | 0.2% | 0.2% |
50
+
51
+ Reading the result honestly:
52
+
53
+ - The remaining Goal Mode misses are almost entirely un-flagged single-target
54
+ `rm <file>` (and `rm -i`/`-v`/`-d`), which the guard **intentionally permits**:
55
+ it blocks `rm -r`/`rm -f`, command-substitution/`bash -c`/interpreter deletes,
56
+ and remote exec, but not a plain single-file `rm`. Under the strict
57
+ every-`rm`-is-destructive labeler these are counted as misses.
58
+ - The one counted false positive (`git filter-repo …`) genuinely rewrites
59
+ history, so the real-world false-positive rate is effectively zero. Run
60
+ `node benchmarks/external.mjs --json` to see the full miss / false-positive
61
+ lists.
62
+ - This benchmark directly drove real fixes: `mkfs.<fstype>` variants, `srm`, and
63
+ `mkswap` were missing from the analyzer and were added after the external run
64
+ exposed them.
65
+
66
+ ## Curated regression fixtures (a spec, not a survey)
67
+
68
+ `benchmarks/corpus.mjs` (71 commands) and `benchmarks/completion-corpus.mjs`
69
+ (9 completion-claim cases) define the patterns the analyzer must catch and the
70
+ completion-policy decisions it must make. They pass **by construction** and exist
71
+ to prevent regressions. The 100%/0% / "all cases pass" numbers there are not
72
+ measured accuracy — treat them as a checklist the code is required to satisfy.
73
+
74
+ - **Baseline** for the fixture comparison (`benchmarks/legacy-analyzer.mjs`) is
75
+ the original regex classifier, preserved **verbatim** from the first published
76
+ release (commit `130956d`), so it is the author's own prior code, not a
77
+ strawman built to lose.
78
+ - **A command counts as "blocked"** when the analyzer flags it `destructive` or
79
+ `networkExec` (the signals `tool.execute.before` throws on). `mutating` marks
80
+ the session dirty but does not block, so it is not counted.
70
81
 
71
82
  ## Honesty notes
72
83
 
73
- - The corpus is hand-built to exercise the known bypass classes; it is a
74
- capability benchmark, not a claim of catching *every* possible obfuscation
75
- (the analyzer fails open on un-analyzable dynamic commands — see
76
- [shell-hardening.md](shell-hardening.md)).
77
- - The latency comparison is intentionally shown even though the new analyzer is
78
- slower: the win is accuracy, and the parse cost is still only a few
79
- microseconds per tool-call candidate.
80
- - "100% on this corpus" means 100% of the labeled set; new bypass classes that
81
- are discovered get added to the corpus and fixed (that is how the second-wave
82
- findings — `sudo -u`, `pnpm dlx`, interpreter shell-out — entered it).
83
- - The Truthfulness Score is corpus truthfulness for mechanical completion claims,
84
- not a global claim that an LLM's prose is semantically true in every domain.
84
+ - The analyzer fails **open** on un-analyzable dynamic commands (deferring to the
85
+ host's permission rules); it is defense-in-depth, not a jail — see
86
+ [shell-hardening.md](shell-hardening.md).
87
+ - The latency comparison is shown even though the tokenizer is slower than a
88
+ regex: the win is accuracy, and the parse cost is ~1µs per candidate.
89
+ - The completion-enforcement fixtures verify mechanical completion-claim policy,
90
+ not that an LLM's prose is semantically true in every domain.