opencode-goal-mode 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +47 -7
- package/CHANGELOG.md +27 -0
- package/README.md +81 -23
- package/benchmarks/build-external-corpus.mjs +177 -0
- package/benchmarks/charts.mjs +176 -0
- package/benchmarks/comparison.mjs +48 -0
- package/benchmarks/completion-corpus.mjs +70 -0
- package/benchmarks/corpus.mjs +92 -0
- package/benchmarks/external-corpus.json +3540 -0
- package/benchmarks/external.mjs +110 -0
- package/benchmarks/legacy-analyzer.mjs +54 -0
- package/benchmarks/run.mjs +252 -0
- package/benchmarks/truthfulness.mjs +64 -0
- package/commands/goal-evidence-map.md +27 -0
- package/commands/goal.md +16 -1
- package/docs/benchmarks/detection-by-family.svg +2 -2
- package/docs/benchmarks/external-scorecard.svg +32 -0
- package/docs/benchmarks/latency.svg +3 -3
- package/docs/benchmarks/overall-scorecard.svg +2 -2
- package/docs/benchmarks/results.json +207 -67
- package/docs/benchmarks/truthfulness-score.svg +17 -0
- package/package.json +5 -1
- package/plugins/goal-guard/config.js +9 -0
- package/plugins/goal-guard/events.js +6 -3
- package/plugins/goal-guard/shell.js +4 -3
- package/plugins/goal-guard/sidebar-data.js +71 -0
- package/plugins/goal-guard/state.js +2 -1
- package/plugins/goal-guard/summary.js +139 -1
- package/plugins/goal-guard/system.js +3 -0
- package/plugins/goal-guard/tools.js +43 -3
- package/plugins/goal-guard/verdicts.js +38 -1
- package/plugins/goal-guard.js +20 -5
- package/plugins/goal-sidebar.js +141 -0
- package/research/README.md +1 -1
- package/research/benchmarks.md +72 -45
|
@@ -33,6 +33,9 @@ export function buildSystemInjection(state, config) {
|
|
|
33
33
|
lines.push(`- Verification observed: ${r.verificationSeen ? "yes" : "no"}.`);
|
|
34
34
|
lines.push(`- Required review gates: ${bullet(r.requiredGates)}.`);
|
|
35
35
|
lines.push(`- Gates still missing or stale: ${bullet(r.missingGates)}.`);
|
|
36
|
+
if (r.reviewerMemory.open.length) {
|
|
37
|
+
lines.push(`- Open Reviewer Memory: ${r.reviewerMemory.open.map((m) => `${m.agent}: ${m.finding}`).join(" | ")}.`);
|
|
38
|
+
}
|
|
36
39
|
lines.push(
|
|
37
40
|
`- Completion is currently ${r.completionAllowed ? "ALLOWED" : "BLOCKED"}. ` +
|
|
38
41
|
(r.completionAllowed
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
import { tool } from "@opencode-ai/plugin";
|
|
15
|
-
import { statusReport } from "./summary.js";
|
|
15
|
+
import { evidenceMapReport, reviewerMemoryReport, statusReport } from "./summary.js";
|
|
16
16
|
import { recordEvidence } from "./events.js";
|
|
17
17
|
import { refreshStickyGates } from "./gates.js";
|
|
18
18
|
import { createState } from "./state.js";
|
|
@@ -38,10 +38,50 @@ export function createGoalTools({ store, config, persist }) {
|
|
|
38
38
|
async execute(_args, ctx) {
|
|
39
39
|
const state = store.stateFor(ctx.sessionID);
|
|
40
40
|
const report = statusReport(state, config);
|
|
41
|
+
const goal = report.goal ? `“${report.goal}” — ` : "";
|
|
41
42
|
return {
|
|
42
|
-
title: `Goal status: completion ${report.completionAllowed ? "allowed" : "blocked"}`,
|
|
43
|
+
title: `Goal status: ${goal}completion ${report.completionAllowed ? "allowed" : "blocked"}`,
|
|
43
44
|
output: JSON.stringify(report, null, 2),
|
|
44
|
-
metadata: {
|
|
45
|
+
metadata: {
|
|
46
|
+
goal: report.goal,
|
|
47
|
+
completionAllowed: report.completionAllowed,
|
|
48
|
+
reviewCycles: report.reviewCycles,
|
|
49
|
+
missingGates: report.missingGates,
|
|
50
|
+
},
|
|
51
|
+
};
|
|
52
|
+
},
|
|
53
|
+
}),
|
|
54
|
+
|
|
55
|
+
goal_evidence_map: tool({
|
|
56
|
+
description:
|
|
57
|
+
"Return an authoritative read-only evidence map for this session: each acceptance " +
|
|
58
|
+
"criterion, matching recorded evidence, required reviewer gate status, coverage status, " +
|
|
59
|
+
"gaps, and next action.",
|
|
60
|
+
args: {},
|
|
61
|
+
async execute(_args, ctx) {
|
|
62
|
+
const state = store.stateFor(ctx.sessionID);
|
|
63
|
+
const report = evidenceMapReport(state, config);
|
|
64
|
+
const covered = report.criteria.filter((item) => item.status === "covered").length;
|
|
65
|
+
return {
|
|
66
|
+
title: `Evidence map: ${covered}/${report.criteria.length} criteria covered`,
|
|
67
|
+
output: JSON.stringify(report, null, 2),
|
|
68
|
+
metadata: { criteriaCount: report.criteria.length, coveredCount: covered, missingGates: report.missingGates },
|
|
69
|
+
};
|
|
70
|
+
},
|
|
71
|
+
}),
|
|
72
|
+
|
|
73
|
+
goal_reviewer_memory: tool({
|
|
74
|
+
description:
|
|
75
|
+
"Return durable Reviewer Memory for this session: unresolved and recently resolved " +
|
|
76
|
+
"reviewer findings carried across cycles. Read-only.",
|
|
77
|
+
args: {},
|
|
78
|
+
async execute(_args, ctx) {
|
|
79
|
+
const state = store.stateFor(ctx.sessionID);
|
|
80
|
+
const report = reviewerMemoryReport(state);
|
|
81
|
+
return {
|
|
82
|
+
title: `Reviewer Memory: ${report.open.length} open findings`,
|
|
83
|
+
output: JSON.stringify(report, null, 2),
|
|
84
|
+
metadata: { openCount: report.open.length, total: report.total },
|
|
45
85
|
};
|
|
46
86
|
},
|
|
47
87
|
}),
|
|
@@ -68,17 +68,54 @@ export function latestVerdictFor(state, agent) {
|
|
|
68
68
|
return state.latestVerdict[agent] || null;
|
|
69
69
|
}
|
|
70
70
|
|
|
71
|
+
/**
 * Condense a reviewer's free-form output into a single short finding line.
 *
 * Each line is stripped of leading Markdown decoration (quote markers, bullets,
 * emphasis markers and ATX heading hashes). Section headings and the
 * `Verdict: PASS|FAIL` line are then discarded so that a heading such as
 * `## Blocking findings` or `**Blocking findings:**` is never mistaken for the
 * finding itself. The first remaining line that mentions a problem keyword
 * wins; otherwise the first remaining line is used.
 *
 * @param {unknown} text Raw reviewer output; nullish/empty values are tolerated.
 * @returns {string} A summary of at most 240 characters (never empty).
 */
function summarizeFinding(text) {
  // Trailing `*` / `_` are allowed so bold or italic headings ("Findings:**") still count as headings.
  const headingRe = /^(blocking findings?|findings?|non-blocking findings?|open questions?|summary|verdict|blocking|issues?)[:\s*_]*$/i;
  const verdictRe = /^verdict:?\s*(pass|fail)\b/i;
  const problemRe = /block|fail|finding|risk|missing|gap|regression/i;
  // Heading hashes are only stripped when followed by whitespace, so "#123 ..." references survive.
  const decorationRe = /^(?:[\s>*_-]|#{1,6}\s)+/;

  const lines = String(text || "")
    .split(/\r?\n/)
    .map((line) => line.replace(decorationRe, "").trim())
    .filter(Boolean)
    .filter((line) => !headingRe.test(line))
    .filter((line) => !verdictRe.test(line));

  const blocking = lines.find((line) => problemRe.test(line));
  return String(blocking || lines[0] || "Reviewer reported a blocking finding.").slice(0, 240);
}
|
|
82
|
+
|
|
83
|
+
/**
 * Fold one review verdict into the session's Reviewer Memory (`state.reviewerMemory`).
 *
 * - A `"PASS"` verdict marks every open finding of `agent` as resolved and
 *   stamps it with `resolvedAt` / `resolvedSeq`.
 * - Any other verdict summarises `text` into a finding. If the same agent
 *   already has an identical open finding, its `lastAt` / `lastSeq` / `count`
 *   are bumped; otherwise a new open entry is appended.
 *
 * The list is capped at 100 entries. When over the cap, the oldest *resolved*
 * entries are evicted first, so unresolved findings are only dropped when the
 * list consists entirely of open ones.
 *
 * @param {object} state   Session state; `reviewerMemory` is created or repaired if it is not an array.
 * @param {string} agent   Reviewer agent that produced the verdict.
 * @param {string} verdict Parsed verdict; only `"PASS"` resolves findings.
 * @param {string} at      Timestamp recorded on the entry.
 * @param {number} seq     Sequence number recorded on the entry.
 * @param {string} text    Raw reviewer output used to derive the finding summary.
 * @returns {void}
 */
function updateReviewerMemory(state, agent, verdict, at, seq, text) {
  const MAX_ENTRIES = 100;

  // A truthy non-array (e.g. a damaged persisted snapshot) would make the loops below throw.
  if (!Array.isArray(state.reviewerMemory)) state.reviewerMemory = [];
  const memory = state.reviewerMemory;

  if (verdict === "PASS") {
    for (const item of memory) {
      if (item?.agent === agent && item.status === "open") {
        item.status = "resolved";
        item.resolvedAt = at;
        item.resolvedSeq = seq;
      }
    }
    return;
  }

  const finding = summarizeFinding(text);
  const existing = memory.find(
    (item) => item?.agent === agent && item.status === "open" && item.finding === finding,
  );
  if (existing) {
    existing.lastAt = at;
    existing.lastSeq = seq;
    // Tolerate entries that were persisted without a counter.
    existing.count = (existing.count || 0) + 1;
  } else {
    memory.push({
      agent,
      finding,
      severity: "blocking",
      status: "open",
      firstAt: at,
      firstSeq: seq,
      lastAt: at,
      lastSeq: seq,
      count: 1,
    });
  }

  // Enforce the cap in place: drop oldest non-open entries first, then oldest open ones.
  let excess = memory.length - MAX_ENTRIES;
  for (let i = 0; excess > 0 && i < memory.length; ) {
    if (memory[i]?.status === "open") {
      i += 1;
    } else {
      memory.splice(i, 1);
      excess -= 1;
    }
  }
  if (excess > 0) memory.splice(0, excess);
}
|
|
106
|
+
|
|
71
107
|
/**
|
|
72
108
|
* Record a review verdict for `agent`, stamping it with the next monotonic seq.
|
|
73
109
|
* Increments the review-cycle count when the cycle-closing agent reports.
|
|
74
110
|
*/
|
|
75
|
-
export function recordVerdict(store, state, agent, verdict) {
|
|
111
|
+
export function recordVerdict(store, state, agent, verdict, text = "") {
|
|
76
112
|
const at = store.nowIso();
|
|
77
113
|
const seq = store.nextSeq();
|
|
78
114
|
const entry = { agent, verdict, at, seq };
|
|
79
115
|
state.verdicts.push(entry);
|
|
80
116
|
if (state.verdicts.length > 200) state.verdicts.splice(0, state.verdicts.length - 200);
|
|
81
117
|
state.latestVerdict[agent] = { verdict, at, seq };
|
|
118
|
+
updateReviewerMemory(state, agent, verdict, at, seq, text);
|
|
82
119
|
state.lastReviewAt = at;
|
|
83
120
|
state.lastReviewSeq = seq;
|
|
84
121
|
state.updatedAt = at;
|
package/plugins/goal-guard.js
CHANGED
|
@@ -179,27 +179,42 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
|
|
|
179
179
|
// records against that same session (never another), so it can neither
|
|
180
180
|
// mis-credit a sibling session nor break the parent goal, which the task
|
|
181
181
|
// path already covers. Split by tool type so the two never double-count.
|
|
182
|
+
const wasAllowed = completionAllowed(state, config);
|
|
182
183
|
let recordedAgent = null;
|
|
184
|
+
let recordedVerdict = null;
|
|
183
185
|
if (tool === "task") {
|
|
184
186
|
const sub = normalizedSubagent(inp);
|
|
185
187
|
if (isReviewAgent(sub)) {
|
|
186
|
-
const
|
|
188
|
+
const text = textOf(out);
|
|
189
|
+
const verdict = parseVerdict(text);
|
|
187
190
|
if (verdict) {
|
|
188
|
-
recordVerdict(store, state, sub, verdict);
|
|
191
|
+
recordVerdict(store, state, sub, verdict, text);
|
|
189
192
|
recordedAgent = sub;
|
|
193
|
+
recordedVerdict = verdict;
|
|
190
194
|
}
|
|
191
195
|
}
|
|
192
196
|
} else if (isReviewAgent(state.currentAgent)) {
|
|
193
|
-
const
|
|
197
|
+
const text = textOf(out);
|
|
198
|
+
const verdict = parseVerdict(text);
|
|
194
199
|
if (verdict) {
|
|
195
|
-
recordVerdict(store, state, state.currentAgent, verdict);
|
|
200
|
+
recordVerdict(store, state, state.currentAgent, verdict, text);
|
|
196
201
|
recordedAgent = state.currentAgent;
|
|
202
|
+
recordedVerdict = verdict;
|
|
197
203
|
}
|
|
198
204
|
}
|
|
199
205
|
|
|
200
206
|
if (recordedAgent === CYCLE_CLOSING_AGENT) {
|
|
201
207
|
maybeClearDirtyOnFinalPass(state, config);
|
|
202
208
|
}
|
|
209
|
+
|
|
210
|
+
// Surface review progress in the TUI: a toast per recorded verdict, and a
|
|
211
|
+
// single celebratory toast the moment the last required gate clears.
|
|
212
|
+
if (recordedAgent && recordedVerdict && config.toastOnReview) {
|
|
213
|
+
logger.toast(`Goal Guard: ${recordedAgent} → ${recordedVerdict}`, recordedVerdict === "PASS" ? "success" : "warning");
|
|
214
|
+
if (!wasAllowed && completionAllowed(state, config)) {
|
|
215
|
+
logger.toast("Goal Guard: all required gates passed — completion unlocked", "success");
|
|
216
|
+
}
|
|
217
|
+
}
|
|
203
218
|
persist();
|
|
204
219
|
} catch {
|
|
205
220
|
/* never break a turn */
|
|
@@ -231,7 +246,7 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
|
|
|
231
246
|
const state = store.stateFor(inp.sessionID);
|
|
232
247
|
out.context.push(
|
|
233
248
|
`Goal Guard state: ${summarizeState(state, config)}. Preserve Goal Contract, Verification Ledger, ` +
|
|
234
|
-
`Review Ledger, review cycle count, dirty state, and open findings across compaction.`,
|
|
249
|
+
`Review Ledger, Reviewer Memory, review cycle count, dirty state, and open findings across compaction.`,
|
|
235
250
|
);
|
|
236
251
|
} catch {
|
|
237
252
|
/* ignore */
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/** @jsxImportSource @opentui/solid */
|
|
2
|
+
/**
|
|
3
|
+
* Goal Mode — experimental TUI sidebar banner.
|
|
4
|
+
*
|
|
5
|
+
* EXPERIMENTAL. This is a TUI plugin module (the companion to the server-side
|
|
6
|
+
* goal-guard plugin). It renders the current goal as a short, shining-yellow
|
|
7
|
+
* banner in the OpenCode sidebar, with a compact `passing/total gates ·
|
|
8
|
+
* dirty/ready` status line, and updates as reviews land.
|
|
9
|
+
*
|
|
10
|
+
* It only does anything inside a TUI-plugin-capable OpenCode (one exposing
|
|
11
|
+
* `api.slots.register`). On any older runtime, missing API, or render error it
|
|
12
|
+
* silently no-ops — it can never break your TUI.
|
|
13
|
+
*
|
|
14
|
+
* Pairing: it reads the SAME on-disk snapshot the goal-guard server plugin
|
|
15
|
+
* writes (see goal-guard/persistence.js), so the two stay in sync with no extra
|
|
16
|
+
* IPC. The pure projection (`summary.sidebarView`) is shared with the server
|
|
17
|
+
* plugin and unit-tested via goal-guard/sidebar-data.js; only the file read and
|
|
18
|
+
* state-path computation are reimplemented here.
|
|
19
|
+
*
|
|
20
|
+
* Runtime constraints (mirrored from working OpenCode TUI plugins):
|
|
21
|
+
* - TUI plugin modules export `export default { id, tui }`.
|
|
22
|
+
* - The Bun TUI plugin runtime does NOT support top-level ESM imports of Node
|
|
23
|
+
* built-ins, so `node:fs`/`node:path`/`node:os`/`node:crypto` are `require()`d
|
|
24
|
+
* lazily inside functions. Top-level imports of regular packages (solid-js)
|
|
25
|
+
* and of our Node-built-in-free local modules are fine.
|
|
26
|
+
* - This file uses Solid/opentui JSX and is loaded only by OpenCode's (Bun) TUI
|
|
27
|
+
* runtime, which transpiles it; it is never imported by the Node test suite.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
import { createSignal, onCleanup, Show } from "solid-js";
|
|
31
|
+
import { sidebarView } from "./goal-guard/summary.js";
|
|
32
|
+
import { DEFAULT_CONFIG } from "./goal-guard/config.js";
|
|
33
|
+
|
|
34
|
+
const DEFAULT_COLOR = "#FFD700"; // shining yellow
|
|
35
|
+
const POLL_MS = 1500;
|
|
36
|
+
|
|
37
|
+
/**
 * Resolve the sidebar banner settings from plugin options and environment.
 *
 * The banner is on unless `options.sidebarBanner === false` or the
 * `GOAL_GUARD_SIDEBAR_BANNER` variable is `0`, `false` or `off`. The
 * environment value is compared case-insensitively and with surrounding
 * whitespace ignored, so `OFF` or `" False "` also disable it.
 *
 * Colour precedence: `options.sidebarColor`, then `GOAL_GUARD_SIDEBAR_COLOR`,
 * then the module default.
 *
 * @param {{ sidebarBanner?: boolean, sidebarColor?: string } | null | undefined} options
 * @param {Record<string, string | undefined> | null | undefined} env
 * @returns {{ enabled: boolean, color: string }}
 */
function resolveOptions(options, env) {
  const source = env || {};
  const bannerEnv = String(source.GOAL_GUARD_SIDEBAR_BANNER ?? "")
    .trim()
    .toLowerCase();
  const disabledByEnv = bannerEnv === "0" || bannerEnv === "false" || bannerEnv === "off";
  const disabled = options?.sidebarBanner === false || disabledByEnv;
  const color = options?.sidebarColor || source.GOAL_GUARD_SIDEBAR_COLOR || DEFAULT_COLOR;
  return { enabled: !disabled, color };
}
|
|
46
|
+
|
|
47
|
+
/**
 * Load the persisted goal-guard snapshot for a worktree, or `null` if it
 * cannot be read or parsed for any reason.
 *
 * The file lives at `<state root>/opencode/goal-guard/<key>.json`, where the
 * state root is `$XDG_STATE_HOME` (when set and non-blank) or
 * `~/.local/state`, and `<key>` is the first 16 hex characters of the SHA-256
 * of the worktree path (`"default"` when no worktree is given). This is
 * intended to mirror goal-guard/persistence.js. Node built-ins are `require()`d
 * inside the function body rather than imported at module top level.
 */
function readSnapshot(worktree) {
  let snapshot = null;
  try {
    const { readFileSync } = require("node:fs");
    const { join } = require("node:path");
    const { homedir } = require("node:os");
    const { createHash } = require("node:crypto");

    const xdgStateHome = process.env.XDG_STATE_HOME?.trim();
    const stateRoot = xdgStateHome || join(homedir(), ".local", "state");
    const projectKey = createHash("sha256")
      .update(String(worktree || "default"))
      .digest("hex")
      .slice(0, 16);
    const snapshotPath = join(stateRoot, "opencode", "goal-guard", `${projectKey}.json`);

    snapshot = JSON.parse(readFileSync(snapshotPath, "utf8"));
  } catch {
    // Missing file, unreadable JSON, or an unavailable built-in all mean "no snapshot".
    snapshot = null;
  }
  return snapshot;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Choose which persisted session record the sidebar should display.
 *
 * `snapshot.sessions` is read as a list of `[sessionKey, state]` pairs;
 * malformed entries are ignored and non-object states are treated as empty.
 * If `sessionId` names an active session, that record is returned. Otherwise
 * the active session with the greatest `touchedAt` wins (the earliest entry
 * wins ties), and `null` is returned when no session is active.
 *
 * `touchedAt` is accepted either as a number or as a date string. The
 * persisted format is not visible from this module, and subtracting date
 * strings directly would yield NaN and leave the ordering arbitrary.
 *
 * @param {{ sessions?: unknown } | null | undefined} snapshot Parsed snapshot file.
 * @param {string} [sessionId] Session currently shown by the TUI, if known.
 * @returns {object | null} The selected session state, or `null`.
 */
function pickSession(snapshot, sessionId) {
  if (!snapshot || !Array.isArray(snapshot.sessions)) return null;

  // Normalise a timestamp of unknown shape to a comparable number (0 when unusable).
  const toMillis = (value) => {
    if (value == null || value === "") return 0;
    const numeric = Number(value);
    if (Number.isFinite(numeric)) return numeric;
    const parsed = Date.parse(String(value));
    return Number.isNaN(parsed) ? 0 : parsed;
  };

  const active = snapshot.sessions
    .filter((entry) => Array.isArray(entry) && entry.length === 2)
    .map(([key, st]) => [key, st && typeof st === "object" ? st : {}])
    .filter(([, st]) => st.active);

  if (sessionId) {
    const direct = active.find(([key]) => key === sessionId);
    if (direct) return direct[1];
  }
  if (active.length === 0) return null;

  // Strictly-greater comparison keeps the first entry on ties, like a stable descending sort.
  let latest = active[0];
  let latestAt = toMillis(latest[1].touchedAt);
  for (const candidate of active.slice(1)) {
    const candidateAt = toMillis(candidate[1].touchedAt);
    if (candidateAt > latestAt) {
      latest = candidate;
      latestAt = candidateAt;
    }
  }
  return latest[1];
}
|
|
83
|
+
|
|
84
|
+
/**
 * Build the sidebar view model for a worktree/session pair.
 *
 * Reads the persisted snapshot, selects the session record to show, and
 * projects it through `sidebarView` using the default configuration. Returns
 * `null` when there is no snapshot, no suitable session, or the projection
 * throws.
 */
function readModel(worktree, sessionId) {
  const snapshot = readSnapshot(worktree);
  const record = snapshot ? pickSession(snapshot, sessionId) : null;
  if (!record) return null;

  let view = null;
  try {
    view = sidebarView(record, DEFAULT_CONFIG);
  } catch {
    // A projection failure hides the banner instead of surfacing an error.
    view = null;
  }
  return view;
}
|
|
95
|
+
|
|
96
|
+
export const id = "goal-mode-sidebar";
|
|
97
|
+
|
|
98
|
+
/**
 * TUI plugin entry point: registers a `sidebar_content` slot that renders the
 * current goal and its status line, refreshed by polling the persisted
 * snapshot every `POLL_MS` milliseconds.
 *
 * Does nothing when the banner is disabled via options/environment or when the
 * host does not expose `api.slots.register`. Any error raised during setup is
 * swallowed so the plugin cannot break the host TUI.
 *
 * @type {import("@opencode-ai/plugin/tui").TuiPlugin}
 */
export const tui = async (api, options) => {
  try {
    // `process` may be absent in the host runtime, so fall back to an empty environment.
    const { enabled, color } = resolveOptions(options, typeof process !== "undefined" ? process.env : {});
    if (!enabled) return;
    if (!api?.slots?.register) return; // runtime without the slot API → no-op.

    // Resolved once at registration time; prefers the worktree path over the plain directory.
    const worktree = api.state?.path?.worktree || api.state?.path?.directory;

    api.slots.register({
      order: 50,
      slots: {
        sidebar_content(_ctx, props) {
          // Guarded read: a failure yields null, which hides the banner.
          const read = () => {
            try {
              return readModel(worktree, props?.session_id);
            } catch {
              return null;
            }
          };
          // Seed the signal with an immediate read, then refresh it on a timer.
          const [model, setModel] = createSignal(read());
          const timer = setInterval(() => setModel(read()), POLL_MS);
          // Stop polling when the slot's reactive scope is disposed.
          onCleanup(() => clearInterval(timer));
          return (
            // Render nothing while there is no model (no snapshot or no active session).
            <Show when={model()}>
              <box flexDirection="column">
                <text fg={color}>
                  {"◆ "}
                  <b>GOAL</b>
                  {` ${model().goal}`}
                </text>
                <text fg={color}>{model().status}</text>
              </box>
            </Show>
          );
        },
      },
    });
  } catch {
    /* TUI runtime missing or API drift — render nothing rather than crash. */
  }
};
|
|
140
|
+
|
|
141
|
+
export default { id, tui };
|
package/research/README.md
CHANGED
|
@@ -10,7 +10,7 @@ links resolve in the npm package, but they are not runtime files.
|
|
|
10
10
|
| [opencode-plugin-platform.md](opencode-plugin-platform.md) | Verified OpenCode plugin-runtime facts (hooks, discovery, permissions, tools) from `@opencode-ai/plugin@1.15.13` source. The pinned runtime reference the plugin is built against. |
|
|
11
11
|
| [goal-mode-comparison.md](goal-mode-comparison.md) | How Goal Mode's mechanical enforcement compares to Claude Code and OpenAI Codex, with citations and honest caveats. |
|
|
12
12
|
| [shell-hardening.md](shell-hardening.md) | The shell-analyzer threat model: the bypass classes the old regex guard missed and how the tokenizer closes each. |
|
|
13
|
-
| [benchmarks.md](benchmarks.md) | Benchmark methodology and results (
|
|
13
|
+
| [benchmarks.md](benchmarks.md) | Benchmark methodology and results (shell guard accuracy plus completion truthfulness). Reproduce charts with `npm run bench` and JSON with `npm run bench:truthfulness`. |
|
|
14
14
|
|
|
15
15
|
Every non-obvious platform claim in these documents was verified against the
|
|
16
16
|
installed `@opencode-ai/plugin` type definitions and/or the `sst/opencode`
|
package/research/benchmarks.md
CHANGED
|
@@ -4,60 +4,87 @@ Reproducible measurement of the destructive-command guard from a repository
|
|
|
4
4
|
checkout. Run:
|
|
5
5
|
|
|
6
6
|
```bash
|
|
7
|
-
npm run bench
|
|
8
|
-
|
|
7
|
+
npm run bench # external + fixture benchmarks → results.json + charts
|
|
8
|
+
node benchmarks/external.mjs # external benchmark only (add --json for full detail)
|
|
9
|
+
npm run bench:truthfulness # print the completion-enforcement fixture JSON
|
|
10
|
+
npm run bench:compare # regenerate the capability-comparison chart
|
|
9
11
|
```
|
|
10
12
|
|
|
11
13
|
`npm run bench` writes `docs/benchmarks/results.json` and the SVG charts the
|
|
12
14
|
README embeds.
|
|
13
15
|
|
|
14
|
-
##
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
preserved **verbatim** from the first published release (commit `130956d`), so
|
|
25
|
-
the comparison is apples-to-apples against the same code that shipped.
|
|
26
|
-
- **A command counts as "blocked"** when the analyzer flags it `destructive` or
|
|
27
|
-
`networkExec` (the two signals `tool.execute.before` throws on). `mutating`
|
|
28
|
-
marks the session dirty but does not block, so it is not counted here.
|
|
29
|
-
- **Metrics**: detection rate (recall over destructive commands),
|
|
30
|
-
false-positive rate (safe commands wrongly blocked), and per-command latency.
|
|
16
|
+
## Why this was rewritten
|
|
17
|
+
|
|
18
|
+
The previous benchmark reported "20.8% → 100% detection, 21.7% → 0% false
|
|
19
|
+
positives" on a **71-command corpus the analyzer's author wrote**. The analyzer
|
|
20
|
+
was, in effect, the specification of that corpus, so 100%/0% mostly restated
|
|
21
|
+
"my code passes my own examples." Those numbers are still produced — but they are
|
|
22
|
+
now labeled as *regression fixtures*, and the headline figure comes from an
|
|
23
|
+
**external corpus the analyzer was never fitted to**.
|
|
24
|
+
|
|
25
|
+
## Headline: external corpus
|
|
31
26
|
|
|
32
|
-
|
|
27
|
+
- **Source**: real example commands from
|
|
28
|
+
[tldr-pages](https://github.com/tldr-pages/tldr) (`common`, `linux`, `osx`
|
|
29
|
+
English pages), pinned by commit in `benchmarks/external-corpus.json`. These
|
|
30
|
+
are written by hundreds of contributors with no knowledge of this analyzer, so
|
|
31
|
+
it cannot have been tuned to them. `tldr` `{{placeholder}}` tokens and
|
|
32
|
+
`[-f|--force]` alternative-flag notation are canonicalized into literal
|
|
33
|
+
commands by `benchmarks/build-external-corpus.mjs`.
|
|
34
|
+
- **Ground-truth labels** come from `labelDestructive()` in that builder: a
|
|
35
|
+
deliberately simple, transparent rule (primary utility ∈ a fixed irreversible
|
|
36
|
+
set; specific destructive `git` subcommands; `curl|wget … | sh`). It is
|
|
37
|
+
intentionally **independent of the analyzer's own logic**. No automatic labeler
|
|
38
|
+
is perfect, so the benchmark prints every disagreement for audit rather than
|
|
39
|
+
hiding them.
|
|
40
|
+
- **Sampling**: all destructive examples found are kept (they are rare in real
|
|
41
|
+
docs); safe examples are stride-sampled to a cap. This imbalance is recorded in
|
|
42
|
+
the corpus `totals` and disclosed here so it is not mistaken for a base rate.
|
|
33
43
|
|
|
34
|
-
Representative run (
|
|
35
|
-
accuracy figures do not):
|
|
44
|
+
Representative run (sample of 704 commands: 104 destructive, 600 safe):
|
|
36
45
|
|
|
37
|
-
|
|
|
46
|
+
| On real third-party commands | Legacy regex guard | Goal Mode analyzer |
|
|
38
47
|
| --- | --- | --- |
|
|
39
|
-
| Detection rate |
|
|
40
|
-
| False-positive rate |
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
and remote
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
48
|
+
| Detection rate | 53.8% | **93.3%** |
|
|
49
|
+
| False-positive rate | 0.2% | 0.2% |
|
|
50
|
+
|
|
51
|
+
Reading the result honestly:
|
|
52
|
+
|
|
53
|
+
- The remaining Goal Mode misses are almost entirely un-flagged single-target
|
|
54
|
+
`rm <file>` (and `rm -i`/`-v`/`-d`), which the guard **intentionally permits**:
|
|
55
|
+
it blocks `rm -r`/`rm -f`, command-substitution/`bash -c`/interpreter deletes,
|
|
56
|
+
and remote exec, but not a plain single-file `rm`. Under the strict
|
|
57
|
+
every-`rm`-is-destructive labeler these are counted as misses.
|
|
58
|
+
- The one counted false positive (`git filter-repo …`) genuinely rewrites
|
|
59
|
+
history, so the real-world false-positive rate is effectively zero. Run
|
|
60
|
+
`node benchmarks/external.mjs --json` to see the full miss / false-positive
|
|
61
|
+
lists.
|
|
62
|
+
- This benchmark directly drove real fixes: `mkfs.<fstype>` variants, `srm`, and
|
|
63
|
+
`mkswap` were missing from the analyzer and were added after the external run
|
|
64
|
+
exposed them.
|
|
65
|
+
|
|
66
|
+
## Curated regression fixtures (a spec, not a survey)
|
|
67
|
+
|
|
68
|
+
`benchmarks/corpus.mjs` (71 commands) and `benchmarks/completion-corpus.mjs`
|
|
69
|
+
(9 completion-claim cases) define the patterns the analyzer must catch and the
|
|
70
|
+
completion-policy decisions it must make. They pass **by construction** and exist
|
|
71
|
+
to prevent regressions. The 100%/0% and "all cases pass" numbers there are not
|
|
72
|
+
measured accuracy — treat them as a checklist the code is required to satisfy.
|
|
73
|
+
|
|
74
|
+
- **Baseline** for the fixture comparison (`benchmarks/legacy-analyzer.mjs`) is
|
|
75
|
+
the original regex classifier, preserved **verbatim** from the first published
|
|
76
|
+
release (commit `130956d`), so it is the author's own prior code, not a
|
|
77
|
+
strawman built to lose.
|
|
78
|
+
- **A command counts as "blocked"** when the analyzer flags it `destructive` or
|
|
79
|
+
`networkExec` (the signals `tool.execute.before` throws on). `mutating` marks
|
|
80
|
+
the session dirty but does not block, so it is not counted.
|
|
51
81
|
|
|
52
82
|
## Honesty notes
|
|
53
83
|
|
|
54
|
-
- The
|
|
55
|
-
|
|
56
|
-
(
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
- "100% on this corpus" means 100% of the labeled set; new bypass classes that
|
|
62
|
-
are discovered get added to the corpus and fixed (that is how the second-wave
|
|
63
|
-
findings — `sudo -u`, `pnpm dlx`, interpreter shell-out — entered it).
|
|
84
|
+
- The analyzer fails **open** on un-analyzable dynamic commands (deferring to the
|
|
85
|
+
host's permission rules); it is defense-in-depth, not a jail — see
|
|
86
|
+
[shell-hardening.md](shell-hardening.md).
|
|
87
|
+
- The latency comparison is shown even though the tokenizer is slower than a
|
|
88
|
+
regex: the win is accuracy, and the parse cost is ~1µs per candidate.
|
|
89
|
+
- The completion-enforcement fixtures verify mechanical completion-claim policy,
|
|
90
|
+
not that an LLM's prose is semantically true in every domain.
|