opencode-goal-mode 0.2.4 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Run the shell guard against the EXTERNAL, third-party-authored corpus built by
4
+ * build-external-corpus.mjs (real tldr-pages commands). This is the honest
5
+ * benchmark: the analyzer authors did not write or curate these commands, so the
6
+ * detection / false-positive numbers reflect real-world behavior, warts and all.
7
+ *
8
+ * It deliberately also reports DISAGREEMENTS between the analyzer and the
9
+ * independent ground-truth labeler, so misses and false positives are auditable
10
+ * rather than averaged away.
11
+ *
12
+ * node benchmarks/external.mjs # summary
13
+ * node benchmarks/external.mjs --json # full machine-readable result
14
+ */
15
+
16
+ import { readFileSync } from "node:fs";
17
+ import { join, dirname } from "node:path";
18
+ import { fileURLToPath } from "node:url";
19
+ import * as current from "../plugins/goal-guard/shell.js";
20
+ import * as legacy from "./legacy-analyzer.mjs";
21
+
22
+ const here = dirname(fileURLToPath(import.meta.url));
23
+
24
/**
 * Read and parse the external benchmark corpus stored next to this module.
 *
 * @returns {any} The parsed contents of `external-corpus.json`.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
export function loadExternalCorpus() {
  const corpusPath = join(here, "external-corpus.json");
  const rawJson = readFileSync(corpusPath, "utf8");
  return JSON.parse(rawJson);
}
27
+
28
/**
 * Report whether an analyzer's verdict for a command counts as "blocked".
 *
 * @param {{ analyzeCommand: Function }} analyzer - Object exposing `analyzeCommand(cmd)`.
 * @param {string} cmd - Command line handed to the analyzer unchanged.
 * @returns {boolean} `true` when the analysis result has a truthy `destructive`
 *   or a truthy `networkExec` property, `false` otherwise.
 */
function blocked(analyzer, cmd) {
  const verdict = analyzer.analyzeCommand(cmd);
  if (verdict.destructive) return true;
  return Boolean(verdict.networkExec);
}
32
+
33
/**
 * Evaluate one analyzer over labeled entries (each `{cmd, page, destructive}`).
 *
 * Entries labeled destructive count toward detection: a blocked one is a catch,
 * an unblocked one is recorded as a miss. Every other entry counts toward the
 * false-positive rate: a blocked one is recorded as a false positive.
 *
 * @param {{ analyzeCommand: Function }} analyzer - Analyzer passed through to `blocked`.
 * @param {Iterable<{cmd: string, page: string, destructive: boolean}>} labeled
 * @returns {{detectionRate: number, falsePositiveRate: number, destCaught: number,
 *   destTotal: number, safeFalsePos: number, safeTotal: number,
 *   misses: Array<{cmd: string, page: string}>,
 *   falsePositives: Array<{cmd: string, page: string}>}}
 *   Rates are percentages (0–100) and are 0 when the matching total is 0.
 */
function score(analyzer, labeled) {
  const misses = [];
  const falsePositives = [];
  let destTotal = 0;
  let safeTotal = 0;

  for (const entry of labeled) {
    const flagged = blocked(analyzer, entry.cmd);
    const record = { cmd: entry.cmd, page: entry.page };
    if (entry.destructive) {
      destTotal += 1;
      if (!flagged) misses.push(record);
      continue;
    }
    safeTotal += 1;
    if (flagged) falsePositives.push(record);
  }

  // Every destructive entry is either caught or listed as a miss, and every
  // safe false positive is listed, so both counts follow from the lists.
  const destCaught = destTotal - misses.length;
  const safeFalsePos = falsePositives.length;
  const asPercent = (hits, total) => (total ? (hits / total) * 100 : 0);

  return {
    detectionRate: asPercent(destCaught, destTotal),
    falsePositiveRate: asPercent(safeFalsePos, safeTotal),
    destCaught,
    destTotal,
    safeFalsePos,
    safeTotal,
    misses,
    falsePositives,
  };
}
66
+
67
/**
 * Score the legacy and current analyzers against the external corpus.
 *
 * The corpus is written destructive-first then safe (see build-external-corpus.mjs),
 * so `totals.destructiveFound` doubles as the label boundary: entries before that
 * index are treated as destructive, the rest as safe. No labeler re-run is needed.
 *
 * @returns {{source: *, commit: *, totals: object, sampleSize: number,
 *   legacy: object, current: object}} Corpus metadata plus one `score` result
 *   per analyzer.
 * @throws {RangeError} When `totals.destructiveFound` is not an integer within
 *   `[0, entries.length]`.
 */
export function runExternalBenchmark() {
  const corpus = loadExternalCorpus();
  const { entries, totals } = corpus;
  const boundary = totals.destructiveFound;

  // A missing or out-of-range boundary would silently label every entry safe
  // (or every entry destructive) and produce plausible-looking but meaningless
  // rates, so refuse to score an inconsistent corpus.
  if (!Number.isInteger(boundary) || boundary < 0 || boundary > entries.length) {
    throw new RangeError(
      `external corpus is inconsistent: totals.destructiveFound (${boundary}) ` +
        `is not a valid label boundary for ${entries.length} entries`,
    );
  }

  const labeled = entries.map((e, i) => ({ ...e, destructive: i < boundary }));
  return {
    source: corpus.source,
    commit: corpus.commit,
    totals,
    sampleSize: labeled.length,
    legacy: score(legacy, labeled),
    current: score(current, labeled),
  };
}
81
+
82
/**
 * Format a number as a percentage string with one decimal place.
 *
 * @param {number} n - Value already expressed in percent.
 * @returns {string} For example `"53.8%"`.
 */
function pct(n) {
  const oneDecimal = n.toFixed(1);
  return `${oneDecimal}%`;
}
85
+
86
// CLI entry point: runs only when this file is the script Node was started with.
// `--json` dumps the full result; otherwise a human-readable summary is printed
// with the miss and false-positive lists capped at LIST_LIMIT lines each.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  const report = runExternalBenchmark();
  if (process.argv.includes("--json")) {
    console.log(JSON.stringify(report, null, 2));
  } else {
    const LIST_LIMIT = 20;
    const { totals, legacy: before, current: after } = report;

    console.log("External shell-guard benchmark (third-party tldr-pages commands)");
    console.log("================================================================");
    console.log(`Source: ${report.source} @ ${report.commit.slice(0, 12)}`);
    console.log(
      `Sample: ${report.sampleSize} commands (${totals.destructiveFound} destructive [all found], ${totals.safeSampled}/${totals.safeFound} safe sampled)`,
    );
    console.log("");
    console.log(`Detection (destructive caught) legacy ${pct(before.detectionRate)} → current ${pct(after.detectionRate)}`);
    console.log(`False positives on safe commands legacy ${pct(before.falsePositiveRate)} → current ${pct(after.falsePositiveRate)}`);
    console.log("");

    const missed = after.misses;
    console.log(`Current analyzer misses (${missed.length}):`);
    missed.slice(0, LIST_LIMIT).forEach((m) => console.log(` - ${m.cmd} [${m.page}]`));
    if (missed.length > LIST_LIMIT) console.log(` … ${missed.length - LIST_LIMIT} more`);

    const wronglyBlocked = after.falsePositives;
    console.log(`Current analyzer false positives (${wronglyBlocked.length}):`);
    wronglyBlocked.slice(0, LIST_LIMIT).forEach((f) => console.log(` - ${f.cmd} [${f.page}]`));
    if (wronglyBlocked.length > LIST_LIMIT) console.log(` … ${wronglyBlocked.length - LIST_LIMIT} more`);
  }
}
@@ -20,6 +20,7 @@ import * as current from "../plugins/goal-guard/shell.js";
20
20
  import * as legacy from "./legacy-analyzer.mjs";
21
21
  import { groupedBarChart, horizontalBarChart } from "./charts.mjs";
22
22
  import { runTruthfulnessBenchmark } from "./truthfulness.mjs";
23
+ import { runExternalBenchmark } from "./external.mjs";
23
24
 
24
25
  const root = fileURLToPath(new URL("..", import.meta.url));
25
26
  const outDir = join(root, "docs", "benchmarks");
@@ -92,6 +93,7 @@ function fmt(n) {
92
93
 
93
94
  const legacyEval = evaluate(legacy);
94
95
  const currentEval = evaluate(current);
96
+ const external = runExternalBenchmark();
95
97
  const truthfulness = runTruthfulnessBenchmark();
96
98
  const legacyOps = throughput(legacy);
97
99
  const currentOps = throughput(current);
@@ -110,23 +112,74 @@ function familyRate(ev, fam) {
110
112
  return f && f.destTotal ? (f.destCaught / f.destTotal) * 100 : 0;
111
113
  }
112
114
 
115
+ // Trim the per-command miss/false-positive lists to keep results.json readable;
116
+ // the full lists are always available via `node benchmarks/external.mjs --json`.
117
+ const externalSummary = {
118
+ source: external.source,
119
+ commit: external.commit,
120
+ totals: external.totals,
121
+ sampleSize: external.sampleSize,
122
+ legacy: {
123
+ detectionRate: Number(external.legacy.detectionRate.toFixed(1)),
124
+ falsePositiveRate: Number(external.legacy.falsePositiveRate.toFixed(1)),
125
+ destCaught: external.legacy.destCaught,
126
+ destTotal: external.legacy.destTotal,
127
+ safeFalsePos: external.legacy.safeFalsePos,
128
+ safeTotal: external.legacy.safeTotal,
129
+ },
130
+ current: {
131
+ detectionRate: Number(external.current.detectionRate.toFixed(1)),
132
+ falsePositiveRate: Number(external.current.falsePositiveRate.toFixed(1)),
133
+ destCaught: external.current.destCaught,
134
+ destTotal: external.current.destTotal,
135
+ safeFalsePos: external.current.safeFalsePos,
136
+ safeTotal: external.current.safeTotal,
137
+ misses: external.current.misses.map((m) => m.cmd),
138
+ falsePositives: external.current.falsePositives.map((f) => f.cmd),
139
+ },
140
+ };
141
+
113
142
  const results = {
114
- corpusSize: CORPUS.length,
115
- destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
116
- safeCount: CORPUS.filter((c) => c.label === "safe").length,
117
- legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
118
- current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
119
- truthfulness,
143
+ // The honest, third-party benchmark: real commands the analyzer was never
144
+ // fitted to. This is the headline number.
145
+ external: externalSummary,
146
+ // Curated REGRESSION FIXTURES: a hand-authored set of known destructive
147
+ // patterns and their safe look-alikes. These define the patterns the analyzer
148
+ // is built to catch and guard against regressions — they are NOT an unbiased
149
+ // sample, so the 100%/0% here is "passes its own spec", not measured accuracy.
150
+ fixtures: {
151
+ corpusSize: CORPUS.length,
152
+ destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
153
+ safeCount: CORPUS.filter((c) => c.label === "safe").length,
154
+ legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
155
+ current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
156
+ },
157
+ // Completion-enforcement fixtures (hand-authored policy cases), not a survey.
158
+ completionFixtures: truthfulness,
120
159
  };
121
160
 
122
161
  writeFileSync(join(outDir, "results.json"), JSON.stringify(results, null, 2));
123
162
 
124
- // Chart 1: detection rate by command family.
163
+ // Headline chart: detection + false positives on the EXTERNAL third-party corpus.
164
+ writeFileSync(
165
+ join(outDir, "external-scorecard.svg"),
166
+ groupedBarChart({
167
+ title: "Guard accuracy on real third-party commands",
168
+ subtitle: `${external.sampleSize} tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.`,
169
+ groups: ["Detection rate", "False-positive rate"],
170
+ series: [
171
+ { name: "Legacy regex guard", color: "#9aa0a6", values: [external.legacy.detectionRate, external.legacy.falsePositiveRate] },
172
+ { name: "Goal Mode analyzer", color: "#2da44e", values: [external.current.detectionRate, external.current.falsePositiveRate] },
173
+ ],
174
+ }),
175
+ );
176
+
177
+ // Chart 1: detection rate by command family (CURATED regression fixtures).
125
178
  writeFileSync(
126
179
  join(outDir, "detection-by-family.svg"),
127
180
  groupedBarChart({
128
- title: "Destructive-command detection rate by family",
129
- subtitle: `Higher is better. Corpus: ${results.destructiveCount} destructive commands.`,
181
+ title: "Detection by family curated regression fixtures",
182
+ subtitle: `Curated patterns the analyzer is built to catch (not an unbiased sample). ${results.fixtures.destructiveCount} destructive fixtures.`,
130
183
  groups: detFamilies.map((f) => FAMILY_LABELS[f]),
131
184
  series: [
132
185
  { name: "Legacy regex guard", color: "#9aa0a6", values: detFamilies.map((f) => familyRate(legacyEval, f)) },
@@ -135,12 +188,12 @@ writeFileSync(
135
188
  }),
136
189
  );
137
190
 
138
- // Chart 2: overall scorecard (detection up, false positives down).
191
+ // Chart 2: overall scorecard on the CURATED fixtures (passes its own spec).
139
192
  writeFileSync(
140
193
  join(outDir, "overall-scorecard.svg"),
141
194
  groupedBarChart({
142
- title: "Overall guard accuracy",
143
- subtitle: "Detection rate (higher better) vs false-positive rate (lower better).",
195
+ title: "Curated fixtures — passes its own spec",
196
+ subtitle: "Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.",
144
197
  groups: ["Detection rate", "False-positive rate"],
145
198
  series: [
146
199
  { name: "Legacy regex guard", color: "#9aa0a6", values: [legacyEval.detectionRate, legacyEval.falsePositiveRate] },
@@ -168,8 +221,8 @@ writeFileSync(
168
221
  writeFileSync(
169
222
  join(outDir, "truthfulness-score.svg"),
170
223
  horizontalBarChart({
171
- title: "Benchmark Truthfulness Score",
172
- subtitle: `False Completion Dataset: ${truthfulness.corpusSize} labeled completion-claim cases.`,
224
+ title: "Completion-enforcement fixtures",
225
+ subtitle: `${truthfulness.corpusSize} hand-authored policy cases (a spec, not a survey): premature claims blocked, valid ones allowed.`,
173
226
  unit: "%",
174
227
  max: 100,
175
228
  rows: [
@@ -183,16 +236,17 @@ writeFileSync(
183
236
  const pct = (n) => `${n.toFixed(1)}%`;
184
237
  console.log("Goal Mode shell-guard benchmark");
185
238
  console.log("================================");
186
- console.log(`Corpus: ${results.corpusSize} commands (${results.destructiveCount} destructive, ${results.safeCount} safe)`);
187
239
  console.log("");
188
- console.log(`Detection rate legacy ${pct(legacyEval.detectionRate)} → Goal Mode ${pct(currentEval.detectionRate)}`);
189
- console.log(`False positives legacy ${pct(legacyEval.falsePositiveRate)} → Goal Mode ${pct(currentEval.falsePositiveRate)}`);
190
- console.log(`Latency legacy ${legacyUs.toFixed(2)} µs/cmd → Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
191
- console.log(`Truthfulness False Completion Dataset score ${truthfulness.score.toFixed(1)}% (${truthfulness.corpusSize} cases)`);
240
+ console.log(`HEADLINE external corpus: ${external.sampleSize} real tldr-pages commands @ ${external.commit.slice(0, 12)}`);
241
+ console.log(` (${external.totals.destructiveFound} destructive [all found] + ${external.totals.safeSampled}/${external.totals.safeFound} safe sampled)`);
242
+ console.log(` Detection legacy ${pct(external.legacy.detectionRate)} → Goal Mode ${pct(external.current.detectionRate)}`);
243
+ console.log(` False positives legacy ${pct(external.legacy.falsePositiveRate)} → Goal Mode ${pct(external.current.falsePositiveRate)}`);
244
+ console.log(` Remaining Goal Mode misses: ${external.current.misses.length} (mostly un-flagged single-target rm — see external.mjs --json)`);
192
245
  console.log("");
193
- console.log("By family (detection rate):");
194
- for (const f of detFamilies) {
195
- console.log(` ${FAMILY_LABELS[f].padEnd(12)} legacy ${pct(familyRate(legacyEval, f)).padStart(6)} Goal Mode ${pct(familyRate(currentEval, f)).padStart(6)}`);
196
- }
246
+ console.log(`Curated regression fixtures: ${results.fixtures.corpusSize} commands (defines patterns to catch; not an unbiased sample)`);
247
+ console.log(` Detection legacy ${pct(legacyEval.detectionRate)} → Goal Mode ${pct(currentEval.detectionRate)} (passes its own spec)`);
248
+ console.log(` False pos legacy ${pct(legacyEval.falsePositiveRate)} Goal Mode ${pct(currentEval.falsePositiveRate)}`);
249
+ console.log(`Completion-enforcement fixtures: ${truthfulness.corpusSize} hand-authored policy cases, all pass (a spec, not a survey)`);
250
+ console.log(`Latency: Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
197
251
  console.log("");
198
- console.log(`Wrote results.json + 4 SVG charts to docs/benchmarks/`);
252
+ console.log(`Wrote results.json + 5 SVG charts to docs/benchmarks/`);
package/commands/goal.md CHANGED
@@ -9,4 +9,19 @@ Start Goal Mode for this request:
9
9
  $ARGUMENTS
10
10
  ```
11
11
 
12
- First create a Goal Contract, ask only essential beginning clarifying questions, delegate discovery/research to subagents, implement in the main agent, verify, run required review cycles, and only finish with `Goal Completed` if all gates pass.
12
+ Run this sequence:
13
+
14
+ 1. **Seed the contract first.** Call the `goal_contract` tool with the original
15
+ request, explicit/inferred requirements, non-goals, and concrete acceptance
16
+ criteria. This activates enforcement, fixes the required specialist review
17
+ gates, and lights up the goal banner in the sidebar. Ask only essential
18
+ clarifying questions before recording it.
19
+ 2. Delegate discovery and research to subagents; implement in the main agent.
20
+ 3. Verify, and record each verification with the `goal_evidence` tool so it maps
21
+ to your acceptance criteria.
22
+ 4. Run the required review cycles. Consult `goal_status` / `goal_evidence_map`
23
+ for the authoritative list of missing or stale gates rather than relying on
24
+ memory.
25
+ 5. Only finish with `Goal Completed` (plus an accurate `Review cycles: N` line)
26
+ once every required gate has a fresh PASS — the guard will rewrite a premature
27
+ claim to `Goal Not Completed`.
@@ -1,7 +1,7 @@
1
1
  <svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
2
2
  <rect width="720" height="380" fill="#ffffff"/>
3
- <text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Destructive-command detection rate by family</text>
4
- <text x="48" y="47" font-size="12" fill="#656d76">Higher is better. Corpus: 48 destructive commands.</text>
3
+ <text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Detection by family curated regression fixtures</text>
4
+ <text x="48" y="47" font-size="12" fill="#656d76">Curated patterns the analyzer is built to catch (not an unbiased sample). 48 destructive fixtures.</text>
5
5
  <line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
6
6
  <text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
7
7
  <line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
@@ -0,0 +1,32 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
2
+ <rect width="720" height="380" fill="#ffffff"/>
3
+ <text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Guard accuracy on real third-party commands</text>
4
+ <text x="48" y="47" font-size="12" fill="#656d76">704 tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.</text>
5
+ <line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
6
+ <text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
7
+ <line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
8
+ <text x="40" y="253.6" font-size="11" text-anchor="end" fill="#656d76">20%</text>
9
+ <line x1="48" y1="203.2" x2="700" y2="203.2" stroke="#eaeef2" stroke-width="1"/>
10
+ <text x="40" y="207.2" font-size="11" text-anchor="end" fill="#656d76">40%</text>
11
+ <line x1="48" y1="156.8" x2="700" y2="156.8" stroke="#eaeef2" stroke-width="1"/>
12
+ <text x="40" y="160.8" font-size="11" text-anchor="end" fill="#656d76">60%</text>
13
+ <line x1="48" y1="110.4" x2="700" y2="110.4" stroke="#eaeef2" stroke-width="1"/>
14
+ <text x="40" y="114.4" font-size="11" text-anchor="end" fill="#656d76">80%</text>
15
+ <line x1="48" y1="64.0" x2="700" y2="64.0" stroke="#eaeef2" stroke-width="1"/>
16
+ <text x="40" y="68.0" font-size="11" text-anchor="end" fill="#656d76">100%</text>
17
+ <rect x="56.0" y="171.1" width="151.0" height="124.9" rx="3" fill="#9aa0a6"/>
18
+ <text x="131.5" y="166.1" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">54%</text>
19
+ <rect x="215.0" y="79.6" width="151.0" height="216.4" rx="3" fill="#2da44e"/>
20
+ <text x="290.5" y="74.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">93%</text>
21
+ <text x="211.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Detection rate</text>
22
+ <rect x="382.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#9aa0a6"/>
23
+ <text x="457.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
24
+ <rect x="541.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#2da44e"/>
25
+ <text x="616.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
26
+ <text x="537.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">False-positive rate</text>
27
+ <line x1="48" y1="296" x2="700" y2="296" stroke="#d0d7de" stroke-width="1.5"/>
28
+ <rect x="48" y="344" width="12" height="12" rx="2" fill="#9aa0a6"/>
29
+ <text x="66" y="354" font-size="12" fill="#1f2328">Legacy regex guard</text>
30
+ <rect x="201.6" y="344" width="12" height="12" rx="2" fill="#2da44e"/>
31
+ <text x="219.6" y="354" font-size="12" fill="#1f2328">Goal Mode analyzer</text>
32
+ </svg>
@@ -4,10 +4,10 @@
4
4
  <text x="20" y="47" font-size="12" fill="#656d76">Microseconds to classify one command. Both are negligible for a tool-call guard.</text>
5
5
  <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Legacy regex guard</text>
6
6
  <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
7
- <rect x="230" y="70" width="179.4" height="22" rx="3" fill="#9aa0a6"/>
8
- <text x="417.4" y="87" font-size="12" font-weight="600" fill="#1f2328">2.24 µs</text>
7
+ <rect x="230" y="70" width="214.5" height="22" rx="3" fill="#9aa0a6"/>
8
+ <text x="452.5" y="87" font-size="12" font-weight="600" fill="#1f2328">0.79 µs</text>
9
9
  <text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Goal Mode analyzer</text>
10
10
  <rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
11
11
  <rect x="230" y="108" width="300.0" height="22" rx="3" fill="#2da44e"/>
12
- <text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.75 µs</text>
12
+ <text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">1.11 µs</text>
13
13
  </svg>
@@ -1,7 +1,7 @@
1
1
  <svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
2
2
  <rect width="720" height="380" fill="#ffffff"/>
3
- <text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Overall guard accuracy</text>
4
- <text x="48" y="47" font-size="12" fill="#656d76">Detection rate (higher better) vs false-positive rate (lower better).</text>
3
+ <text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Curated fixtures — passes its own spec</text>
4
+ <text x="48" y="47" font-size="12" fill="#656d76">Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.</text>
5
5
  <line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
6
6
  <text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
7
7
  <line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
@@ -1,80 +1,121 @@
1
1
  {
2
- "corpusSize": 71,
3
- "destructiveCount": 48,
4
- "safeCount": 23,
5
- "legacy": {
6
- "detectionRate": 20.833333333333336,
7
- "falsePositiveRate": 21.73913043478261,
8
- "destCaught": 10,
9
- "destTotal": 48,
10
- "safeFalsePos": 5,
11
- "safeTotal": 23,
12
- "families": {
13
- "classic": {
14
- "destTotal": 10,
15
- "destCaught": 10,
16
- "safeTotal": 0,
17
- "safeFalsePos": 0
18
- },
19
- "bypass": {
20
- "destTotal": 35,
21
- "destCaught": 0,
22
- "safeTotal": 0,
23
- "safeFalsePos": 0
24
- },
25
- "remote-exec": {
26
- "destTotal": 3,
27
- "destCaught": 0,
28
- "safeTotal": 0,
29
- "safeFalsePos": 0
30
- },
31
- "safe": {
32
- "destTotal": 0,
33
- "destCaught": 0,
34
- "safeTotal": 23,
35
- "safeFalsePos": 5
36
- }
2
+ "external": {
3
+ "source": "tldr-pages",
4
+ "commit": "afc5c5409f0c9a94c66980c40cd8215b5f111021",
5
+ "totals": {
6
+ "uniqueCommandsScanned": 28713,
7
+ "destructiveFound": 104,
8
+ "safeFound": 28609,
9
+ "safeSampled": 600
10
+ },
11
+ "sampleSize": 704,
12
+ "legacy": {
13
+ "detectionRate": 53.8,
14
+ "falsePositiveRate": 0.2,
15
+ "destCaught": 56,
16
+ "destTotal": 104,
17
+ "safeFalsePos": 1,
18
+ "safeTotal": 600
37
19
  },
38
- "opsPerSec": 445671,
39
- "usPerCommand": 2.24
20
+ "current": {
21
+ "detectionRate": 93.3,
22
+ "falsePositiveRate": 0.2,
23
+ "destCaught": 97,
24
+ "destTotal": 104,
25
+ "safeFalsePos": 1,
26
+ "safeTotal": 600,
27
+ "misses": [
28
+ "rm -d path/to/directory",
29
+ "rm -i path/to/file1 path/to/file2 ...",
30
+ "rm -v $HOME/.cache/fuzzel",
31
+ "rm -v path/to/file1 path/to/file2 ...",
32
+ "rm /dev/sdXN",
33
+ "rm path/to/file1 path/to/file2 ...",
34
+ "rm torrent_id"
35
+ ],
36
+ "falsePositives": [
37
+ "git filter-repo --path-rename path/to/folder/:"
38
+ ]
39
+ }
40
40
  },
41
- "current": {
42
- "detectionRate": 100,
43
- "falsePositiveRate": 0,
44
- "destCaught": 48,
45
- "destTotal": 48,
46
- "safeFalsePos": 0,
47
- "safeTotal": 23,
48
- "families": {
49
- "classic": {
50
- "destTotal": 10,
51
- "destCaught": 10,
52
- "safeTotal": 0,
53
- "safeFalsePos": 0
41
+ "fixtures": {
42
+ "corpusSize": 71,
43
+ "destructiveCount": 48,
44
+ "safeCount": 23,
45
+ "legacy": {
46
+ "detectionRate": 20.833333333333336,
47
+ "falsePositiveRate": 21.73913043478261,
48
+ "destCaught": 10,
49
+ "destTotal": 48,
50
+ "safeFalsePos": 5,
51
+ "safeTotal": 23,
52
+ "families": {
53
+ "classic": {
54
+ "destTotal": 10,
55
+ "destCaught": 10,
56
+ "safeTotal": 0,
57
+ "safeFalsePos": 0
58
+ },
59
+ "bypass": {
60
+ "destTotal": 35,
61
+ "destCaught": 0,
62
+ "safeTotal": 0,
63
+ "safeFalsePos": 0
64
+ },
65
+ "remote-exec": {
66
+ "destTotal": 3,
67
+ "destCaught": 0,
68
+ "safeTotal": 0,
69
+ "safeFalsePos": 0
70
+ },
71
+ "safe": {
72
+ "destTotal": 0,
73
+ "destCaught": 0,
74
+ "safeTotal": 23,
75
+ "safeFalsePos": 5
76
+ }
54
77
  },
55
- "bypass": {
56
- "destTotal": 35,
57
- "destCaught": 35,
58
- "safeTotal": 0,
59
- "safeFalsePos": 0
60
- },
61
- "remote-exec": {
62
- "destTotal": 3,
63
- "destCaught": 3,
64
- "safeTotal": 0,
65
- "safeFalsePos": 0
66
- },
67
- "safe": {
68
- "destTotal": 0,
69
- "destCaught": 0,
70
- "safeTotal": 23,
71
- "safeFalsePos": 0
72
- }
78
+ "opsPerSec": 1260371,
79
+ "usPerCommand": 0.79
73
80
  },
74
- "opsPerSec": 266556,
75
- "usPerCommand": 3.75
81
+ "current": {
82
+ "detectionRate": 100,
83
+ "falsePositiveRate": 0,
84
+ "destCaught": 48,
85
+ "destTotal": 48,
86
+ "safeFalsePos": 0,
87
+ "safeTotal": 23,
88
+ "families": {
89
+ "classic": {
90
+ "destTotal": 10,
91
+ "destCaught": 10,
92
+ "safeTotal": 0,
93
+ "safeFalsePos": 0
94
+ },
95
+ "bypass": {
96
+ "destTotal": 35,
97
+ "destCaught": 35,
98
+ "safeTotal": 0,
99
+ "safeFalsePos": 0
100
+ },
101
+ "remote-exec": {
102
+ "destTotal": 3,
103
+ "destCaught": 3,
104
+ "safeTotal": 0,
105
+ "safeFalsePos": 0
106
+ },
107
+ "safe": {
108
+ "destTotal": 0,
109
+ "destCaught": 0,
110
+ "safeTotal": 23,
111
+ "safeFalsePos": 0
112
+ }
113
+ },
114
+ "opsPerSec": 901050,
115
+ "usPerCommand": 1.11
116
+ }
76
117
  },
77
- "truthfulness": {
118
+ "completionFixtures": {
78
119
  "name": "False Completion Dataset",
79
120
  "corpusSize": 9,
80
121
  "requiredBaseGates": [
@@ -1,7 +1,7 @@
1
1
  <svg xmlns="http://www.w3.org/2000/svg" width="720" height="202" viewBox="0 0 720 202" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
2
2
  <rect width="720" height="202" fill="#ffffff"/>
3
- <text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Benchmark Truthfulness Score</text>
4
- <text x="20" y="47" font-size="12" fill="#656d76">False Completion Dataset: 9 labeled completion-claim cases.</text>
3
+ <text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Completion-enforcement fixtures</text>
4
+ <text x="20" y="47" font-size="12" fill="#656d76">9 hand-authored policy cases (a spec, not a survey): premature claims blocked, valid ones allowed.</text>
5
5
  <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Truthfulness score</text>
6
6
  <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
7
7
  <rect x="230" y="70" width="420.0" height="22" rx="3" fill="#2da44e"/>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-goal-mode",
3
- "version": "0.2.4",
3
+ "version": "0.3.1",
4
4
  "description": "Strict Goal Mode agents, commands, and guard plugin for OpenCode.",
5
5
  "type": "module",
6
6
  "engines": {
@@ -31,7 +31,10 @@
31
31
  "test:unit": "node --test tests/state.test.mjs tests/gates.test.mjs tests/verdicts.test.mjs tests/config.test.mjs tests/persistence.test.mjs",
32
32
  "test:agents": "node --test tests/agents.test.mjs tests/commands.test.mjs",
33
33
  "test:install": "node --test tests/install.test.mjs",
34
+ "test:visual": "bun tools/visual-test/sidebar-visual.jsx",
34
35
  "bench": "node benchmarks/run.mjs",
36
+ "bench:external": "node benchmarks/external.mjs",
37
+ "bench:corpus": "node benchmarks/build-external-corpus.mjs",
35
38
  "bench:truthfulness": "node benchmarks/truthfulness.mjs",
36
39
  "bench:compare": "node benchmarks/comparison.mjs",
37
40
  "pack:check": "npm pack --dry-run",
@@ -26,6 +26,14 @@ export const DEFAULT_CONFIG = Object.freeze({
26
26
  sessionTtlMs: 24 * 60 * 60 * 1000,
27
27
  /** Emit a TUI toast when completion is blocked. */
28
28
  toastOnBlock: true,
29
+ /** Emit a TUI toast when a review gate records a PASS/FAIL, and when completion unlocks. */
30
+ toastOnReview: true,
31
+ /** Show the experimental yellow goal banner in the TUI sidebar (TUI-plugin-capable OpenCode only). */
32
+ sidebarBanner: true,
33
+ /** Foreground colour (hex) for the sidebar goal banner. */
34
+ sidebarColor: "#FFD700",
35
+ /** Foreground colour (hex) for the muted "No goal" sidebar line. */
36
+ sidebarMutedColor: "#808080",
29
37
  /** Phrase that, at the start of an assistant message, claims completion. */
30
38
  completionMarker: "Goal Completed",
31
39
  /** Replacement marker when completion is blocked. */
@@ -59,6 +67,10 @@ function fromEnv(env) {
59
67
  GOAL_GUARD_MAX_SESSIONS: ["maxSessions", coerceInt],
60
68
  GOAL_GUARD_SESSION_TTL_MS: ["sessionTtlMs", coerceInt],
61
69
  GOAL_GUARD_TOAST_ON_BLOCK: ["toastOnBlock", coerceBool],
70
+ GOAL_GUARD_TOAST_ON_REVIEW: ["toastOnReview", coerceBool],
71
+ GOAL_GUARD_SIDEBAR_BANNER: ["sidebarBanner", coerceBool],
72
+ GOAL_GUARD_SIDEBAR_COLOR: ["sidebarColor", (v) => (v == null ? undefined : String(v))],
73
+ GOAL_GUARD_SIDEBAR_MUTED_COLOR: ["sidebarMutedColor", (v) => (v == null ? undefined : String(v))],
62
74
  };
63
75
  for (const [key, [field, coerce]] of Object.entries(map)) {
64
76
  if (env[key] !== undefined) out[field] = coerce(env[key], DEFAULT_CONFIG[field]);
@@ -415,7 +415,7 @@ const DIRECT_TEST_BINS = new Set(["jest", "mocha", "vitest", "ava", "tap", "tape
415
415
  const FORMATTERS = new Set(["prettier", "eslint", "black", "ruff", "gofmt", "goimports", "rustfmt", "clang-format", "autopep8", "isort", "standard", "biome", "dprint", "yapf", "stylelint"]);
416
416
 
417
417
  const MUTATING_BINS = new Set(["mkdir", "rmdir", "touch", "ln", "mv", "cp", "tee", "install", "patch", "rsync", "rename", "chmod", "chown", "chgrp", "git-apply"]);
418
- const DESTRUCTIVE_BINS = new Set(["shred", "mkfs", "fdisk", "parted", "wipefs", "sgdisk", "blkdiscard", "unlink"]);
418
+ const DESTRUCTIVE_BINS = new Set(["shred", "srm", "mkfs", "mkswap", "fdisk", "parted", "wipefs", "sgdisk", "blkdiscard", "unlink"]);
419
419
 
420
420
  /**
421
421
  * Classify a single already-split simple command (array of words).
@@ -603,8 +603,9 @@ function classifyCommand(words, redirects, depth, acc, pipelineCmds, indexInPipe
603
603
  return;
604
604
  }
605
605
 
606
- // Destructive disk/file utilities.
607
- if (DESTRUCTIVE_BINS.has(bin)) {
606
+ // Destructive disk/file utilities. `mkfs.<fstype>` (mkfs.ext4, mkfs.erofs, …)
607
+ // is the same irreversible filesystem-format operation as bare `mkfs`.
608
+ if (DESTRUCTIVE_BINS.has(bin) || /^mkfs\./.test(bin)) {
608
609
  acc.destructive = true;
609
610
  acc.reasons.push(bin);
610
611
  return;