npm - opencode-goal-mode - Versions diffs - 0.2.4 → 0.3.1 - Mend

opencode-goal-mode 0.2.4 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/ARCHITECTURE.md +39 -0
package/CHANGELOG.md +31 -0
package/README.md +67 -24
package/benchmarks/build-external-corpus.mjs +177 -0
package/benchmarks/external-corpus.json +3540 -0
package/benchmarks/external.mjs +110 -0
package/benchmarks/run.mjs +78 -24
package/commands/goal.md +16 -1
package/docs/benchmarks/detection-by-family.svg +2 -2
package/docs/benchmarks/external-scorecard.svg +32 -0
package/docs/benchmarks/latency.svg +3 -3
package/docs/benchmarks/overall-scorecard.svg +2 -2
package/docs/benchmarks/results.json +112 -71
package/docs/benchmarks/truthfulness-score.svg +2 -2
package/package.json +4 -1
package/plugins/goal-guard/config.js +12 -0
package/plugins/goal-guard/shell.js +4 -3
package/plugins/goal-guard/sidebar-data.js +73 -0
package/plugins/goal-guard/summary.js +38 -0
package/plugins/goal-guard/tools.js +8 -2
package/plugins/goal-guard.js +13 -0
package/plugins/goal-sidebar.js +144 -0
package/research/benchmarks.md +75 -69

package/benchmarks/external.mjs ADDED Viewed

@@ -0,0 +1,110 @@
+#!/usr/bin/env node
+/**
+ * Run the shell guard against the EXTERNAL, third-party-authored corpus built by
+ * build-external-corpus.mjs (real tldr-pages commands). This is the honest
+ * benchmark: the analyzer authors did not write or curate these commands, so the
+ * detection / false-positive numbers reflect real-world behavior, warts and all.
+ *
+ * It deliberately also reports DISAGREEMENTS between the analyzer and the
+ * independent ground-truth labeler, so misses and false positives are auditable
+ * rather than averaged away.
+ *
+ *   node benchmarks/external.mjs            # summary
+ *   node benchmarks/external.mjs --json     # full machine-readable result
+ */
+import { readFileSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import * as current from "../plugins/goal-guard/shell.js";
+import * as legacy from "./legacy-analyzer.mjs";
+const here = dirname(fileURLToPath(import.meta.url));
+export function loadExternalCorpus() {
+  return JSON.parse(readFileSync(join(here, "external-corpus.json"), "utf8"));
+}
+function blocked(analyzer, cmd) {
+  const a = analyzer.analyzeCommand(cmd);
+  return Boolean(a.destructive || a.networkExec);
+}
+/** Evaluate one analyzer over labeled entries (each {cmd, page, destructive}). */
+function score(analyzer, labeled) {
+  let destTotal = 0;
+  let destCaught = 0;
+  let safeTotal = 0;
+  let safeFalsePos = 0;
+  const misses = [];
+  const falsePositives = [];
+  for (const e of labeled) {
+    const isBlocked = blocked(analyzer, e.cmd);
+    if (e.destructive) {
+      destTotal += 1;
+      if (isBlocked) destCaught += 1;
+      else misses.push({ cmd: e.cmd, page: e.page });
+    } else {
+      safeTotal += 1;
+      if (isBlocked) {
+        safeFalsePos += 1;
+        falsePositives.push({ cmd: e.cmd, page: e.page });
+      }
+    }
+  }
+  return {
+    detectionRate: destTotal ? (destCaught / destTotal) * 100 : 0,
+    falsePositiveRate: safeTotal ? (safeFalsePos / safeTotal) * 100 : 0,
+    destCaught,
+    destTotal,
+    safeFalsePos,
+    safeTotal,
+    misses,
+    falsePositives,
+  };
+}
+export function runExternalBenchmark() {
+  const corpus = loadExternalCorpus();
+  // The corpus is written destructive-first then safe (see build-external-corpus.mjs),
+  // so the recorded count is the label boundary — no re-running the labeler needed.
+  const labeled = corpus.entries.map((e, i) => ({ ...e, destructive: i < corpus.totals.destructiveFound }));
+  return {
+    source: corpus.source,
+    commit: corpus.commit,
+    totals: corpus.totals,
+    sampleSize: labeled.length,
+    legacy: score(legacy, labeled),
+    current: score(current, labeled),
+  };
+}
+function pct(n) {
+  return `${n.toFixed(1)}%`;
+}
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  const r = runExternalBenchmark();
+  if (process.argv.includes("--json")) {
+    console.log(JSON.stringify(r, null, 2));
+  } else {
+    console.log("External shell-guard benchmark (third-party tldr-pages commands)");
+    console.log("================================================================");
+    console.log(`Source: ${r.source} @ ${r.commit.slice(0, 12)}`);
+    console.log(
+      `Sample: ${r.sampleSize} commands ` +
+        `(${r.totals.destructiveFound} destructive [all found], ` +
+        `${r.totals.safeSampled}/${r.totals.safeFound} safe sampled)`,
+    );
+    console.log("");
+    console.log(`Detection (destructive caught)   legacy ${pct(r.legacy.detectionRate)}   →   current ${pct(r.current.detectionRate)}`);
+    console.log(`False positives on safe commands legacy ${pct(r.legacy.falsePositiveRate)}   →   current ${pct(r.current.falsePositiveRate)}`);
+    console.log("");
+    console.log(`Current analyzer misses (${r.current.misses.length}):`);
+    for (const m of r.current.misses.slice(0, 20)) console.log(`  - ${m.cmd}   [${m.page}]`);
+    if (r.current.misses.length > 20) console.log(`  … ${r.current.misses.length - 20} more`);
+    console.log(`Current analyzer false positives (${r.current.falsePositives.length}):`);
+    for (const f of r.current.falsePositives.slice(0, 20)) console.log(`  - ${f.cmd}   [${f.page}]`);
+    if (r.current.falsePositives.length > 20) console.log(`  … ${r.current.falsePositives.length - 20} more`);
+  }
+}

package/benchmarks/run.mjs CHANGED Viewed

@@ -20,6 +20,7 @@ import * as current from "../plugins/goal-guard/shell.js";
 import * as legacy from "./legacy-analyzer.mjs";
 import { groupedBarChart, horizontalBarChart } from "./charts.mjs";
 import { runTruthfulnessBenchmark } from "./truthfulness.mjs";
+import { runExternalBenchmark } from "./external.mjs";
 const root = fileURLToPath(new URL("..", import.meta.url));
 const outDir = join(root, "docs", "benchmarks");
@@ -92,6 +93,7 @@ function fmt(n) {
 const legacyEval = evaluate(legacy);
 const currentEval = evaluate(current);
+const external = runExternalBenchmark();
 const truthfulness = runTruthfulnessBenchmark();
 const legacyOps = throughput(legacy);
 const currentOps = throughput(current);
@@ -110,23 +112,74 @@ function familyRate(ev, fam) {
   return f && f.destTotal ? (f.destCaught / f.destTotal) * 100 : 0;
 }
+// Trim the per-command miss/false-positive lists to keep results.json readable;
+// the full lists are always available via `node benchmarks/external.mjs --json`.
+const externalSummary = {
+  source: external.source,
+  commit: external.commit,
+  totals: external.totals,
+  sampleSize: external.sampleSize,
+  legacy: {
+    detectionRate: Number(external.legacy.detectionRate.toFixed(1)),
+    falsePositiveRate: Number(external.legacy.falsePositiveRate.toFixed(1)),
+    destCaught: external.legacy.destCaught,
+    destTotal: external.legacy.destTotal,
+    safeFalsePos: external.legacy.safeFalsePos,
+    safeTotal: external.legacy.safeTotal,
+  },
+  current: {
+    detectionRate: Number(external.current.detectionRate.toFixed(1)),
+    falsePositiveRate: Number(external.current.falsePositiveRate.toFixed(1)),
+    destCaught: external.current.destCaught,
+    destTotal: external.current.destTotal,
+    safeFalsePos: external.current.safeFalsePos,
+    safeTotal: external.current.safeTotal,
+    misses: external.current.misses.map((m) => m.cmd),
+    falsePositives: external.current.falsePositives.map((f) => f.cmd),
+  },
+};
 const results = {
-  corpusSize: CORPUS.length,
-  destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
-  safeCount: CORPUS.filter((c) => c.label === "safe").length,
-  legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
-  current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
-  truthfulness,
+  // The honest, third-party benchmark: real commands the analyzer was never
+  // fitted to. This is the headline number.
+  external: externalSummary,
+  // Curated REGRESSION FIXTURES: a hand-authored set of known destructive
+  // patterns and their safe look-alikes. These define the patterns the analyzer
+  // is built to catch and guard against regressions — they are NOT an unbiased
+  // sample, so the 100%/0% here is "passes its own spec", not measured accuracy.
+  fixtures: {
+    corpusSize: CORPUS.length,
+    destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
+    safeCount: CORPUS.filter((c) => c.label === "safe").length,
+    legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
+    current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
+  },
+  // Completion-enforcement fixtures (hand-authored policy cases), not a survey.
+  completionFixtures: truthfulness,
 };
 writeFileSync(join(outDir, "results.json"), JSON.stringify(results, null, 2));
-// Chart 1: detection rate by command family.
+// Headline chart: detection + false positives on the EXTERNAL third-party corpus.
+writeFileSync(
+  join(outDir, "external-scorecard.svg"),
+  groupedBarChart({
+    title: "Guard accuracy on real third-party commands",
+    subtitle: `${external.sampleSize} tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.`,
+    groups: ["Detection rate", "False-positive rate"],
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: [external.legacy.detectionRate, external.legacy.falsePositiveRate] },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: [external.current.detectionRate, external.current.falsePositiveRate] },
+    ],
+  }),
+);
+// Chart 1: detection rate by command family (CURATED regression fixtures).
 writeFileSync(
   join(outDir, "detection-by-family.svg"),
   groupedBarChart({
-    title: "Destructive-command detection rate by family",
-    subtitle: `Higher is better. Corpus: ${results.destructiveCount} destructive commands.`,
+    title: "Detection by family — curated regression fixtures",
+    subtitle: `Curated patterns the analyzer is built to catch (not an unbiased sample). ${results.fixtures.destructiveCount} destructive fixtures.`,
     groups: detFamilies.map((f) => FAMILY_LABELS[f]),
     series: [
       { name: "Legacy regex guard", color: "#9aa0a6", values: detFamilies.map((f) => familyRate(legacyEval, f)) },
@@ -135,12 +188,12 @@ writeFileSync(
   }),
 );
-// Chart 2: overall scorecard (detection up, false positives down).
+// Chart 2: overall scorecard on the CURATED fixtures (passes its own spec).
 writeFileSync(
   join(outDir, "overall-scorecard.svg"),
   groupedBarChart({
-    title: "Overall guard accuracy",
-    subtitle: "Detection rate (higher better) vs false-positive rate (lower better).",
+    title: "Curated fixtures — passes its own spec",
+    subtitle: "Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.",
     groups: ["Detection rate", "False-positive rate"],
     series: [
       { name: "Legacy regex guard", color: "#9aa0a6", values: [legacyEval.detectionRate, legacyEval.falsePositiveRate] },
@@ -168,8 +221,8 @@ writeFileSync(
 writeFileSync(
   join(outDir, "truthfulness-score.svg"),
   horizontalBarChart({
-    title: "Benchmark Truthfulness Score",
-    subtitle: `False Completion Dataset: ${truthfulness.corpusSize} labeled completion-claim cases.`,
+    title: "Completion-enforcement fixtures",
+    subtitle: `${truthfulness.corpusSize} hand-authored policy cases (a spec, not a survey): premature claims blocked, valid ones allowed.`,
     unit: "%",
     max: 100,
     rows: [
@@ -183,16 +236,17 @@ writeFileSync(
 const pct = (n) => `${n.toFixed(1)}%`;
 console.log("Goal Mode shell-guard benchmark");
 console.log("================================");
-console.log(`Corpus: ${results.corpusSize} commands (${results.destructiveCount} destructive, ${results.safeCount} safe)`);
 console.log("");
-console.log(`Detection rate   legacy ${pct(legacyEval.detectionRate)}   →   Goal Mode ${pct(currentEval.detectionRate)}`);
-console.log(`False positives  legacy ${pct(legacyEval.falsePositiveRate)}   →   Goal Mode ${pct(currentEval.falsePositiveRate)}`);
-console.log(`Latency          legacy ${legacyUs.toFixed(2)} µs/cmd   →   Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
-console.log(`Truthfulness    False Completion Dataset score ${truthfulness.score.toFixed(1)}% (${truthfulness.corpusSize} cases)`);
+console.log(`HEADLINE — external corpus: ${external.sampleSize} real tldr-pages commands @ ${external.commit.slice(0, 12)}`);
+console.log(`  (${external.totals.destructiveFound} destructive [all found] + ${external.totals.safeSampled}/${external.totals.safeFound} safe sampled)`);
+console.log(`  Detection       legacy ${pct(external.legacy.detectionRate)}   →   Goal Mode ${pct(external.current.detectionRate)}`);
+console.log(`  False positives legacy ${pct(external.legacy.falsePositiveRate)}   →   Goal Mode ${pct(external.current.falsePositiveRate)}`);
+console.log(`  Remaining Goal Mode misses: ${external.current.misses.length} (mostly un-flagged single-target rm — see external.mjs --json)`);
 console.log("");
-console.log("By family (detection rate):");
-for (const f of detFamilies) {
-  console.log(`  ${FAMILY_LABELS[f].padEnd(12)} legacy ${pct(familyRate(legacyEval, f)).padStart(6)}  →  Goal Mode ${pct(familyRate(currentEval, f)).padStart(6)}`);
-}
+console.log(`Curated regression fixtures: ${results.fixtures.corpusSize} commands (defines patterns to catch; not an unbiased sample)`);
+console.log(`  Detection   legacy ${pct(legacyEval.detectionRate)}   →   Goal Mode ${pct(currentEval.detectionRate)}   (passes its own spec)`);
+console.log(`  False pos   legacy ${pct(legacyEval.falsePositiveRate)}   →   Goal Mode ${pct(currentEval.falsePositiveRate)}`);
+console.log(`Completion-enforcement fixtures: ${truthfulness.corpusSize} hand-authored policy cases, all pass (a spec, not a survey)`);
+console.log(`Latency: Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
 console.log("");
-console.log(`Wrote results.json + 4 SVG charts to docs/benchmarks/`);
+console.log(`Wrote results.json + 5 SVG charts to docs/benchmarks/`);

package/commands/goal.md CHANGED Viewed

@@ -9,4 +9,19 @@ Start Goal Mode for this request:
 $ARGUMENTS
 ```
-First create a Goal Contract, ask only essential beginning clarifying questions, delegate discovery/research to subagents, implement in the main agent, verify, run required review cycles, and only finish with `Goal Completed` if all gates pass.
+Run this sequence:
+1. **Seed the contract first.** Call the `goal_contract` tool with the original
+   request, explicit/inferred requirements, non-goals, and concrete acceptance
+   criteria. This activates enforcement, fixes the required specialist review
+   gates, and lights up the goal banner in the sidebar. Ask only essential
+   clarifying questions before recording it.
+2. Delegate discovery and research to subagents; implement in the main agent.
+3. Verify, and record each verification with the `goal_evidence` tool so it maps
+   to your acceptance criteria.
+4. Run the required review cycles. Consult `goal_status` / `goal_evidence_map`
+   for the authoritative list of missing or stale gates rather than relying on
+   memory.
+5. Only finish with `Goal Completed` (plus an accurate `Review cycles: N` line)
+   once every required gate has a fresh PASS — the guard will rewrite a premature
+   claim to `Goal Not Completed`.

package/docs/benchmarks/detection-by-family.svg CHANGED Viewed

@@ -1,7 +1,7 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
 <rect width="720" height="380" fill="#ffffff"/>
-<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Destructive-command detection rate by family</text>
-<text x="48" y="47" font-size="12" fill="#656d76">Higher is better. Corpus: 48 destructive commands.</text>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Detection by family — curated regression fixtures</text>
+<text x="48" y="47" font-size="12" fill="#656d76">Curated patterns the analyzer is built to catch (not an unbiased sample). 48 destructive fixtures.</text>
 <line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
 <text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
 <line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>

package/docs/benchmarks/external-scorecard.svg ADDED Viewed

@@ -0,0 +1,32 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="720" height="380" fill="#ffffff"/>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Guard accuracy on real third-party commands</text>
+<text x="48" y="47" font-size="12" fill="#656d76">704 tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.</text>
+<line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
+<line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="253.6" font-size="11" text-anchor="end" fill="#656d76">20%</text>
+<line x1="48" y1="203.2" x2="700" y2="203.2" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="207.2" font-size="11" text-anchor="end" fill="#656d76">40%</text>
+<line x1="48" y1="156.8" x2="700" y2="156.8" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="160.8" font-size="11" text-anchor="end" fill="#656d76">60%</text>
+<line x1="48" y1="110.4" x2="700" y2="110.4" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="114.4" font-size="11" text-anchor="end" fill="#656d76">80%</text>
+<line x1="48" y1="64.0" x2="700" y2="64.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="68.0" font-size="11" text-anchor="end" fill="#656d76">100%</text>
+<rect x="56.0" y="171.1" width="151.0" height="124.9" rx="3" fill="#9aa0a6"/>
+<text x="131.5" y="166.1" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">54%</text>
+<rect x="215.0" y="79.6" width="151.0" height="216.4" rx="3" fill="#2da44e"/>
+<text x="290.5" y="74.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">93%</text>
+<text x="211.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Detection rate</text>
+<rect x="382.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#9aa0a6"/>
+<text x="457.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
+<rect x="541.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#2da44e"/>
+<text x="616.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
+<text x="537.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">False-positive rate</text>
+<line x1="48" y1="296" x2="700" y2="296" stroke="#d0d7de" stroke-width="1.5"/>
+<rect x="48" y="344" width="12" height="12" rx="2" fill="#9aa0a6"/>
+<text x="66" y="354" font-size="12" fill="#1f2328">Legacy regex guard</text>
+<rect x="201.6" y="344" width="12" height="12" rx="2" fill="#2da44e"/>
+<text x="219.6" y="354" font-size="12" fill="#1f2328">Goal Mode analyzer</text>
+</svg>

package/docs/benchmarks/latency.svg CHANGED Viewed

@@ -4,10 +4,10 @@
 <text x="20" y="47" font-size="12" fill="#656d76">Microseconds to classify one command. Both are negligible for a tool-call guard.</text>
 <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Legacy regex guard</text>
 <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
-<rect x="230" y="70" width="179.4" height="22" rx="3" fill="#9aa0a6"/>
-<text x="417.4" y="87" font-size="12" font-weight="600" fill="#1f2328">2.24 µs</text>
+<rect x="230" y="70" width="214.5" height="22" rx="3" fill="#9aa0a6"/>
+<text x="452.5" y="87" font-size="12" font-weight="600" fill="#1f2328">0.79 µs</text>
 <text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Goal Mode analyzer</text>
 <rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
 <rect x="230" y="108" width="300.0" height="22" rx="3" fill="#2da44e"/>
-<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.75 µs</text>
+<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">1.11 µs</text>
 </svg>

package/docs/benchmarks/overall-scorecard.svg CHANGED Viewed

@@ -1,7 +1,7 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
 <rect width="720" height="380" fill="#ffffff"/>
-<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Overall guard accuracy</text>
-<text x="48" y="47" font-size="12" fill="#656d76">Detection rate (higher better) vs false-positive rate (lower better).</text>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Curated fixtures — passes its own spec</text>
+<text x="48" y="47" font-size="12" fill="#656d76">Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.</text>
 <line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
 <text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
 <line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>

package/docs/benchmarks/results.json CHANGED Viewed

@@ -1,80 +1,121 @@
 {
-  "corpusSize": 71,
-  "destructiveCount": 48,
-  "safeCount": 23,
-  "legacy": {
-    "detectionRate": 20.833333333333336,
-    "falsePositiveRate": 21.73913043478261,
-    "destCaught": 10,
-    "destTotal": 48,
-    "safeFalsePos": 5,
-    "safeTotal": 23,
-    "families": {
-      "classic": {
-        "destTotal": 10,
-        "destCaught": 10,
-        "safeTotal": 0,
-        "safeFalsePos": 0
-      },
-      "bypass": {
-        "destTotal": 35,
-        "destCaught": 0,
-        "safeTotal": 0,
-        "safeFalsePos": 0
-      },
-      "remote-exec": {
-        "destTotal": 3,
-        "destCaught": 0,
-        "safeTotal": 0,
-        "safeFalsePos": 0
-      },
-      "safe": {
-        "destTotal": 0,
-        "destCaught": 0,
-        "safeTotal": 23,
-        "safeFalsePos": 5
-      }
+  "external": {
+    "source": "tldr-pages",
+    "commit": "afc5c5409f0c9a94c66980c40cd8215b5f111021",
+    "totals": {
+      "uniqueCommandsScanned": 28713,
+      "destructiveFound": 104,
+      "safeFound": 28609,
+      "safeSampled": 600
+    },
+    "sampleSize": 704,
+    "legacy": {
+      "detectionRate": 53.8,
+      "falsePositiveRate": 0.2,
+      "destCaught": 56,
+      "destTotal": 104,
+      "safeFalsePos": 1,
+      "safeTotal": 600
     },
-    "opsPerSec": 445671,
-    "usPerCommand": 2.24
+    "current": {
+      "detectionRate": 93.3,
+      "falsePositiveRate": 0.2,
+      "destCaught": 97,
+      "destTotal": 104,
+      "safeFalsePos": 1,
+      "safeTotal": 600,
+      "misses": [
+        "rm -d path/to/directory",
+        "rm -i path/to/file1 path/to/file2 ...",
+        "rm -v $HOME/.cache/fuzzel",
+        "rm -v path/to/file1 path/to/file2 ...",
+        "rm /dev/sdXN",
+        "rm path/to/file1 path/to/file2 ...",
+        "rm torrent_id"
+      ],
+      "falsePositives": [
+        "git filter-repo --path-rename path/to/folder/:"
+      ]
+    }
   },
-  "current": {
-    "detectionRate": 100,
-    "falsePositiveRate": 0,
-    "destCaught": 48,
-    "destTotal": 48,
-    "safeFalsePos": 0,
-    "safeTotal": 23,
-    "families": {
-      "classic": {
-        "destTotal": 10,
-        "destCaught": 10,
-        "safeTotal": 0,
-        "safeFalsePos": 0
+  "fixtures": {
+    "corpusSize": 71,
+    "destructiveCount": 48,
+    "safeCount": 23,
+    "legacy": {
+      "detectionRate": 20.833333333333336,
+      "falsePositiveRate": 21.73913043478261,
+      "destCaught": 10,
+      "destTotal": 48,
+      "safeFalsePos": 5,
+      "safeTotal": 23,
+      "families": {
+        "classic": {
+          "destTotal": 10,
+          "destCaught": 10,
+          "safeTotal": 0,
+          "safeFalsePos": 0
+        },
+        "bypass": {
+          "destTotal": 35,
+          "destCaught": 0,
+          "safeTotal": 0,
+          "safeFalsePos": 0
+        },
+        "remote-exec": {
+          "destTotal": 3,
+          "destCaught": 0,
+          "safeTotal": 0,
+          "safeFalsePos": 0
+        },
+        "safe": {
+          "destTotal": 0,
+          "destCaught": 0,
+          "safeTotal": 23,
+          "safeFalsePos": 5
+        }
       },
-      "bypass": {
-        "destTotal": 35,
-        "destCaught": 35,
-        "safeTotal": 0,
-        "safeFalsePos": 0
-      },
-      "remote-exec": {
-        "destTotal": 3,
-        "destCaught": 3,
-        "safeTotal": 0,
-        "safeFalsePos": 0
-      },
-      "safe": {
-        "destTotal": 0,
-        "destCaught": 0,
-        "safeTotal": 23,
-        "safeFalsePos": 0
-      }
+      "opsPerSec": 1260371,
+      "usPerCommand": 0.79
     },
-    "opsPerSec": 266556,
-    "usPerCommand": 3.75
+    "current": {
+      "detectionRate": 100,
+      "falsePositiveRate": 0,
+      "destCaught": 48,
+      "destTotal": 48,
+      "safeFalsePos": 0,
+      "safeTotal": 23,
+      "families": {
+        "classic": {
+          "destTotal": 10,
+          "destCaught": 10,
+          "safeTotal": 0,
+          "safeFalsePos": 0
+        },
+        "bypass": {
+          "destTotal": 35,
+          "destCaught": 35,
+          "safeTotal": 0,
+          "safeFalsePos": 0
+        },
+        "remote-exec": {
+          "destTotal": 3,
+          "destCaught": 3,
+          "safeTotal": 0,
+          "safeFalsePos": 0
+        },
+        "safe": {
+          "destTotal": 0,
+          "destCaught": 0,
+          "safeTotal": 23,
+          "safeFalsePos": 0
+        }
+      },
+      "opsPerSec": 901050,
+      "usPerCommand": 1.11
+    }
   },
-  "truthfulness": {
+  "completionFixtures": {
     "name": "False Completion Dataset",
     "corpusSize": 9,
     "requiredBaseGates": [

package/docs/benchmarks/truthfulness-score.svg CHANGED Viewed

@@ -1,7 +1,7 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="720" height="202" viewBox="0 0 720 202" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
 <rect width="720" height="202" fill="#ffffff"/>
-<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Benchmark Truthfulness Score</text>
-<text x="20" y="47" font-size="12" fill="#656d76">False Completion Dataset: 9 labeled completion-claim cases.</text>
+<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Completion-enforcement fixtures</text>
+<text x="20" y="47" font-size="12" fill="#656d76">9 hand-authored policy cases (a spec, not a survey): premature claims blocked, valid ones allowed.</text>
 <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Truthfulness score</text>
 <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
 <rect x="230" y="70" width="420.0" height="22" rx="3" fill="#2da44e"/>

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "opencode-goal-mode",
-  "version": "0.2.4",
+  "version": "0.3.1",
   "description": "Strict Goal Mode agents, commands, and guard plugin for OpenCode.",
   "type": "module",
   "engines": {
@@ -31,7 +31,10 @@
     "test:unit": "node --test tests/state.test.mjs tests/gates.test.mjs tests/verdicts.test.mjs tests/config.test.mjs tests/persistence.test.mjs",
     "test:agents": "node --test tests/agents.test.mjs tests/commands.test.mjs",
     "test:install": "node --test tests/install.test.mjs",
+    "test:visual": "bun tools/visual-test/sidebar-visual.jsx",
     "bench": "node benchmarks/run.mjs",
+    "bench:external": "node benchmarks/external.mjs",
+    "bench:corpus": "node benchmarks/build-external-corpus.mjs",
     "bench:truthfulness": "node benchmarks/truthfulness.mjs",
     "bench:compare": "node benchmarks/comparison.mjs",
     "pack:check": "npm pack --dry-run",

package/plugins/goal-guard/config.js CHANGED Viewed

@@ -26,6 +26,14 @@ export const DEFAULT_CONFIG = Object.freeze({
   sessionTtlMs: 24 * 60 * 60 * 1000,
   /** Emit a TUI toast when completion is blocked. */
   toastOnBlock: true,
+  /** Emit a TUI toast when a review gate records a PASS/FAIL, and when completion unlocks. */
+  toastOnReview: true,
+  /** Show the experimental yellow goal banner in the TUI sidebar (TUI-plugin-capable OpenCode only). */
+  sidebarBanner: true,
+  /** Foreground colour (hex) for the sidebar goal banner. */
+  sidebarColor: "#FFD700",
+  /** Foreground colour (hex) for the muted "No goal" sidebar line. */
+  sidebarMutedColor: "#808080",
   /** Phrase that, at the start of an assistant message, claims completion. */
   completionMarker: "Goal Completed",
   /** Replacement marker when completion is blocked. */
@@ -59,6 +67,10 @@ function fromEnv(env) {
     GOAL_GUARD_MAX_SESSIONS: ["maxSessions", coerceInt],
     GOAL_GUARD_SESSION_TTL_MS: ["sessionTtlMs", coerceInt],
     GOAL_GUARD_TOAST_ON_BLOCK: ["toastOnBlock", coerceBool],
+    GOAL_GUARD_TOAST_ON_REVIEW: ["toastOnReview", coerceBool],
+    GOAL_GUARD_SIDEBAR_BANNER: ["sidebarBanner", coerceBool],
+    GOAL_GUARD_SIDEBAR_COLOR: ["sidebarColor", (v) => (v == null ? undefined : String(v))],
+    GOAL_GUARD_SIDEBAR_MUTED_COLOR: ["sidebarMutedColor", (v) => (v == null ? undefined : String(v))],
   };
   for (const [key, [field, coerce]] of Object.entries(map)) {
     if (env[key] !== undefined) out[field] = coerce(env[key], DEFAULT_CONFIG[field]);

package/plugins/goal-guard/shell.js CHANGED Viewed

@@ -415,7 +415,7 @@ const DIRECT_TEST_BINS = new Set(["jest", "mocha", "vitest", "ava", "tap", "tape
 const FORMATTERS = new Set(["prettier", "eslint", "black", "ruff", "gofmt", "goimports", "rustfmt", "clang-format", "autopep8", "isort", "standard", "biome", "dprint", "yapf", "stylelint"]);
 const MUTATING_BINS = new Set(["mkdir", "rmdir", "touch", "ln", "mv", "cp", "tee", "install", "patch", "rsync", "rename", "chmod", "chown", "chgrp", "git-apply"]);
-const DESTRUCTIVE_BINS = new Set(["shred", "mkfs", "fdisk", "parted", "wipefs", "sgdisk", "blkdiscard", "unlink"]);
+const DESTRUCTIVE_BINS = new Set(["shred", "srm", "mkfs", "mkswap", "fdisk", "parted", "wipefs", "sgdisk", "blkdiscard", "unlink"]);
 /**
  * Classify a single already-split simple command (array of words).
@@ -603,8 +603,9 @@ function classifyCommand(words, redirects, depth, acc, pipelineCmds, indexInPipe
     return;
   }
-  // Destructive disk/file utilities.
-  if (DESTRUCTIVE_BINS.has(bin)) {
+  // Destructive disk/file utilities. `mkfs.<fstype>` (mkfs.ext4, mkfs.erofs, …)
+  // is the same irreversible filesystem-format operation as bare `mkfs`.
+  if (DESTRUCTIVE_BINS.has(bin) || /^mkfs\./.test(bin)) {
     acc.destructive = true;
     acc.reasons.push(bin);
     return;