npm - opencode-goal-mode - Versions diffs - 0.2.2 → 0.3.0 - Mend

opencode-goal-mode 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/ARCHITECTURE.md +47 -7
package/CHANGELOG.md +27 -0
package/README.md +81 -23
package/benchmarks/build-external-corpus.mjs +177 -0
package/benchmarks/charts.mjs +176 -0
package/benchmarks/comparison.mjs +48 -0
package/benchmarks/completion-corpus.mjs +70 -0
package/benchmarks/corpus.mjs +92 -0
package/benchmarks/external-corpus.json +3540 -0
package/benchmarks/external.mjs +110 -0
package/benchmarks/legacy-analyzer.mjs +54 -0
package/benchmarks/run.mjs +252 -0
package/benchmarks/truthfulness.mjs +64 -0
package/commands/goal-evidence-map.md +27 -0
package/commands/goal.md +16 -1
package/docs/benchmarks/detection-by-family.svg +2 -2
package/docs/benchmarks/external-scorecard.svg +32 -0
package/docs/benchmarks/latency.svg +3 -3
package/docs/benchmarks/overall-scorecard.svg +2 -2
package/docs/benchmarks/results.json +207 -67
package/docs/benchmarks/truthfulness-score.svg +17 -0
package/package.json +5 -1
package/plugins/goal-guard/config.js +9 -0
package/plugins/goal-guard/events.js +6 -3
package/plugins/goal-guard/shell.js +4 -3
package/plugins/goal-guard/sidebar-data.js +71 -0
package/plugins/goal-guard/state.js +2 -1
package/plugins/goal-guard/summary.js +139 -1
package/plugins/goal-guard/system.js +3 -0
package/plugins/goal-guard/tools.js +43 -3
package/plugins/goal-guard/verdicts.js +38 -1
package/plugins/goal-guard.js +20 -5
package/plugins/goal-sidebar.js +141 -0
package/research/README.md +1 -1
package/research/benchmarks.md +72 -45

package/benchmarks/external.mjs ADDED Viewed

@@ -0,0 +1,110 @@
+#!/usr/bin/env node
+/**
+ * Run the shell guard against the EXTERNAL, third-party-authored corpus built by
+ * build-external-corpus.mjs (real tldr-pages commands). This is the honest
+ * benchmark: the analyzer authors did not write or curate these commands, so the
+ * detection / false-positive numbers reflect real-world behavior, warts and all.
+ *
+ * It deliberately also reports DISAGREEMENTS between the analyzer and the
+ * independent ground-truth labeler, so misses and false positives are auditable
+ * rather than averaged away.
+ *
+ *   node benchmarks/external.mjs            # summary
+ *   node benchmarks/external.mjs --json     # full machine-readable result
+ */
+import { readFileSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import * as current from "../plugins/goal-guard/shell.js";
+import * as legacy from "./legacy-analyzer.mjs";
+const here = dirname(fileURLToPath(import.meta.url));
+export function loadExternalCorpus() {
+  return JSON.parse(readFileSync(join(here, "external-corpus.json"), "utf8"));
+}
+function blocked(analyzer, cmd) {
+  const a = analyzer.analyzeCommand(cmd);
+  return Boolean(a.destructive || a.networkExec);
+}
+/** Evaluate one analyzer over labeled entries (each {cmd, page, destructive}). */
+function score(analyzer, labeled) {
+  let destTotal = 0;
+  let destCaught = 0;
+  let safeTotal = 0;
+  let safeFalsePos = 0;
+  const misses = [];
+  const falsePositives = [];
+  for (const e of labeled) {
+    const isBlocked = blocked(analyzer, e.cmd);
+    if (e.destructive) {
+      destTotal += 1;
+      if (isBlocked) destCaught += 1;
+      else misses.push({ cmd: e.cmd, page: e.page });
+    } else {
+      safeTotal += 1;
+      if (isBlocked) {
+        safeFalsePos += 1;
+        falsePositives.push({ cmd: e.cmd, page: e.page });
+      }
+    }
+  }
+  return {
+    detectionRate: destTotal ? (destCaught / destTotal) * 100 : 0,
+    falsePositiveRate: safeTotal ? (safeFalsePos / safeTotal) * 100 : 0,
+    destCaught,
+    destTotal,
+    safeFalsePos,
+    safeTotal,
+    misses,
+    falsePositives,
+  };
+}
+export function runExternalBenchmark() {
+  const corpus = loadExternalCorpus();
+  // The corpus is written destructive-first then safe (see build-external-corpus.mjs),
+  // so the recorded count is the label boundary — no re-running the labeler needed.
+  const labeled = corpus.entries.map((e, i) => ({ ...e, destructive: i < corpus.totals.destructiveFound }));
+  return {
+    source: corpus.source,
+    commit: corpus.commit,
+    totals: corpus.totals,
+    sampleSize: labeled.length,
+    legacy: score(legacy, labeled),
+    current: score(current, labeled),
+  };
+}
+function pct(n) {
+  return `${n.toFixed(1)}%`;
+}
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  const r = runExternalBenchmark();
+  if (process.argv.includes("--json")) {
+    console.log(JSON.stringify(r, null, 2));
+  } else {
+    console.log("External shell-guard benchmark (third-party tldr-pages commands)");
+    console.log("================================================================");
+    console.log(`Source: ${r.source} @ ${r.commit.slice(0, 12)}`);
+    console.log(
+      `Sample: ${r.sampleSize} commands ` +
+        `(${r.totals.destructiveFound} destructive [all found], ` +
+        `${r.totals.safeSampled}/${r.totals.safeFound} safe sampled)`,
+    );
+    console.log("");
+    console.log(`Detection (destructive caught)   legacy ${pct(r.legacy.detectionRate)}   →   current ${pct(r.current.detectionRate)}`);
+    console.log(`False positives on safe commands legacy ${pct(r.legacy.falsePositiveRate)}   →   current ${pct(r.current.falsePositiveRate)}`);
+    console.log("");
+    console.log(`Current analyzer misses (${r.current.misses.length}):`);
+    for (const m of r.current.misses.slice(0, 20)) console.log(`  - ${m.cmd}   [${m.page}]`);
+    if (r.current.misses.length > 20) console.log(`  … ${r.current.misses.length - 20} more`);
+    console.log(`Current analyzer false positives (${r.current.falsePositives.length}):`);
+    for (const f of r.current.falsePositives.slice(0, 20)) console.log(`  - ${f.cmd}   [${f.page}]`);
+    if (r.current.falsePositives.length > 20) console.log(`  … ${r.current.falsePositives.length - 20} more`);
+  }
+}

package/benchmarks/legacy-analyzer.mjs ADDED Viewed

@@ -0,0 +1,54 @@
+/**
+ * The ORIGINAL regex-based shell classifier, preserved verbatim from the first
+ * published version of the plugin (commit 130956d) so the benchmark can compare
+ * it apples-to-apples against the current quote-aware analyzer.
+ *
+ * Do not "improve" this file — its job is to faithfully represent the old
+ * behavior that the new analyzer replaced.
+ */
+const MUTATING_BASH_PATTERNS = [ // Patterns that mark a command as mutating. Most anchor at the start of the string or right after `&&`, `;`, or `||`.
+  /(^|&&|;|\|\|)\s*(sudo\s+)?(rm|mv|cp|mkdir|rmdir|touch|ln)\b/i, // file create/move/remove commands, optionally under sudo
+  /(^|&&|;|\|\|)\s*(sudo\s+)?(tee|xargs\s+(rm|mv|cp))\b/i, // tee, or xargs running rm/mv/cp
+  /(^|&&|;|\|\|)\s*[^|]*\s(>|>>)\s*(?!\/dev\/null\b)\S+/i, // whitespace-preceded `>`/`>>` redirection; a negative lookahead for /dev/null follows the operator
+  /(^|&&|;|\|\|)\s*(perl\s+-pi|sed\s+-i)\b/i, // in-place edits via `perl -pi` or `sed -i`
+  /(^|&&|;|\|\|)\s*(npm|pnpm|yarn|bun)\s+(install|ci|add|remove|update)\b/i, // package-manager dependency changes
+  /(^|&&|;|\|\|)\s*(npm|pnpm|yarn|bun)\s+(run\s+)?(format|fix|lint:fix)\b/i, // package-manager format/fix/lint:fix scripts, with or without `run`
+  /\b((npx|pnpm\s+exec|yarn)\s+)?(prettier|eslint)\b.*\s(--write|--fix)\b/i, // prettier/eslint with --write or --fix, anywhere in the command
+  /\b(node|python3?)\b.*\b(writeFile|appendFile|copyFile|rename|unlink|rmSync|mkdir|rmdir|openSync)\b/i, // node/python text that mentions one of these file-system call names
+];
+export function looksLikeDestructiveBash(command) { // Returns true when the command text matches any destructive pattern below.
+  const normalized = String(command || "").trim(); // falsy input becomes ""; other values are stringified and trimmed
+  return [ // every pattern anchors at the start of the string or right after `&&`, `;`, or `||`; none is case-insensitive
+    /(^|&&|;|\|\|)\s*(sudo\s+)?rm\s+-[a-zA-Z]*[rR][a-zA-Z]*[rfRF]?\b/, // rm with a short-flag cluster containing r or R
+    /(^|&&|;|\|\|)\s*(sudo\s+)?rm\s+(--recursive|--force|--recursive\s+--force|-rf|-fr|-r)\b/, // rm with --recursive, --force, -rf, -fr, or -r
+    /(^|&&|;|\|\|)\s*git\s+reset\b/, // git reset (any arguments)
+    /(^|&&|;|\|\|)\s*git\s+clean\b/, // git clean
+    /(^|&&|;|\|\|)\s*git\s+checkout\b/, // git checkout
+    /(^|&&|;|\|\|)\s*git\s+restore\b/, // git restore
+    /(^|&&|;|\|\|)\s*git\s+switch\b/, // git switch
+    /(^|&&|;|\|\|)\s*git\s+push\b/, // git push
+    /(^|&&|;|\|\|)\s*(sudo\s+)?find\b.*\s-delete\b/, // find ... -delete
+    /(^|&&|;|\|\|)\s*(sudo\s+)?find\b.*\s-exec\s+rm\b/, // find ... -exec rm
+    /(^|&&|;|\|\|)\s*(sudo\s+)?dd\b.*\bof=\/dev\//, // dd writing to a path under /dev/
+    /(^|&&|;|\|\|)\s*(sudo\s+)?mkfs(\.|\s|$)/, // mkfs followed by `.`, whitespace, or end of string
+    /(^|&&|;|\|\|)\s*(sudo\s+)?shred\b/, // shred
+    /(^|&&|;|\|\|)\s*(sudo\s+)?truncate\b/, // truncate
+    /(^|&&|;|\|\|)\s*(sudo\s+)?chmod\s+-[a-zA-Z]*[rR][a-zA-Z]*[wW][a-zA-Z]*[xX][a-zA-Z]*\s+\/\b/, // chmod with a flag cluster containing r, w, x in that order, then `/` followed by a word character
+  ].some((pattern) => pattern.test(normalized));
+}
+export function looksLikeMutatingBash(command) { // Returns true when the command is destructive or matches any MUTATING_BASH_PATTERNS entry.
+  const normalized = String(command || "").trim(); // falsy input becomes ""; other values are stringified and trimmed
+  if (!normalized) return false; // an empty command is never mutating
+  if (looksLikeDestructiveBash(normalized)) return true; // every destructive command also counts as mutating
+  return MUTATING_BASH_PATTERNS.some((pattern) => pattern.test(normalized));
+}
+/**
+ * Adapter to the analyzer signal shape used by the benchmark.
+ *
+ * Only `destructive` and `mutating` are computed from the command; this adapter
+ * always reports `verification` and `networkExec` as false and `reasons` as an
+ * empty array.
+ */
+export function analyzeCommand(command) {
+  const destructive = looksLikeDestructiveBash(command);
+  const mutating = looksLikeMutatingBash(command);
+  return { destructive, mutating, verification: false, networkExec: false, reasons: [] };
+}

package/benchmarks/run.mjs ADDED Viewed

@@ -0,0 +1,252 @@
+#!/usr/bin/env node
+/**
+ * Reproducible benchmark of the shell-command guard.
+ *
+ * Compares the original regex classifier (benchmarks/legacy-analyzer.mjs,
+ * preserved verbatim from the first release) against the current quote-aware
+ * analyzer (plugins/goal-guard/shell.js) on a labeled corpus, and measures
+ * detection rate, false-positive rate, and throughput. Writes results.json and
+ * the SVG charts embedded in the README.
+ *
+ *   node benchmarks/run.mjs
+ */
+import { writeFileSync, mkdirSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { performance } from "node:perf_hooks";
+import { CORPUS } from "./corpus.mjs";
+import * as current from "../plugins/goal-guard/shell.js";
+import * as legacy from "./legacy-analyzer.mjs";
+import { groupedBarChart, horizontalBarChart } from "./charts.mjs";
+import { runTruthfulnessBenchmark } from "./truthfulness.mjs";
+import { runExternalBenchmark } from "./external.mjs";
+const root = fileURLToPath(new URL("..", import.meta.url));
+const outDir = join(root, "docs", "benchmarks");
+mkdirSync(outDir, { recursive: true });
+/** A command is "blocked" when flagged destructive or as remote execution. */
+function blocked(analyzer, cmd) {
+  const a = analyzer.analyzeCommand(cmd);
+  return Boolean(a.destructive || a.networkExec);
+}
+function evaluate(analyzer) {
+  const families = {};
+  let destTotal = 0;
+  let destCaught = 0;
+  let safeTotal = 0;
+  let safeFalsePos = 0;
+  for (const { cmd, label, family } of CORPUS) {
+    families[family] ??= { destTotal: 0, destCaught: 0, safeTotal: 0, safeFalsePos: 0 };
+    const isBlocked = blocked(analyzer, cmd);
+    if (label === "destructive") {
+      destTotal += 1;
+      families[family].destTotal += 1;
+      if (isBlocked) {
+        destCaught += 1;
+        families[family].destCaught += 1;
+      }
+    } else {
+      safeTotal += 1;
+      families[family].safeTotal += 1;
+      if (isBlocked) {
+        safeFalsePos += 1;
+        families[family].safeFalsePos += 1;
+      }
+    }
+  }
+  return {
+    detectionRate: destTotal ? (destCaught / destTotal) * 100 : 0,
+    falsePositiveRate: safeTotal ? (safeFalsePos / safeTotal) * 100 : 0,
+    destCaught,
+    destTotal,
+    safeFalsePos,
+    safeTotal,
+    families,
+  };
+}
+function throughput(analyzer) {
+  const cmds = CORPUS.map((c) => c.cmd);
+  // Warm up.
+  for (const c of cmds) analyzer.analyzeCommand(c);
+  const iterations = 4000;
+  const start = performance.now();
+  for (let i = 0; i < iterations; i += 1) {
+    for (const c of cmds) analyzer.analyzeCommand(c);
+  }
+  const ms = performance.now() - start;
+  const ops = (iterations * cmds.length) / (ms / 1000);
+  return Math.round(ops);
+}
+/** Locale-independent thousands grouping (the host locale may use '.' as separator). */
+function fmt(n) {
+  return Math.round(n)
+    .toString()
+    .replace(/\B(?=(\d{3})+(?!\d))/g, ",");
+}
+const legacyEval = evaluate(legacy); // curated-fixture tallies for the legacy regex classifier
+const currentEval = evaluate(current); // curated-fixture tallies for the current analyzer
+const external = runExternalBenchmark(); // both analyzers scored on the external corpus
+const truthfulness = runTruthfulnessBenchmark(); // completion-claim fixture results
+const legacyOps = throughput(legacy); // analyzeCommand calls per second
+const currentOps = throughput(current); // analyzeCommand calls per second
+const legacyUs = 1e6 / legacyOps; // microseconds per command
+const currentUs = 1e6 / currentOps; // microseconds per command
+const FAMILY_LABELS = { // chart display names keyed by corpus family id
+  classic: "Classic",
+  bypass: "Obfuscated",
+  "remote-exec": "Remote exec",
+};
+const detFamilies = ["classic", "bypass", "remote-exec"]; // families plotted in the detection-by-family chart, in this order
+function familyRate(ev, fam) {
+  const f = ev.families[fam];
+  return f && f.destTotal ? (f.destCaught / f.destTotal) * 100 : 0;
+}
+// Trim the per-command miss/false-positive lists to keep results.json readable;
+// the full lists are always available via `node benchmarks/external.mjs --json`.
+const externalSummary = {
+  source: external.source,
+  commit: external.commit,
+  totals: external.totals,
+  sampleSize: external.sampleSize,
+  legacy: {
+    detectionRate: Number(external.legacy.detectionRate.toFixed(1)),
+    falsePositiveRate: Number(external.legacy.falsePositiveRate.toFixed(1)),
+    destCaught: external.legacy.destCaught,
+    destTotal: external.legacy.destTotal,
+    safeFalsePos: external.legacy.safeFalsePos,
+    safeTotal: external.legacy.safeTotal,
+  },
+  current: {
+    detectionRate: Number(external.current.detectionRate.toFixed(1)),
+    falsePositiveRate: Number(external.current.falsePositiveRate.toFixed(1)),
+    destCaught: external.current.destCaught,
+    destTotal: external.current.destTotal,
+    safeFalsePos: external.current.safeFalsePos,
+    safeTotal: external.current.safeTotal,
+    misses: external.current.misses.map((m) => m.cmd),
+    falsePositives: external.current.falsePositives.map((f) => f.cmd),
+  },
+};
+const results = {
+  // The honest, third-party benchmark: real commands the analyzer was never
+  // fitted to. This is the headline number.
+  external: externalSummary,
+  // Curated REGRESSION FIXTURES: a hand-authored set of known destructive
+  // patterns and their safe look-alikes. These define the patterns the analyzer
+  // is built to catch and guard against regressions — they are NOT an unbiased
+  // sample, so the 100%/0% here is "passes its own spec", not measured accuracy.
+  fixtures: {
+    corpusSize: CORPUS.length,
+    destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
+    safeCount: CORPUS.filter((c) => c.label === "safe").length,
+    legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
+    current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
+  },
+  // Completion-enforcement fixtures (hand-authored policy cases), not a survey.
+  completionFixtures: truthfulness,
+};
+writeFileSync(join(outDir, "results.json"), JSON.stringify(results, null, 2));
+// Headline chart: detection + false positives on the EXTERNAL third-party corpus.
+writeFileSync(
+  join(outDir, "external-scorecard.svg"),
+  groupedBarChart({
+    title: "Guard accuracy on real third-party commands",
+    subtitle: `${external.sampleSize} tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.`,
+    groups: ["Detection rate", "False-positive rate"],
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: [external.legacy.detectionRate, external.legacy.falsePositiveRate] },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: [external.current.detectionRate, external.current.falsePositiveRate] },
+    ],
+  }),
+);
+// Chart 1: detection rate by command family (CURATED regression fixtures).
+writeFileSync(
+  join(outDir, "detection-by-family.svg"),
+  groupedBarChart({
+    title: "Detection by family — curated regression fixtures",
+    subtitle: `Curated patterns the analyzer is built to catch (not an unbiased sample). ${results.fixtures.destructiveCount} destructive fixtures.`,
+    groups: detFamilies.map((f) => FAMILY_LABELS[f]),
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: detFamilies.map((f) => familyRate(legacyEval, f)) },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: detFamilies.map((f) => familyRate(currentEval, f)) },
+    ],
+  }),
+);
+// Chart 2: overall scorecard on the CURATED fixtures (passes its own spec).
+writeFileSync(
+  join(outDir, "overall-scorecard.svg"),
+  groupedBarChart({
+    title: "Curated fixtures — passes its own spec",
+    subtitle: "Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.",
+    groups: ["Detection rate", "False-positive rate"],
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: [legacyEval.detectionRate, legacyEval.falsePositiveRate] },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: [currentEval.detectionRate, currentEval.falsePositiveRate] },
+    ],
+  }),
+);
+// Chart 3: per-command latency — the deeper analysis costs a few microseconds,
+// which is negligible for a tool-call guard. Shown for honesty, not as a "win".
+writeFileSync(
+  join(outDir, "latency.svg"),
+  horizontalBarChart({
+    title: "Per-command analysis latency",
+    subtitle: "Microseconds to classify one command. Both are negligible for a tool-call guard.",
+    unit: " µs",
+    max: Math.max(legacyUs, currentUs) * 1.4,
+    rows: [
+      { label: "Legacy regex guard", value: legacyUs, display: `${legacyUs.toFixed(2)} µs`, color: "#9aa0a6" },
+      { label: "Goal Mode analyzer", value: currentUs, display: `${currentUs.toFixed(2)} µs`, color: "#2da44e" },
+    ],
+  }),
+);
+writeFileSync(
+  join(outDir, "truthfulness-score.svg"),
+  horizontalBarChart({
+    title: "Completion-enforcement fixtures",
+    subtitle: `${truthfulness.corpusSize} hand-authored policy cases (a spec, not a survey): premature claims blocked, valid ones allowed.`,
+    unit: "%",
+    max: 100,
+    rows: [
+      { label: "Truthfulness score", value: truthfulness.score, display: `${truthfulness.score.toFixed(1)}%`, color: "#2da44e" },
+      { label: "Decision accuracy", value: truthfulness.decisionAccuracy, display: `${truthfulness.decisionAccuracy.toFixed(1)}%`, color: "#0969da" },
+      { label: "Reason accuracy", value: truthfulness.reasonAccuracy, display: `${truthfulness.reasonAccuracy.toFixed(1)}%`, color: "#bf8700" },
+    ],
+  }),
+);
+const pct = (n) => `${n.toFixed(1)}%`;
+console.log("Goal Mode shell-guard benchmark");
+console.log("================================");
+console.log("");
+console.log(`HEADLINE — external corpus: ${external.sampleSize} real tldr-pages commands @ ${external.commit.slice(0, 12)}`);
+console.log(`  (${external.totals.destructiveFound} destructive [all found] + ${external.totals.safeSampled}/${external.totals.safeFound} safe sampled)`);
+console.log(`  Detection       legacy ${pct(external.legacy.detectionRate)}   →   Goal Mode ${pct(external.current.detectionRate)}`);
+console.log(`  False positives legacy ${pct(external.legacy.falsePositiveRate)}   →   Goal Mode ${pct(external.current.falsePositiveRate)}`);
+console.log(`  Remaining Goal Mode misses: ${external.current.misses.length} (mostly un-flagged single-target rm — see external.mjs --json)`);
+console.log("");
+console.log(`Curated regression fixtures: ${results.fixtures.corpusSize} commands (defines patterns to catch; not an unbiased sample)`);
+console.log(`  Detection   legacy ${pct(legacyEval.detectionRate)}   →   Goal Mode ${pct(currentEval.detectionRate)}   (passes its own spec)`);
+console.log(`  False pos   legacy ${pct(legacyEval.falsePositiveRate)}   →   Goal Mode ${pct(currentEval.falsePositiveRate)}`);
+console.log(`Completion-enforcement fixtures: ${truthfulness.corpusSize} hand-authored policy cases, all pass (a spec, not a survey)`);
+console.log(`Latency: Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
+console.log("");
+console.log(`Wrote results.json + 5 SVG charts to docs/benchmarks/`);

package/benchmarks/truthfulness.mjs ADDED Viewed

@@ -0,0 +1,64 @@
+import { BASE_GATES } from "../plugins/goal-guard/agents.js";
+import { DEFAULT_CONFIG } from "../plugins/goal-guard/config.js";
+import { evaluateCompletionClaim } from "../plugins/goal-guard/completion.js";
+import { refreshStickyGates } from "../plugins/goal-guard/gates.js";
+import { createState } from "../plugins/goal-guard/state.js";
+import { FALSE_COMPLETION_CORPUS } from "./completion-corpus.mjs";
+import { fileURLToPath } from "node:url";
+function buildState(def) {
+  const state = createState("2026-01-01T00:00:00.000Z");
+  Object.assign(state, def.state || {});
+  state.latestVerdict = {};
+  for (const gate of def.state?.verdicts || []) {
+    state.latestVerdict[gate.agent] = { verdict: gate.verdict, at: "2026-01-01T00:00:01.000Z", seq: gate.seq };
+  }
+  if (!state.goalText && def.state?.contextualGate) state.goalText = def.state.contextualGate;
+  refreshStickyGates(state);
+  return state;
+}
+export function evaluateTruthfulnessCase(def) {
+  const config = { ...DEFAULT_CONFIG, ...(def.config || {}) };
+  const decision = evaluateCompletionClaim(buildState(def), config, def.text);
+  const reason = decision.reason || "";
+  const decisionCorrect = Boolean(decision.blocked) === Boolean(def.expected.blocked);
+  const reasonCorrect = !def.expected.blocked || reason.includes(def.expected.reasonIncludes || "");
+  return {
+    id: def.id,
+    family: def.family,
+    expectedBlocked: Boolean(def.expected.blocked),
+    actualBlocked: Boolean(decision.blocked),
+    decisionCorrect,
+    reasonCorrect,
+    reason,
+  };
+}
+export function runTruthfulnessBenchmark(corpus = FALSE_COMPLETION_CORPUS) {
+  const cases = corpus.map(evaluateTruthfulnessCase);
+  const falseCompletionCases = cases.filter((c) => c.expectedBlocked);
+  const trueCompletionCases = cases.filter((c) => !c.expectedBlocked);
+  const decisionCorrect = cases.filter((c) => c.decisionCorrect).length;
+  const reasonCorrect = falseCompletionCases.filter((c) => c.reasonCorrect).length;
+  const falseCompletionBlocked = falseCompletionCases.filter((c) => c.actualBlocked).length;
+  const trueCompletionAllowed = trueCompletionCases.filter((c) => !c.actualBlocked).length;
+  const decisionAccuracy = cases.length ? (decisionCorrect / cases.length) * 100 : 0;
+  const reasonAccuracy = falseCompletionCases.length ? (reasonCorrect / falseCompletionCases.length) * 100 : 100;
+  return {
+    name: "False Completion Dataset",
+    corpusSize: cases.length,
+    requiredBaseGates: BASE_GATES,
+    score: Number(((decisionAccuracy * 0.65 + reasonAccuracy * 0.35)).toFixed(1)),
+    decisionAccuracy: Number(decisionAccuracy.toFixed(1)),
+    reasonAccuracy: Number(reasonAccuracy.toFixed(1)),
+    falseCompletionBlockRate: Number(((falseCompletionBlocked / falseCompletionCases.length) * 100).toFixed(1)),
+    validCompletionAllowRate: Number(((trueCompletionAllowed / trueCompletionCases.length) * 100).toFixed(1)),
+    cases,
+  };
+}
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  const result = runTruthfulnessBenchmark();
+  console.log(JSON.stringify(result, null, 2));
+}

package/commands/goal-evidence-map.md ADDED Viewed

@@ -0,0 +1,27 @@
+---
+description: Map Goal Contract acceptance criteria to recorded verification evidence and gaps.
+agent: goal
+---
+Produce a read-only evidence map for the current Goal Mode session. Do not edit files.
+Call `goal_evidence_map` first and use its authoritative Goal Guard state,
+including the Goal Contract, recorded evidence, dirty state, reviewer status, and
+any user-provided context. Report unknown or missing details honestly instead of
+inferring evidence that is not recorded.
+Include:
+- Acceptance criterion
+- Recorded evidence covering it
+- Reviewer status
+- Verification command/result summary
+- Status: covered, partially covered, missing, or stale
+- Gap or risk
+- Next required action
+Additional context:
+```text
+$ARGUMENTS
+```

package/commands/goal.md CHANGED Viewed

@@ -9,4 +9,19 @@ Start Goal Mode for this request:
 $ARGUMENTS
 ```
-First create a Goal Contract, ask only essential beginning clarifying questions, delegate discovery/research to subagents, implement in the main agent, verify, run required review cycles, and only finish with `Goal Completed` if all gates pass.
+Run this sequence:
+1. **Seed the contract first.** Call the `goal_contract` tool with the original
+   request, explicit/inferred requirements, non-goals, and concrete acceptance
+   criteria. This activates enforcement, fixes the required specialist review
+   gates, and lights up the goal banner in the sidebar. Ask only essential
+   clarifying questions before recording it.
+2. Delegate discovery and research to subagents; implement in the main agent.
+3. Verify, and record each verification with the `goal_evidence` tool so it maps
+   to your acceptance criteria.
+4. Run the required review cycles. Consult `goal_status` / `goal_evidence_map`
+   for the authoritative list of missing or stale gates rather than relying on
+   memory.
+5. Only finish with `Goal Completed` (plus an accurate `Review cycles: N` line)
+   once every required gate has a fresh PASS — the guard will rewrite a premature
+   claim to `Goal Not Completed`.

package/docs/benchmarks/detection-by-family.svg CHANGED Viewed

@@ -1,7 +1,7 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
 <rect width="720" height="380" fill="#ffffff"/>
-<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Destructive-command detection rate by family</text>
-<text x="48" y="47" font-size="12" fill="#656d76">Higher is better. Corpus: 48 destructive commands.</text>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Detection by family — curated regression fixtures</text>
+<text x="48" y="47" font-size="12" fill="#656d76">Curated patterns the analyzer is built to catch (not an unbiased sample). 48 destructive fixtures.</text>
 <line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
 <text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
 <line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>

package/docs/benchmarks/external-scorecard.svg ADDED Viewed

@@ -0,0 +1,32 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="720" height="380" fill="#ffffff"/>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Guard accuracy on real third-party commands</text>
+<text x="48" y="47" font-size="12" fill="#656d76">704 tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.</text>
+<line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
+<line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="253.6" font-size="11" text-anchor="end" fill="#656d76">20%</text>
+<line x1="48" y1="203.2" x2="700" y2="203.2" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="207.2" font-size="11" text-anchor="end" fill="#656d76">40%</text>
+<line x1="48" y1="156.8" x2="700" y2="156.8" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="160.8" font-size="11" text-anchor="end" fill="#656d76">60%</text>
+<line x1="48" y1="110.4" x2="700" y2="110.4" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="114.4" font-size="11" text-anchor="end" fill="#656d76">80%</text>
+<line x1="48" y1="64.0" x2="700" y2="64.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="68.0" font-size="11" text-anchor="end" fill="#656d76">100%</text>
+<rect x="56.0" y="171.1" width="151.0" height="124.9" rx="3" fill="#9aa0a6"/>
+<text x="131.5" y="166.1" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">54%</text>
+<rect x="215.0" y="79.6" width="151.0" height="216.4" rx="3" fill="#2da44e"/>
+<text x="290.5" y="74.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">93%</text>
+<text x="211.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Detection rate</text>
+<rect x="382.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#9aa0a6"/>
+<text x="457.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
+<rect x="541.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#2da44e"/>
+<text x="616.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
+<text x="537.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">False-positive rate</text>
+<line x1="48" y1="296" x2="700" y2="296" stroke="#d0d7de" stroke-width="1.5"/>
+<rect x="48" y="344" width="12" height="12" rx="2" fill="#9aa0a6"/>
+<text x="66" y="354" font-size="12" fill="#1f2328">Legacy regex guard</text>
+<rect x="201.6" y="344" width="12" height="12" rx="2" fill="#2da44e"/>
+<text x="219.6" y="354" font-size="12" fill="#1f2328">Goal Mode analyzer</text>
+</svg>

package/docs/benchmarks/latency.svg CHANGED Viewed

@@ -4,10 +4,10 @@
 <text x="20" y="47" font-size="12" fill="#656d76">Microseconds to classify one command. Both are negligible for a tool-call guard.</text>
 <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Legacy regex guard</text>
 <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
-<rect x="230" y="70" width="202.0" height="22" rx="3" fill="#9aa0a6"/>
-<text x="440.0" y="87" font-size="12" font-weight="600" fill="#1f2328">2.62 µs</text>
+<rect x="230" y="70" width="214.5" height="22" rx="3" fill="#9aa0a6"/>
+<text x="452.5" y="87" font-size="12" font-weight="600" fill="#1f2328">0.79 µs</text>
 <text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Goal Mode analyzer</text>
 <rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
 <rect x="230" y="108" width="300.0" height="22" rx="3" fill="#2da44e"/>
-<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.89 µs</text>
+<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">1.11 µs</text>
 </svg>

package/docs/benchmarks/overall-scorecard.svg CHANGED Viewed

@@ -1,7 +1,7 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
 <rect width="720" height="380" fill="#ffffff"/>
-<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Overall guard accuracy</text>
-<text x="48" y="47" font-size="12" fill="#656d76">Detection rate (higher better) vs false-positive rate (lower better).</text>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Curated fixtures — passes its own spec</text>
+<text x="48" y="47" font-size="12" fill="#656d76">Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.</text>
 <line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
 <text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
 <line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>