npm - opencode-goal-mode - Versions diffs - 0.2.1 → 0.2.4 - Mend

opencode-goal-mode 0.2.1 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

package/ARCHITECTURE.md +16 -7
package/CHANGELOG.md +17 -0
package/README.md +39 -18
package/benchmarks/charts.mjs +176 -0
package/benchmarks/comparison.mjs +48 -0
package/benchmarks/completion-corpus.mjs +70 -0
package/benchmarks/corpus.mjs +92 -0
package/benchmarks/legacy-analyzer.mjs +54 -0
package/benchmarks/run.mjs +198 -0
package/benchmarks/truthfulness.mjs +64 -0
package/commands/goal-evidence-map.md +27 -0
package/docs/benchmarks/capability-matrix.svg +86 -0
package/docs/benchmarks/detection-by-family.svg +37 -0
package/docs/benchmarks/latency.svg +13 -0
package/docs/benchmarks/overall-scorecard.svg +32 -0
package/docs/benchmarks/results.json +176 -0
package/docs/benchmarks/truthfulness-score.svg +17 -0
package/package.json +6 -1
package/plugins/goal-guard/events.js +6 -3
package/plugins/goal-guard/state.js +2 -1
package/plugins/goal-guard/summary.js +105 -1
package/plugins/goal-guard/system.js +3 -0
package/plugins/goal-guard/tools.js +35 -1
package/plugins/goal-guard/verdicts.js +38 -1
package/plugins/goal-guard.js +7 -5
package/research/README.md +18 -0
package/research/benchmarks.md +84 -0
package/research/goal-mode-comparison.md +100 -0
package/research/opencode-plugin-platform.md +89 -0
package/research/shell-hardening.md +62 -0

package/docs/benchmarks/results.json ADDED Viewed

@@ -0,0 +1,176 @@
+{
+  "corpusSize": 71,
+  "destructiveCount": 48,
+  "safeCount": 23,
+  "legacy": {
+    "detectionRate": 20.833333333333336,
+    "falsePositiveRate": 21.73913043478261,
+    "destCaught": 10,
+    "destTotal": 48,
+    "safeFalsePos": 5,
+    "safeTotal": 23,
+    "families": {
+      "classic": {
+        "destTotal": 10,
+        "destCaught": 10,
+        "safeTotal": 0,
+        "safeFalsePos": 0
+      },
+      "bypass": {
+        "destTotal": 35,
+        "destCaught": 0,
+        "safeTotal": 0,
+        "safeFalsePos": 0
+      },
+      "remote-exec": {
+        "destTotal": 3,
+        "destCaught": 0,
+        "safeTotal": 0,
+        "safeFalsePos": 0
+      },
+      "safe": {
+        "destTotal": 0,
+        "destCaught": 0,
+        "safeTotal": 23,
+        "safeFalsePos": 5
+      }
+    },
+    "opsPerSec": 445671,
+    "usPerCommand": 2.24
+  },
+  "current": {
+    "detectionRate": 100,
+    "falsePositiveRate": 0,
+    "destCaught": 48,
+    "destTotal": 48,
+    "safeFalsePos": 0,
+    "safeTotal": 23,
+    "families": {
+      "classic": {
+        "destTotal": 10,
+        "destCaught": 10,
+        "safeTotal": 0,
+        "safeFalsePos": 0
+      },
+      "bypass": {
+        "destTotal": 35,
+        "destCaught": 35,
+        "safeTotal": 0,
+        "safeFalsePos": 0
+      },
+      "remote-exec": {
+        "destTotal": 3,
+        "destCaught": 3,
+        "safeTotal": 0,
+        "safeFalsePos": 0
+      },
+      "safe": {
+        "destTotal": 0,
+        "destCaught": 0,
+        "safeTotal": 23,
+        "safeFalsePos": 0
+      }
+    },
+    "opsPerSec": 266556,
+    "usPerCommand": 3.75
+  },
+  "truthfulness": {
+    "name": "False Completion Dataset",
+    "corpusSize": 9,
+    "requiredBaseGates": [
+      "goal-prompt-auditor",
+      "goal-reviewer",
+      "goal-diff-reviewer",
+      "goal-verifier",
+      "goal-final-auditor"
+    ],
+    "score": 100,
+    "decisionAccuracy": 100,
+    "reasonAccuracy": 100,
+    "falseCompletionBlockRate": 100,
+    "validCompletionAllowRate": 100,
+    "cases": [
+      {
+        "id": "missing-review-cycles-line",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "missing required Review cycles line"
+      },
+      {
+        "id": "zero-review-cycles",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "no review cycles recorded"
+      },
+      {
+        "id": "wrong-review-cycle-count",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "claimed review cycles (1) do not match recorded review cycles (2)"
+      },
+      {
+        "id": "stale-review-after-edit",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "required review gates are missing or stale (goal-prompt-auditor, goal-reviewer, goal-diff-reviewer, goal-verifier, goal-final-auditor)"
+      },
+      {
+        "id": "missing-contextual-security-gate",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "required review gates are missing or stale (goal-security-reviewer)"
+      },
+      {
+        "id": "valid-completion-allowed",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      },
+      {
+        "id": "mid-text-mention-not-policed",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      },
+      {
+        "id": "inactive-session-not-policed",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      },
+      {
+        "id": "custom-marker-escaping",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      }
+    ]
+  }
+}

package/docs/benchmarks/truthfulness-score.svg ADDED Viewed

@@ -0,0 +1,17 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="720" height="202" viewBox="0 0 720 202" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="720" height="202" fill="#ffffff"/>
+<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Benchmark Truthfulness Score</text>
+<text x="20" y="47" font-size="12" fill="#656d76">False Completion Dataset: 9 labeled completion-claim cases.</text>
+<text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Truthfulness score</text>
+<rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="70" width="420.0" height="22" rx="3" fill="#2da44e"/>
+<text x="658.0" y="87" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
+<text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Decision accuracy</text>
+<rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="108" width="420.0" height="22" rx="3" fill="#0969da"/>
+<text x="658.0" y="125" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
+<text x="218" y="163" font-size="12" text-anchor="end" fill="#1f2328">Reason accuracy</text>
+<rect x="230" y="146" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="146" width="420.0" height="22" rx="3" fill="#bf8700"/>
+<text x="658.0" y="163" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
+</svg>

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "opencode-goal-mode",
-  "version": "0.2.1",
+  "version": "0.2.4",
   "description": "Strict Goal Mode agents, commands, and guard plugin for OpenCode.",
   "type": "module",
   "engines": {
@@ -12,10 +12,14 @@
   },
   "files": [
     "agents/",
+    "benchmarks/",
     "commands/",
+    "docs/",
     "plugins/",
+    "research/",
     "scripts/install.mjs",
     "ARCHITECTURE.md",
+    "CHANGELOG.md",
     "LICENSE",
     "README.md"
   ],
@@ -28,6 +32,7 @@
     "test:agents": "node --test tests/agents.test.mjs tests/commands.test.mjs",
     "test:install": "node --test tests/install.test.mjs",
     "bench": "node benchmarks/run.mjs",
+    "bench:truthfulness": "node benchmarks/truthfulness.mjs",
     "bench:compare": "node benchmarks/comparison.mjs",
     "pack:check": "npm pack --dry-run",
     "audit": "npm audit --audit-level=moderate",

package/plugins/goal-guard/events.js CHANGED Viewed

@@ -29,6 +29,7 @@ export function markVerification(store, state) {
   state.lastVerificationAt = at;
   state.lastVerificationSeq = store.nextSeq();
   state.updatedAt = at;
+  return state.lastVerificationSeq;
 }
 export function markFileChanged(store, state, file) {
@@ -41,14 +42,16 @@ export function markFileChanged(store, state, file) {
 export function recordEvidence(store, state, command, result, criteria) {
   const at = store.nowIso();
-  state.evidence.push({
+  const entry = {
     command: String(command || ""),
     result: String(result || ""),
     criteria: Array.isArray(criteria) ? criteria.slice(0, 50) : [],
     at,
-  });
+    seq: 0,
+  };
+  state.evidence.push(entry);
   trim(state.evidence, 100);
-  markVerification(store, state);
+  entry.seq = markVerification(store, state);
   state.updatedAt = at;
 }

package/plugins/goal-guard/state.js CHANGED Viewed

@@ -36,6 +36,7 @@ export function createState(nowIso) {
     lastReviewAt: null,
     lastVerificationAt: null,
     verdicts: [],
+    reviewerMemory: [],
     evidence: [],
     latestVerdict: {},
     currentAgent: undefined,
@@ -59,7 +60,7 @@ function reviveState(raw) {
     if (raw[field] !== undefined) base[field] = raw[field];
   }
   // Defensive normalisation of array/object shapes.
-  for (const arrField of ["dirtyReasons", "changedFiles", "verdicts", "evidence", "completionRejections"]) {
+  for (const arrField of ["dirtyReasons", "changedFiles", "verdicts", "reviewerMemory", "evidence", "completionRejections"]) {
     if (!Array.isArray(base[arrField])) base[arrField] = [];
   }
   if (!base.latestVerdict || typeof base.latestVerdict !== "object") base.latestVerdict = {};

package/plugins/goal-guard/summary.js CHANGED Viewed

@@ -3,7 +3,7 @@
  * messages, and the `goal_status` tool. Kept pure and dependency-light.
  */
-import { requiredGates, missingGates } from "./gates.js";
+import { requiredGates, missingGates, gatePassedFresh } from "./gates.js";
 export function summarizeState(state, config) {
   const verdictSummary =
@@ -18,11 +18,32 @@ export function summarizeState(state, config) {
     `lastEditSeq=${state.lastEditSeq || 0}`,
     `lastReviewSeq=${state.lastReviewSeq || 0}`,
     `recentVerdicts=${verdictSummary}`,
+    `openReviewerMemory=${reviewerMemoryReport(state).open.length}`,
     `missingGates=${missingGates(state, config).join(" ") || "none"}`,
     `dirtyReasons=${state.dirtyReasons.slice(-5).join(" | ") || "none"}`,
   ].join("; ");
 }
+export function reviewerMemoryReport(state) {
+  const memory = Array.isArray(state.reviewerMemory) ? state.reviewerMemory : [];
+  const shape = (item) => ({
+    agent: item.agent,
+    finding: item.finding,
+    severity: item.severity || "blocking",
+    status: item.status || "open",
+    count: item.count || 1,
+    firstAt: item.firstAt || null,
+    lastAt: item.lastAt || null,
+    resolvedAt: item.resolvedAt || null,
+    fresh: Number(item.lastSeq || 0) > Number(state.lastEditSeq || 0),
+  });
+  return {
+    open: memory.filter((item) => (item.status || "open") === "open").slice(-20).map(shape),
+    resolved: memory.filter((item) => item.status === "resolved").slice(-20).map(shape),
+    total: memory.length,
+  };
+}
 /** Structured status object for the goal_status tool / diagnostics. */
 export function statusReport(state, config) {
   const required = requiredGates(state, config);
@@ -39,8 +60,91 @@ export function statusReport(state, config) {
     lastReviewAt: state.lastReviewAt,
     lastVerificationAt: state.lastVerificationAt,
     evidenceCount: state.evidence.length,
+    reviewerMemory: reviewerMemoryReport(state),
     changedFiles: state.changedFiles.slice(-50),
     contract: state.contract,
     completionAllowed: Boolean(state.active) && missing.length === 0,
   };
 }
+function evidenceMatchesCriterion(entry, criterion) {
+  const criteria = Array.isArray(entry.criteria) ? entry.criteria : [];
+  return criteria.some((c) => String(c).trim().toLowerCase() === String(criterion).trim().toLowerCase());
+}
+function evidenceFresh(entry, state) {
+  const lastEditSeq = Number(state.lastEditSeq || 0);
+  if (!entry.seq) return lastEditSeq === 0;
+  return Number(entry.seq) > lastEditSeq;
+}
+function criterionStatus(entries, state, missing) {
+  if (entries.length === 0) return "missing";
+  if (!entries.some((entry) => evidenceFresh(entry, state))) return "stale";
+  if (missing.length > 0 || state.dirty) return "partially covered";
+  return "covered";
+}
+/** Structured Requirement/Acceptance Criteria -> Evidence -> Reviewer -> Status map. */
+export function evidenceMapReport(state, config) {
+  const required = requiredGates(state, config);
+  const missing = missingGates(state, config);
+  const reviewers = required.map((agent) => {
+    const latest = state.latestVerdict[agent] || null;
+    return {
+      agent,
+      verdict: latest?.verdict || "missing",
+      at: latest?.at || null,
+      fresh: gatePassedFresh(state, agent),
+    };
+  });
+  const criteria = Array.isArray(state.contract?.acceptanceCriteria) ? state.contract.acceptanceCriteria : [];
+  const items = criteria.map((criterion) => {
+    const entries = state.evidence.filter((entry) => evidenceMatchesCriterion(entry, criterion));
+    const status = criterionStatus(entries, state, missing);
+    const memory = reviewerMemoryReport(state).open.filter((item) => item.finding.toLowerCase().includes(String(criterion).trim().toLowerCase()));
+    return {
+      criterion,
+      status,
+      evidence: entries.map((entry) => ({
+        command: entry.command,
+        result: entry.result,
+        at: entry.at,
+        seq: entry.seq || null,
+        fresh: evidenceFresh(entry, state),
+      })),
+      reviewers,
+      reviewerMemory: memory,
+      gap:
+        status === "missing"
+          ? "No recorded evidence references this acceptance criterion."
+          : status === "stale"
+            ? "Recorded evidence is older than the latest edit."
+            : missing.length > 0
+              ? `Missing or stale reviewer gates: ${missing.join(", ")}.`
+              : state.dirty
+                ? "Session is dirty; rerun reviews after the latest change."
+                : "None recorded.",
+      nextAction:
+        status === "covered"
+          ? "No action required for this criterion."
+          : status === "missing"
+            ? "Run verification and record it with goal_evidence, including this criterion."
+            : status === "stale"
+              ? "Rerun verification after the latest edit and record fresh evidence."
+              : "Complete missing/stale reviewer gates after verification.",
+    };
+  });
+  return {
+    active: Boolean(state.active),
+    dirty: Boolean(state.dirty),
+    lastEditAt: state.lastEditAt,
+    requiredGates: required,
+    missingGates: missing,
+    reviewers,
+    unmappedEvidence: state.evidence
+      .filter((entry) => !criteria.some((criterion) => evidenceMatchesCriterion(entry, criterion)))
+      .map((entry) => ({ command: entry.command, result: entry.result, criteria: entry.criteria || [], at: entry.at, seq: entry.seq || null })),
+    criteria: items,
+  };
+}

package/plugins/goal-guard/system.js CHANGED Viewed

@@ -33,6 +33,9 @@ export function buildSystemInjection(state, config) {
   lines.push(`- Verification observed: ${r.verificationSeen ? "yes" : "no"}.`);
   lines.push(`- Required review gates: ${bullet(r.requiredGates)}.`);
   lines.push(`- Gates still missing or stale: ${bullet(r.missingGates)}.`);
+  if (r.reviewerMemory.open.length) {
+    lines.push(`- Open Reviewer Memory: ${r.reviewerMemory.open.map((m) => `${m.agent}: ${m.finding}`).join(" | ")}.`);
+  }
   lines.push(
     `- Completion is currently ${r.completionAllowed ? "ALLOWED" : "BLOCKED"}. ` +
       (r.completionAllowed

package/plugins/goal-guard/tools.js CHANGED Viewed

@@ -12,7 +12,7 @@
  */
 import { tool } from "@opencode-ai/plugin";
-import { statusReport } from "./summary.js";
+import { evidenceMapReport, reviewerMemoryReport, statusReport } from "./summary.js";
 import { recordEvidence } from "./events.js";
 import { refreshStickyGates } from "./gates.js";
 import { createState } from "./state.js";
@@ -46,6 +46,40 @@ export function createGoalTools({ store, config, persist }) {
       },
     }),
+    goal_evidence_map: tool({
+      description:
+        "Return an authoritative read-only evidence map for this session: each acceptance " +
+        "criterion, matching recorded evidence, required reviewer gate status, coverage status, " +
+        "gaps, and next action.",
+      args: {},
+      async execute(_args, ctx) {
+        const state = store.stateFor(ctx.sessionID);
+        const report = evidenceMapReport(state, config);
+        const covered = report.criteria.filter((item) => item.status === "covered").length;
+        return {
+          title: `Evidence map: ${covered}/${report.criteria.length} criteria covered`,
+          output: JSON.stringify(report, null, 2),
+          metadata: { criteriaCount: report.criteria.length, coveredCount: covered, missingGates: report.missingGates },
+        };
+      },
+    }),
+    goal_reviewer_memory: tool({
+      description:
+        "Return durable Reviewer Memory for this session: unresolved and recently resolved " +
+        "reviewer findings carried across cycles. Read-only.",
+      args: {},
+      async execute(_args, ctx) {
+        const state = store.stateFor(ctx.sessionID);
+        const report = reviewerMemoryReport(state);
+        return {
+          title: `Reviewer Memory: ${report.open.length} open findings`,
+          output: JSON.stringify(report, null, 2),
+          metadata: { openCount: report.open.length, total: report.total },
+        };
+      },
+    }),
     goal_contract: tool({
       description:
         "Record or update the Goal Contract for this session (the explicit requirements, " +

package/plugins/goal-guard/verdicts.js CHANGED Viewed

@@ -68,17 +68,54 @@ export function latestVerdictFor(state, agent) {
   return state.latestVerdict[agent] || null;
 }
+function summarizeFinding(text) {
+  const headingRe = /^(blocking findings?|findings?|non-blocking findings?|open questions?|summary|verdict|blocking|issues?)[:\s]*$/i;
+  const lines = String(text || "")
+    .split(/\r?\n/)
+    .map((line) => line.replace(/^[\s>*_-]+/, "").trim())
+    .filter(Boolean)
+    .filter((line) => !headingRe.test(line))
+    .filter((line) => !/^verdict:?\s*(pass|fail)\b/i.test(line));
+  const blocking = lines.find((line) => /block|fail|finding|risk|missing|gap|regression/i.test(line));
+  return String(blocking || lines[0] || "Reviewer reported a blocking finding.").slice(0, 240);
+}
+function updateReviewerMemory(state, agent, verdict, at, seq, text) {
+  state.reviewerMemory ||= [];
+  if (verdict === "PASS") {
+    for (const item of state.reviewerMemory) {
+      if (item.agent === agent && item.status === "open") {
+        item.status = "resolved";
+        item.resolvedAt = at;
+        item.resolvedSeq = seq;
+      }
+    }
+    return;
+  }
+  const finding = summarizeFinding(text);
+  const open = state.reviewerMemory.find((item) => item.agent === agent && item.status === "open" && item.finding === finding);
+  if (open) {
+    open.lastAt = at;
+    open.lastSeq = seq;
+    open.count += 1;
+  } else {
+    state.reviewerMemory.push({ agent, finding, severity: "blocking", status: "open", firstAt: at, firstSeq: seq, lastAt: at, lastSeq: seq, count: 1 });
+  }
+  if (state.reviewerMemory.length > 100) state.reviewerMemory.splice(0, state.reviewerMemory.length - 100);
+}
 /**
  * Record a review verdict for `agent`, stamping it with the next monotonic seq.
  * Increments the review-cycle count when the cycle-closing agent reports.
  */
-export function recordVerdict(store, state, agent, verdict) {
+export function recordVerdict(store, state, agent, verdict, text = "") {
   const at = store.nowIso();
   const seq = store.nextSeq();
   const entry = { agent, verdict, at, seq };
   state.verdicts.push(entry);
   if (state.verdicts.length > 200) state.verdicts.splice(0, state.verdicts.length - 200);
   state.latestVerdict[agent] = { verdict, at, seq };
+  updateReviewerMemory(state, agent, verdict, at, seq, text);
   state.lastReviewAt = at;
   state.lastReviewSeq = seq;
   state.updatedAt = at;

package/plugins/goal-guard.js CHANGED Viewed

@@ -183,16 +183,18 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
         if (tool === "task") {
           const sub = normalizedSubagent(inp);
           if (isReviewAgent(sub)) {
-            const verdict = parseVerdict(textOf(out));
+            const text = textOf(out);
+            const verdict = parseVerdict(text);
             if (verdict) {
-              recordVerdict(store, state, sub, verdict);
+              recordVerdict(store, state, sub, verdict, text);
               recordedAgent = sub;
             }
           }
         } else if (isReviewAgent(state.currentAgent)) {
-          const verdict = parseVerdict(textOf(out));
+          const text = textOf(out);
+          const verdict = parseVerdict(text);
           if (verdict) {
-            recordVerdict(store, state, state.currentAgent, verdict);
+            recordVerdict(store, state, state.currentAgent, verdict, text);
             recordedAgent = state.currentAgent;
           }
         }
@@ -231,7 +233,7 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
         const state = store.stateFor(inp.sessionID);
         out.context.push(
           `Goal Guard state: ${summarizeState(state, config)}. Preserve Goal Contract, Verification Ledger, ` +
-            `Review Ledger, review cycle count, dirty state, and open findings across compaction.`,
+            `Review Ledger, Reviewer Memory, review cycle count, dirty state, and open findings across compaction.`,
         );
       } catch {
         /* ignore */

package/research/README.md ADDED Viewed

@@ -0,0 +1,18 @@
+# Research
+Background research that informs the Goal Mode design. These are working
+references, kept so the rationale behind the plugin is auditable and the
+platform facts are recoverable. They are shipped as reference docs so README
+links resolve in the npm package, but they are not runtime files.
+| Document | What it covers |
+| --- | --- |
+| [opencode-plugin-platform.md](opencode-plugin-platform.md) | Verified OpenCode plugin-runtime facts (hooks, discovery, permissions, tools) from `@opencode-ai/plugin@1.15.13` source. The pinned runtime reference the plugin is built against. |
+| [goal-mode-comparison.md](goal-mode-comparison.md) | How Goal Mode's mechanical enforcement compares to Claude Code and OpenAI Codex, with citations and honest caveats. |
+| [shell-hardening.md](shell-hardening.md) | The shell-analyzer threat model: the bypass classes the old regex guard missed and how the tokenizer closes each. |
+| [benchmarks.md](benchmarks.md) | Benchmark methodology and results (shell guard accuracy plus completion truthfulness). Reproduce charts with `npm run bench` and JSON with `npm run bench:truthfulness`. |
+Every non-obvious platform claim in these documents was verified against the
+installed `@opencode-ai/plugin` type definitions and/or the `sst/opencode`
+source at tag `v1.15.13`. Where a fact is version-specific (e.g. the dormant
+`permission.ask` hook) it is called out as such.

package/research/benchmarks.md ADDED Viewed

@@ -0,0 +1,84 @@
+# Benchmarks
+Reproducible measurements of the destructive-command guard and of
+completion-claim truthfulness, taken from a repository checkout. Run:
+```bash
+npm run bench          # detection / false-positive / latency benchmark
+npm run bench:truthfulness  # print the completion truthfulness benchmark JSON
+npm run bench:compare  # regenerate the capability-comparison chart
+```
+`npm run bench` writes `docs/benchmarks/results.json` and the SVG charts the
+README embeds.
+## Methodology
+- **Corpus** (`benchmarks/corpus.mjs`): 71 real shell commands a coding agent
+  might emit (48 destructive, 23 safe), each labeled `destructive` (a guard must
+  block) or `safe` (a guard must not block). Split into families: *classic*
+  (plain `rm -rf`, `git reset --hard`), *obfuscated* (keyed `bypass` in
+  `results.json` — substitutions, wrappers, `bash -c`, interpreters, weaponized
+  git), *remote-exec* (`curl | sh`), and *safe* (read-only and quoted-text
+  commands, including ones the old guard wrongly blocked).
+- **Baseline** (`benchmarks/legacy-analyzer.mjs`): the original regex classifier,
+  preserved **verbatim** from the first published release (commit `130956d`), so
+  the comparison is apples-to-apples against the same code that shipped.
+- **A command counts as "blocked"** when the analyzer flags it `destructive` or
+  `networkExec` (the two signals `tool.execute.before` throws on). `mutating`
+  marks the session dirty but does not block, so it is not counted here.
+- **Metrics**: detection rate (recall over destructive commands),
+  false-positive rate (safe commands wrongly blocked), and per-command latency.
+- **False Completion Dataset** (`benchmarks/completion-corpus.mjs`): labeled
+  final-answer scenarios covering premature and valid completion claims. It
+  checks that `completion.js` blocks missing review-cycle lines, zero cycles,
+  stale reviews, mismatched cycle counts, and missing contextual gates, and that
+  it allows valid completions and completions in inactive sessions.
+- **Truthfulness Score** (`benchmarks/truthfulness.mjs`): weighted score over the
+  dataset: 65% decision accuracy (blocked vs allowed) and 35% reason accuracy for
+  blocked false-completion claims.
+## Results
+Representative run (Node 22, single-threaded; latency varies by machine, but the
+accuracy figures do not):
+| Metric | Legacy regex guard | Goal Mode analyzer |
+| --- | --- | --- |
+| Detection rate | **20.8%** (10/48) | **100%** (48/48) |
+| False-positive rate | **21.7%** (5/23) | **0%** (0/23) |
+| Detection — classic | 100% | 100% |
+| Detection — obfuscated | 0% (0/35) | 100% (35/35) |
+| Detection — remote-exec | 0% (0/3) | 100% (3/3) |
+| Latency per command | ~2.3 µs | ~3.8 µs |
+False Completion Dataset run:
+| Metric | Goal Mode |
+| --- | --- |
+| Truthfulness score | **100.0%** |
+| Decision accuracy | **100.0%** |
+| Reason accuracy | **100.0%** |
+| False-completion block rate | **100.0%** |
+| Valid-completion allow rate | **100.0%** |
+The legacy guard catches only the *classic* family and misses every obfuscated
+and remote-execution command, while wrongly blocking roughly one in five benign
+commands (5 of 23). The tokenizer catches the entire corpus with zero false
+positives, for an extra ~1.5 µs per command on this run — negligible for a
+per-tool-call guard (still hundreds of thousands of classifications per second).
+## Honesty notes
+- The corpus is hand-built to exercise the known bypass classes; it is a
+  capability benchmark, not a claim of catching *every* possible obfuscation
+  (the analyzer fails open on un-analyzable dynamic commands — see
+  [shell-hardening.md](shell-hardening.md)).
+- The latency comparison is intentionally shown even though the new analyzer is
+  slower: the win is accuracy, and the parse cost is still only a few
+  microseconds per tool-call candidate.
+- "100% on this corpus" means 100% of the labeled set; when a new bypass class
+  is discovered, it is added to the corpus and the analyzer is fixed to catch it
+  (that is how the second-wave findings — `sudo -u`, `pnpm dlx`, interpreter
+  shell-out — entered it).
+- The Truthfulness Score measures accuracy on this corpus of mechanical
+  completion claims; it is not a global claim that an LLM's prose is
+  semantically true in every domain.