npm - opencode-goal-mode - Versions diffs - 0.2.1 → 0.2.4 - Mend

opencode-goal-mode 0.2.1 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

package/ARCHITECTURE.md +16 -7
package/CHANGELOG.md +17 -0
package/README.md +39 -18
package/benchmarks/charts.mjs +176 -0
package/benchmarks/comparison.mjs +48 -0
package/benchmarks/completion-corpus.mjs +70 -0
package/benchmarks/corpus.mjs +92 -0
package/benchmarks/legacy-analyzer.mjs +54 -0
package/benchmarks/run.mjs +198 -0
package/benchmarks/truthfulness.mjs +64 -0
package/commands/goal-evidence-map.md +27 -0
package/docs/benchmarks/capability-matrix.svg +86 -0
package/docs/benchmarks/detection-by-family.svg +37 -0
package/docs/benchmarks/latency.svg +13 -0
package/docs/benchmarks/overall-scorecard.svg +32 -0
package/docs/benchmarks/results.json +176 -0
package/docs/benchmarks/truthfulness-score.svg +17 -0
package/package.json +6 -1
package/plugins/goal-guard/events.js +6 -3
package/plugins/goal-guard/state.js +2 -1
package/plugins/goal-guard/summary.js +105 -1
package/plugins/goal-guard/system.js +3 -0
package/plugins/goal-guard/tools.js +35 -1
package/plugins/goal-guard/verdicts.js +38 -1
package/plugins/goal-guard.js +7 -5
package/research/README.md +18 -0
package/research/benchmarks.md +84 -0
package/research/goal-mode-comparison.md +100 -0
package/research/opencode-plugin-platform.md +89 -0
package/research/shell-hardening.md +62 -0

package/benchmarks/legacy-analyzer.mjs ADDED Viewed

@@ -0,0 +1,54 @@
+/**
+ * The ORIGINAL regex-based shell classifier, preserved verbatim from the first
+ * published version of the plugin (commit 130956d) so the benchmark can compare
+ * it apples-to-apples against the current quote-aware analyzer.
+ *
+ * Do not "improve" this file — its job is to faithfully represent the old
+ * behavior that the new analyzer replaced.
+ */
+const MUTATING_BASH_PATTERNS = [ // a match on any entry marks the command as mutating (see looksLikeMutatingBash)
+  /(^|&&|;|\|\|)\s*(sudo\s+)?(rm|mv|cp|mkdir|rmdir|touch|ln)\b/i, // file-manipulation commands at the start or right after &&, ; or ||, optionally under sudo
+  /(^|&&|;|\|\|)\s*(sudo\s+)?(tee|xargs\s+(rm|mv|cp))\b/i, // tee, or xargs feeding rm/mv/cp
+  /(^|&&|;|\|\|)\s*[^|]*\s(>|>>)\s*(?!\/dev\/null\b)\S+/i, // whitespace-preceded > or >> redirection; a negative lookahead for /dev/null follows the operator
+  /(^|&&|;|\|\|)\s*(perl\s+-pi|sed\s+-i)\b/i, // in-place edits via perl -pi or sed -i
+  /(^|&&|;|\|\|)\s*(npm|pnpm|yarn|bun)\s+(install|ci|add|remove|update)\b/i, // package-manager dependency changes
+  /(^|&&|;|\|\|)\s*(npm|pnpm|yarn|bun)\s+(run\s+)?(format|fix|lint:fix)\b/i, // format/fix/lint:fix scripts, with or without "run"
+  /\b((npx|pnpm\s+exec|yarn)\s+)?(prettier|eslint)\b.*\s(--write|--fix)\b/i, // prettier/eslint followed later by --write or --fix; not anchored to a command start
+  /\b(node|python3?)\b.*\b(writeFile|appendFile|copyFile|rename|unlink|rmSync|mkdir|rmdir|openSync)\b/i, // node/python followed later by one of these identifiers; not anchored to a command start
+];
+export function looksLikeDestructiveBash(command) { // true when any pattern below matches the trimmed command text
+  const normalized = String(command || "").trim(); // falsy input becomes the empty string
+  return [
+    /(^|&&|;|\|\|)\s*(sudo\s+)?rm\s+-[a-zA-Z]*[rR][a-zA-Z]*[rfRF]?\b/, // rm with a single-dash flag cluster containing r or R
+    /(^|&&|;|\|\|)\s*(sudo\s+)?rm\s+(--recursive|--force|--recursive\s+--force|-rf|-fr|-r)\b/, // rm with the listed long or short recursive/force flags
+    /(^|&&|;|\|\|)\s*git\s+reset\b/, // the git patterns have no optional sudo prefix; none of the patterns in this list use the i flag
+    /(^|&&|;|\|\|)\s*git\s+clean\b/,
+    /(^|&&|;|\|\|)\s*git\s+checkout\b/,
+    /(^|&&|;|\|\|)\s*git\s+restore\b/,
+    /(^|&&|;|\|\|)\s*git\s+switch\b/,
+    /(^|&&|;|\|\|)\s*git\s+push\b/,
+    /(^|&&|;|\|\|)\s*(sudo\s+)?find\b.*\s-delete\b/, // find ... -delete
+    /(^|&&|;|\|\|)\s*(sudo\s+)?find\b.*\s-exec\s+rm\b/, // find ... -exec rm
+    /(^|&&|;|\|\|)\s*(sudo\s+)?dd\b.*\bof=\/dev\//,
+    /(^|&&|;|\|\|)\s*(sudo\s+)?mkfs(\.|\s|$)/, // mkfs followed by ".", whitespace, or end of string
+    /(^|&&|;|\|\|)\s*(sudo\s+)?shred\b/,
+    /(^|&&|;|\|\|)\s*(sudo\s+)?truncate\b/,
+    /(^|&&|;|\|\|)\s*(sudo\s+)?chmod\s+-[a-zA-Z]*[rR][a-zA-Z]*[wW][a-zA-Z]*[xX][a-zA-Z]*\s+\/\b/,
+  ].some((pattern) => pattern.test(normalized));
+}
+export function looksLikeMutatingBash(command) { // true for destructive commands and for any MUTATING_BASH_PATTERNS match
+  const normalized = String(command || "").trim(); // falsy input becomes the empty string
+  if (!normalized) return false; // empty or whitespace-only command
+  if (looksLikeDestructiveBash(normalized)) return true; // destructive implies mutating
+  return MUTATING_BASH_PATTERNS.some((pattern) => pattern.test(normalized));
+}
+/**
+ * Adapter to the analyzer signal shape used by the benchmark.
+ *
+ * @param {*} command - Shell command; both classifiers coerce it to a string.
+ * @returns {{destructive: boolean, mutating: boolean, verification: boolean, networkExec: boolean, reasons: Array}}
+ *   Only `destructive` and `mutating` are computed. `verification` and
+ *   `networkExec` are always false and `reasons` is always an empty array.
+ */
+export function analyzeCommand(command) {
+  const destructive = looksLikeDestructiveBash(command);
+  const mutating = looksLikeMutatingBash(command);
+  return { destructive, mutating, verification: false, networkExec: false, reasons: [] };
+}

package/benchmarks/run.mjs ADDED Viewed

@@ -0,0 +1,198 @@
+#!/usr/bin/env node
+/**
+ * Reproducible benchmark of the shell-command guard.
+ *
+ * Compares the original regex classifier (benchmarks/legacy-analyzer.mjs,
+ * preserved verbatim from the first release) against the current quote-aware
+ * analyzer (plugins/goal-guard/shell.js) on a labeled corpus, and measures
+ * detection rate, false-positive rate, and throughput. Writes results.json and
+ * the SVG charts embedded in the README.
+ *
+ *   node benchmarks/run.mjs
+ */
+import { writeFileSync, mkdirSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { performance } from "node:perf_hooks";
+import { CORPUS } from "./corpus.mjs";
+import * as current from "../plugins/goal-guard/shell.js";
+import * as legacy from "./legacy-analyzer.mjs";
+import { groupedBarChart, horizontalBarChart } from "./charts.mjs";
+import { runTruthfulnessBenchmark } from "./truthfulness.mjs";
+const root = fileURLToPath(new URL("..", import.meta.url)); // directory one level above the directory containing this module
+const outDir = join(root, "docs", "benchmarks"); // results.json and the SVG charts are written here
+mkdirSync(outDir, { recursive: true }); // recursive: creates missing parents and does not fail if the directory exists
+/**
+ * A command is "blocked" when flagged destructive or as remote execution.
+ *
+ * @param {{analyzeCommand: Function}} analyzer - Module or object exposing `analyzeCommand(cmd)`.
+ * @param {string} cmd - Shell command text to classify.
+ * @returns {boolean} True when the analysis reports `destructive` or `networkExec`.
+ */
+function blocked(analyzer, cmd) {
+  const a = analyzer.analyzeCommand(cmd);
+  return Boolean(a.destructive || a.networkExec); // `mutating` on its own does not count as blocked
+}
+function evaluate(analyzer) { // tallies detections and false positives for one analyzer over CORPUS, overall and per family
+  const families = {}; // family name -> per-family counters
+  let destTotal = 0;
+  let destCaught = 0;
+  let safeTotal = 0;
+  let safeFalsePos = 0;
+  for (const { cmd, label, family } of CORPUS) {
+    families[family] ??= { destTotal: 0, destCaught: 0, safeTotal: 0, safeFalsePos: 0 }; // create the counters on first sight of a family
+    const isBlocked = blocked(analyzer, cmd);
+    if (label === "destructive") {
+      destTotal += 1;
+      families[family].destTotal += 1;
+      if (isBlocked) {
+        destCaught += 1;
+        families[family].destCaught += 1;
+      }
+    } else { // every label other than "destructive" is scored as safe
+      safeTotal += 1;
+      families[family].safeTotal += 1;
+      if (isBlocked) {
+        safeFalsePos += 1;
+        families[family].safeFalsePos += 1;
+      }
+    }
+  }
+  return {
+    detectionRate: destTotal ? (destCaught / destTotal) * 100 : 0, // percentage; 0 when there are no destructive entries
+    falsePositiveRate: safeTotal ? (safeFalsePos / safeTotal) * 100 : 0, // percentage; 0 when there are no safe entries
+    destCaught,
+    destTotal,
+    safeFalsePos,
+    safeTotal,
+    families,
+  };
+}
+function throughput(analyzer) { // returns analyzeCommand calls per second over CORPUS, rounded to an integer
+  const cmds = CORPUS.map((c) => c.cmd);
+  // Warm up: one untimed pass over the corpus before the measured loop.
+  for (const c of cmds) analyzer.analyzeCommand(c);
+  const iterations = 4000; // number of timed passes over the whole corpus
+  const start = performance.now();
+  for (let i = 0; i < iterations; i += 1) {
+    for (const c of cmds) analyzer.analyzeCommand(c);
+  }
+  const ms = performance.now() - start; // elapsed milliseconds for all timed passes
+  const ops = (iterations * cmds.length) / (ms / 1000); // total calls divided by elapsed seconds
+  return Math.round(ops);
+}
+/**
+ * Locale-independent thousands grouping (the host locale may use '.' as separator).
+ *
+ * @param {number} n - Value to format; it is rounded to the nearest integer first.
+ * @returns {string} The integer's decimal digits with "," inserted before each group of three.
+ */
+function fmt(n) {
+  return Math.round(n)
+    .toString()
+    .replace(/\B(?=(\d{3})+(?!\d))/g, ","); // insert "," at every non-boundary position followed by a multiple of three digits
+}
+const legacyEval = evaluate(legacy); // accuracy tallies for the preserved regex classifier
+const currentEval = evaluate(current); // accuracy tallies for the current analyzer
+const truthfulness = runTruthfulnessBenchmark();
+const legacyOps = throughput(legacy);
+const currentOps = throughput(current);
+const legacyUs = 1e6 / legacyOps; // microseconds per command = 1e6 / (commands per second)
+const currentUs = 1e6 / currentOps;
+const FAMILY_LABELS = { // display names for the family keys charted below
+  classic: "Classic",
+  bypass: "Obfuscated",
+  "remote-exec": "Remote exec",
+};
+const detFamilies = ["classic", "bypass", "remote-exec"]; // order used by the family chart and the console report
+function familyRate(ev, fam) { // detection percentage for one family of an evaluate() result
+  const f = ev.families[fam];
+  return f && f.destTotal ? (f.destCaught / f.destTotal) * 100 : 0; // 0 when the family is absent or has no destructive entries
+}
+const results = { // the object serialized to results.json below
+  corpusSize: CORPUS.length,
+  destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
+  safeCount: CORPUS.filter((c) => c.label === "safe").length, // NOTE(review): counts only label === "safe", while evaluate() scores every non-"destructive" label as safe
+  legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
+  current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
+  truthfulness,
+};
+writeFileSync(join(outDir, "results.json"), JSON.stringify(results, null, 2)); // pretty-printed with 2-space indentation
+// Chart 1: detection rate by command family (one bar group per family, legacy vs current analyzer).
+writeFileSync(
+  join(outDir, "detection-by-family.svg"),
+  groupedBarChart({
+    title: "Destructive-command detection rate by family",
+    subtitle: `Higher is better. Corpus: ${results.destructiveCount} destructive commands.`,
+    groups: detFamilies.map((f) => FAMILY_LABELS[f]), // display names, in detFamilies order
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: detFamilies.map((f) => familyRate(legacyEval, f)) },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: detFamilies.map((f) => familyRate(currentEval, f)) },
+    ],
+  }),
+);
+// Chart 2: overall scorecard (detection up, false positives down); values are percentages from evaluate().
+writeFileSync(
+  join(outDir, "overall-scorecard.svg"),
+  groupedBarChart({
+    title: "Overall guard accuracy",
+    subtitle: "Detection rate (higher better) vs false-positive rate (lower better).",
+    groups: ["Detection rate", "False-positive rate"],
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: [legacyEval.detectionRate, legacyEval.falsePositiveRate] },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: [currentEval.detectionRate, currentEval.falsePositiveRate] },
+    ],
+  }),
+);
+// Chart 3: per-command latency — the deeper analysis costs a few microseconds,
+// which is negligible for a tool-call guard. Shown for honesty, not as a "win".
+writeFileSync(
+  join(outDir, "latency.svg"),
+  horizontalBarChart({
+    title: "Per-command analysis latency",
+    subtitle: "Microseconds to classify one command. Both are negligible for a tool-call guard.",
+    unit: " µs",
+    max: Math.max(legacyUs, currentUs) * 1.4, // scale maximum is 1.4x the larger of the two latencies
+    rows: [
+      { label: "Legacy regex guard", value: legacyUs, display: `${legacyUs.toFixed(2)} µs`, color: "#9aa0a6" },
+      { label: "Goal Mode analyzer", value: currentUs, display: `${currentUs.toFixed(2)} µs`, color: "#2da44e" },
+    ],
+  }),
+);
+writeFileSync( // Chart 4: the three truthfulness percentages from runTruthfulnessBenchmark()
+  join(outDir, "truthfulness-score.svg"),
+  horizontalBarChart({
+    title: "Benchmark Truthfulness Score",
+    subtitle: `False Completion Dataset: ${truthfulness.corpusSize} labeled completion-claim cases.`,
+    unit: "%",
+    max: 100, // percentages, so the scale tops out at 100
+    rows: [
+      { label: "Truthfulness score", value: truthfulness.score, display: `${truthfulness.score.toFixed(1)}%`, color: "#2da44e" },
+      { label: "Decision accuracy", value: truthfulness.decisionAccuracy, display: `${truthfulness.decisionAccuracy.toFixed(1)}%`, color: "#0969da" },
+      { label: "Reason accuracy", value: truthfulness.reasonAccuracy, display: `${truthfulness.reasonAccuracy.toFixed(1)}%`, color: "#bf8700" },
+    ],
+  }),
+);
+const pct = (n) => `${n.toFixed(1)}%`; // one-decimal percentage string for the console summary
+console.log("Goal Mode shell-guard benchmark");
+console.log("================================");
+console.log(`Corpus: ${results.corpusSize} commands (${results.destructiveCount} destructive, ${results.safeCount} safe)`);
+console.log("");
+console.log(`Detection rate   legacy ${pct(legacyEval.detectionRate)}   →   Goal Mode ${pct(currentEval.detectionRate)}`);
+console.log(`False positives  legacy ${pct(legacyEval.falsePositiveRate)}   →   Goal Mode ${pct(currentEval.falsePositiveRate)}`);
+console.log(`Latency          legacy ${legacyUs.toFixed(2)} µs/cmd   →   Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`); // the ops/s figure is shown for the current analyzer only
+console.log(`Truthfulness    False Completion Dataset score ${truthfulness.score.toFixed(1)}% (${truthfulness.corpusSize} cases)`);
+console.log("");
+console.log("By family (detection rate):");
+for (const f of detFamilies) {
+  console.log(`  ${FAMILY_LABELS[f].padEnd(12)} legacy ${pct(familyRate(legacyEval, f)).padStart(6)}  →  Goal Mode ${pct(familyRate(currentEval, f)).padStart(6)}`); // padEnd/padStart keep the columns aligned
+}
+console.log("");
+console.log(`Wrote results.json + 4 SVG charts to docs/benchmarks/`); // the four SVG files written by the writeFileSync calls above

package/benchmarks/truthfulness.mjs ADDED Viewed

@@ -0,0 +1,64 @@
+import { BASE_GATES } from "../plugins/goal-guard/agents.js";
+import { DEFAULT_CONFIG } from "../plugins/goal-guard/config.js";
+import { evaluateCompletionClaim } from "../plugins/goal-guard/completion.js";
+import { refreshStickyGates } from "../plugins/goal-guard/gates.js";
+import { createState } from "../plugins/goal-guard/state.js";
+import { FALSE_COMPLETION_CORPUS } from "./completion-corpus.mjs";
+import { fileURLToPath } from "node:url";
+function buildState(def) { // builds the guard state for one corpus case definition
+  const state = createState("2026-01-01T00:00:00.000Z"); // fixed timestamp string, so every run passes createState the same input
+  Object.assign(state, def.state || {}); // overlay the case's fields; this also copies def.state.verdicts and contextualGate onto state
+  state.latestVerdict = {}; // reset, then rebuilt from def.state.verdicts below
+  for (const gate of def.state?.verdicts || []) {
+    state.latestVerdict[gate.agent] = { verdict: gate.verdict, at: "2026-01-01T00:00:01.000Z", seq: gate.seq }; // keyed by agent, so a later entry for the same agent overwrites an earlier one
+  }
+  if (!state.goalText && def.state?.contextualGate) state.goalText = def.state.contextualGate; // use contextualGate as the goal text when the case supplies none
+  refreshStickyGates(state); // called for its effect on state; the return value is not used here
+  return state;
+}
+export function evaluateTruthfulnessCase(def) { // runs one corpus case through evaluateCompletionClaim and grades the decision
+  const config = { ...DEFAULT_CONFIG, ...(def.config || {}) }; // per-case config keys override the defaults
+  const decision = evaluateCompletionClaim(buildState(def), config, def.text);
+  const reason = decision.reason || ""; // a missing reason is treated as the empty string
+  const decisionCorrect = Boolean(decision.blocked) === Boolean(def.expected.blocked);
+  const reasonCorrect = !def.expected.blocked || reason.includes(def.expected.reasonIncludes || ""); // always true for expected-allowed cases; a missing reasonIncludes ("" substring) also always passes
+  return {
+    id: def.id,
+    family: def.family,
+    expectedBlocked: Boolean(def.expected.blocked),
+    actualBlocked: Boolean(decision.blocked),
+    decisionCorrect,
+    reasonCorrect,
+    reason,
+  };
+}
+export function runTruthfulnessBenchmark(corpus = FALSE_COMPLETION_CORPUS) {
+  const cases = corpus.map(evaluateTruthfulnessCase);
+  const falseCompletionCases = cases.filter((c) => c.expectedBlocked);
+  const trueCompletionCases = cases.filter((c) => !c.expectedBlocked);
+  const decisionCorrect = cases.filter((c) => c.decisionCorrect).length;
+  const reasonCorrect = falseCompletionCases.filter((c) => c.reasonCorrect).length;
+  const falseCompletionBlocked = falseCompletionCases.filter((c) => c.actualBlocked).length;
+  const trueCompletionAllowed = trueCompletionCases.filter((c) => !c.actualBlocked).length;
+  const decisionAccuracy = cases.length ? (decisionCorrect / cases.length) * 100 : 0;
+  const reasonAccuracy = falseCompletionCases.length ? (reasonCorrect / falseCompletionCases.length) * 100 : 100;
+  return {
+    name: "False Completion Dataset",
+    corpusSize: cases.length,
+    requiredBaseGates: BASE_GATES,
+    score: Number(((decisionAccuracy * 0.65 + reasonAccuracy * 0.35)).toFixed(1)),
+    decisionAccuracy: Number(decisionAccuracy.toFixed(1)),
+    reasonAccuracy: Number(reasonAccuracy.toFixed(1)),
+    falseCompletionBlockRate: Number(((falseCompletionBlocked / falseCompletionCases.length) * 100).toFixed(1)),
+    validCompletionAllowRate: Number(((trueCompletionAllowed / trueCompletionCases.length) * 100).toFixed(1)),
+    cases,
+  };
+}
+if (process.argv[1] === fileURLToPath(import.meta.url)) { // true only when this file is the script path given to node, not when it is imported
+  const result = runTruthfulnessBenchmark();
+  console.log(JSON.stringify(result, null, 2)); // print the full result, including per-case details, as pretty JSON
+}

package/commands/goal-evidence-map.md ADDED Viewed

@@ -0,0 +1,27 @@
+---
+description: Map Goal Contract acceptance criteria to recorded verification evidence and gaps.
+agent: goal
+---
+Produce a read-only evidence map for the current Goal Mode session. Do not edit files.
+Call `goal_evidence_map` first and use its authoritative Goal Guard state,
+including the Goal Contract, recorded evidence, dirty state, reviewer status, and
+any user-provided context. Report unknown or missing details honestly instead of
+inferring evidence that is not recorded.
+Include:
+- Acceptance criterion
+- Recorded evidence covering it
+- Reviewer status
+- Verification command/result summary
+- Status: covered, partially covered, missing, or stale
+- Gap or risk
+- Next required action
+Additional context:
+```text
+$ARGUMENTS
+```

package/docs/benchmarks/capability-matrix.svg ADDED Viewed

@@ -0,0 +1,86 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="760" height="496" viewBox="0 0 760 496" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="760" height="496" fill="#ffffff"/>
+<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Mechanically-enforced goal discipline</text>
+<text x="20" y="47" font-size="12" fill="#656d76">Enforced = guaranteed by the harness; Prompt-only / Partial = depends on the model or user config.</text>
+<text x="374.0" y="62" font-size="12.5" font-weight="700" text-anchor="middle" fill="#1f2328">Goal Mode</text>
+<text x="522.0" y="62" font-size="12.5" font-weight="700" text-anchor="middle" fill="#1f2328">Claude Code</text>
+<text x="670.0" y="62" font-size="12.5" font-weight="700" text-anchor="middle" fill="#1f2328">Codex</text>
+<text x="286" y="93" font-size="12" text-anchor="end" fill="#1f2328">Autonomous goal loop</text>
+<rect x="304.0" y="74" width="140.0" height="30" rx="4" fill="#dbe9d5"/>
+<text x="374.0" y="93" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Prompt-only</text>
+<rect x="452.0" y="74" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="522.0" y="93" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="600.0" y="74" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="670.0" y="93" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<text x="286" y="131" font-size="12" text-anchor="end" fill="#1f2328">Review gate before “done”</text>
+<rect x="304.0" y="112" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="131" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="112" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="522.0" y="131" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="600.0" y="112" width="140.0" height="30" rx="4" fill="#dbe9d5"/>
+<text x="670.0" y="131" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Prompt-only</text>
+<text x="286" y="169" font-size="12" text-anchor="end" fill="#1f2328">Contextual specialist reviews</text>
+<rect x="304.0" y="150" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="169" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="150" width="140.0" height="30" rx="4" fill="#dbe9d5"/>
+<text x="522.0" y="169" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Prompt-only</text>
+<rect x="600.0" y="150" width="140.0" height="30" rx="4" fill="#dbe9d5"/>
+<text x="670.0" y="169" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Prompt-only</text>
+<text x="286" y="207" font-size="12" text-anchor="end" fill="#1f2328">Stale-review invalidation on edit</text>
+<rect x="304.0" y="188" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="207" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="188" width="140.0" height="30" rx="4" fill="#eaeef2"/>
+<text x="522.0" y="207" font-size="11" font-weight="600" text-anchor="middle" fill="#656d76">None</text>
+<rect x="600.0" y="188" width="140.0" height="30" rx="4" fill="#eaeef2"/>
+<text x="670.0" y="207" font-size="11" font-weight="600" text-anchor="middle" fill="#656d76">None</text>
+<text x="286" y="245" font-size="12" text-anchor="end" fill="#1f2328">Completion-claim enforcement</text>
+<rect x="304.0" y="226" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="245" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="226" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="522.0" y="245" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="600.0" y="226" width="140.0" height="30" rx="4" fill="#eaeef2"/>
+<text x="670.0" y="245" font-size="11" font-weight="600" text-anchor="middle" fill="#656d76">None</text>
+<text x="286" y="283" font-size="12" text-anchor="end" fill="#1f2328">Destructive-command blocking</text>
+<rect x="304.0" y="264" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="283" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="264" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="522.0" y="283" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="600.0" y="264" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="670.0" y="283" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<text x="286" y="321" font-size="12" text-anchor="end" fill="#1f2328">Remote-exec (curl | sh) blocking</text>
+<rect x="304.0" y="302" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="321" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="302" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="522.0" y="321" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="600.0" y="302" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="670.0" y="321" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<text x="286" y="359" font-size="12" text-anchor="end" fill="#1f2328">Enforcement state survives restart</text>
+<rect x="304.0" y="340" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="359" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="340" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="522.0" y="359" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="600.0" y="340" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="670.0" y="359" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<text x="286" y="397" font-size="12" text-anchor="end" fill="#1f2328">State survives compaction</text>
+<rect x="304.0" y="378" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="397" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="378" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="522.0" y="397" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="600.0" y="378" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="670.0" y="397" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<text x="286" y="435" font-size="12" text-anchor="end" fill="#1f2328">Custom enforcement hooks/tools</text>
+<rect x="304.0" y="416" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="374.0" y="435" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="452.0" y="416" width="140.0" height="30" rx="4" fill="#2da44e"/>
+<text x="522.0" y="435" font-size="11" font-weight="600" text-anchor="middle" fill="#ffffff">Enforced</text>
+<rect x="600.0" y="416" width="140.0" height="30" rx="4" fill="#d4a72c"/>
+<text x="670.0" y="435" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">Partial</text>
+<rect x="286" y="461" width="12" height="12" rx="2" fill="#2da44e"/>
+<text x="303" y="472" font-size="11.5" fill="#1f2328">Enforced</text>
+<rect x="372" y="461" width="12" height="12" rx="2" fill="#d4a72c"/>
+<text x="389" y="472" font-size="11.5" fill="#1f2328">Partial</text>
+<rect x="451" y="461" width="12" height="12" rx="2" fill="#dbe9d5"/>
+<text x="468" y="472" font-size="11.5" fill="#1f2328">Prompt-only</text>
+<rect x="558" y="461" width="12" height="12" rx="2" fill="#eaeef2"/>
+<text x="575" y="472" font-size="11.5" fill="#1f2328">None</text>
+</svg>

package/docs/benchmarks/detection-by-family.svg ADDED Viewed

@@ -0,0 +1,37 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="720" height="380" fill="#ffffff"/>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Destructive-command detection rate by family</text>
+<text x="48" y="47" font-size="12" fill="#656d76">Higher is better. Corpus: 48 destructive commands.</text>
+<line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
+<line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="253.6" font-size="11" text-anchor="end" fill="#656d76">20%</text>
+<line x1="48" y1="203.2" x2="700" y2="203.2" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="207.2" font-size="11" text-anchor="end" fill="#656d76">40%</text>
+<line x1="48" y1="156.8" x2="700" y2="156.8" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="160.8" font-size="11" text-anchor="end" fill="#656d76">60%</text>
+<line x1="48" y1="110.4" x2="700" y2="110.4" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="114.4" font-size="11" text-anchor="end" fill="#656d76">80%</text>
+<line x1="48" y1="64.0" x2="700" y2="64.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="68.0" font-size="11" text-anchor="end" fill="#656d76">100%</text>
+<rect x="56.0" y="64.0" width="96.7" height="232.0" rx="3" fill="#9aa0a6"/>
+<text x="104.3" y="59.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">100%</text>
+<rect x="160.7" y="64.0" width="96.7" height="232.0" rx="3" fill="#2da44e"/>
+<text x="209.0" y="59.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">100%</text>
+<text x="156.7" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Classic</text>
+<rect x="273.3" y="296.0" width="96.7" height="0.0" rx="3" fill="#9aa0a6"/>
+<text x="321.7" y="291.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
+<rect x="378.0" y="64.0" width="96.7" height="232.0" rx="3" fill="#2da44e"/>
+<text x="426.3" y="59.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">100%</text>
+<text x="374.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Obfuscated</text>
+<rect x="490.7" y="296.0" width="96.7" height="0.0" rx="3" fill="#9aa0a6"/>
+<text x="539.0" y="291.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
+<rect x="595.3" y="64.0" width="96.7" height="232.0" rx="3" fill="#2da44e"/>
+<text x="643.7" y="59.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">100%</text>
+<text x="591.3" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Remote exec</text>
+<line x1="48" y1="296" x2="700" y2="296" stroke="#d0d7de" stroke-width="1.5"/>
+<rect x="48" y="344" width="12" height="12" rx="2" fill="#9aa0a6"/>
+<text x="66" y="354" font-size="12" fill="#1f2328">Legacy regex guard</text>
+<rect x="201.6" y="344" width="12" height="12" rx="2" fill="#2da44e"/>
+<text x="219.6" y="354" font-size="12" fill="#1f2328">Goal Mode analyzer</text>
+</svg>

package/docs/benchmarks/latency.svg ADDED Viewed

@@ -0,0 +1,13 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="720" height="164" viewBox="0 0 720 164" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="720" height="164" fill="#ffffff"/>
+<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Per-command analysis latency</text>
+<text x="20" y="47" font-size="12" fill="#656d76">Microseconds to classify one command. Both are negligible for a tool-call guard.</text>
+<text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Legacy regex guard</text>
+<rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="70" width="179.4" height="22" rx="3" fill="#9aa0a6"/>
+<text x="417.4" y="87" font-size="12" font-weight="600" fill="#1f2328">2.24 µs</text>
+<text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Goal Mode analyzer</text>
+<rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="108" width="300.0" height="22" rx="3" fill="#2da44e"/>
+<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.75 µs</text>
+</svg>

package/docs/benchmarks/overall-scorecard.svg ADDED Viewed

@@ -0,0 +1,32 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="720" height="380" fill="#ffffff"/>
+<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Overall guard accuracy</text>
+<text x="48" y="47" font-size="12" fill="#656d76">Detection rate (higher better) vs false-positive rate (lower better).</text>
+<line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
+<line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="253.6" font-size="11" text-anchor="end" fill="#656d76">20%</text>
+<line x1="48" y1="203.2" x2="700" y2="203.2" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="207.2" font-size="11" text-anchor="end" fill="#656d76">40%</text>
+<line x1="48" y1="156.8" x2="700" y2="156.8" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="160.8" font-size="11" text-anchor="end" fill="#656d76">60%</text>
+<line x1="48" y1="110.4" x2="700" y2="110.4" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="114.4" font-size="11" text-anchor="end" fill="#656d76">80%</text>
+<line x1="48" y1="64.0" x2="700" y2="64.0" stroke="#eaeef2" stroke-width="1"/>
+<text x="40" y="68.0" font-size="11" text-anchor="end" fill="#656d76">100%</text>
+<rect x="56.0" y="247.7" width="151.0" height="48.3" rx="3" fill="#9aa0a6"/>
+<text x="131.5" y="242.7" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">21%</text>
+<rect x="215.0" y="64.0" width="151.0" height="232.0" rx="3" fill="#2da44e"/>
+<text x="290.5" y="59.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">100%</text>
+<text x="211.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Detection rate</text>
+<rect x="382.0" y="245.6" width="151.0" height="50.4" rx="3" fill="#9aa0a6"/>
+<text x="457.5" y="240.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">22%</text>
+<rect x="541.0" y="296.0" width="151.0" height="0.0" rx="3" fill="#2da44e"/>
+<text x="616.5" y="291.0" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
+<text x="537.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">False-positive rate</text>
+<line x1="48" y1="296" x2="700" y2="296" stroke="#d0d7de" stroke-width="1.5"/>
+<rect x="48" y="344" width="12" height="12" rx="2" fill="#9aa0a6"/>
+<text x="66" y="354" font-size="12" fill="#1f2328">Legacy regex guard</text>
+<rect x="201.6" y="344" width="12" height="12" rx="2" fill="#2da44e"/>
+<text x="219.6" y="354" font-size="12" fill="#1f2328">Goal Mode analyzer</text>
+</svg>