npm - opencode-goal-mode - Versions diffs - 0.2.2 → 0.2.4 - Mend

opencode-goal-mode 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/ARCHITECTURE.md +16 -7
package/CHANGELOG.md +9 -0
package/README.md +26 -8
package/benchmarks/charts.mjs +176 -0
package/benchmarks/comparison.mjs +48 -0
package/benchmarks/completion-corpus.mjs +70 -0
package/benchmarks/corpus.mjs +92 -0
package/benchmarks/legacy-analyzer.mjs +54 -0
package/benchmarks/run.mjs +198 -0
package/benchmarks/truthfulness.mjs +64 -0
package/commands/goal-evidence-map.md +27 -0
package/docs/benchmarks/latency.svg +3 -3
package/docs/benchmarks/results.json +103 -4
package/docs/benchmarks/truthfulness-score.svg +17 -0
package/package.json +3 -1
package/plugins/goal-guard/events.js +6 -3
package/plugins/goal-guard/state.js +2 -1
package/plugins/goal-guard/summary.js +105 -1
package/plugins/goal-guard/system.js +3 -0
package/plugins/goal-guard/tools.js +35 -1
package/plugins/goal-guard/verdicts.js +38 -1
package/plugins/goal-guard.js +7 -5
package/research/README.md +1 -1
package/research/benchmarks.md +21 -0

package/benchmarks/run.mjs ADDED Viewed

@@ -0,0 +1,198 @@
+#!/usr/bin/env node
+/**
+ * Reproducible benchmark of the shell-command guard.
+ *
+ * Compares the original regex classifier (benchmarks/legacy-analyzer.mjs,
+ * preserved verbatim from the first release) against the current quote-aware
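+ * analyzer (plugins/goal-guard/shell.js) on a labeled corpus, and measures
+ * detection rate, false-positive rate, and throughput. Also runs the
+ * completion-claim truthfulness benchmark (benchmarks/truthfulness.mjs) and
+ * includes its report. Writes results.json and the SVG charts embedded in the
+ * README.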
+ *
+ *   node benchmarks/run.mjs
+ */
+import { writeFileSync, mkdirSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { performance } from "node:perf_hooks";
+import { CORPUS } from "./corpus.mjs";
+import * as current from "../plugins/goal-guard/shell.js";
+import * as legacy from "./legacy-analyzer.mjs";
+import { groupedBarChart, horizontalBarChart } from "./charts.mjs";
+import { runTruthfulnessBenchmark } from "./truthfulness.mjs";
+// Resolve the parent directory of benchmarks/ from this module's URL and make
+// sure docs/benchmarks exists before any result file is written.
+const root = fileURLToPath(new URL("..", import.meta.url));
+const outDir = join(root, "docs", "benchmarks");
+mkdirSync(outDir, { recursive: true });
+/**
+ * A command is "blocked" when flagged destructive or as remote execution.
+ *
+ * @param {object} analyzer - Module exposing `analyzeCommand(cmd)`.
+ * @param {string} cmd - Command text to classify.
+ * @returns {boolean} True when the analysis has a truthy `destructive` or `networkExec` field.
+ */
+function blocked(analyzer, cmd) {
+  const a = analyzer.analyzeCommand(cmd);
+  return Boolean(a.destructive || a.networkExec);
+}
+/**
+ * Score one analyzer against every entry of CORPUS.
+ *
+ * Entries labelled "destructive" count toward detection; every other label is
+ * treated as safe and counts toward false positives. The same four counters
+ * are also kept per `family`.
+ *
+ * @param {object} analyzer - Module exposing `analyzeCommand(cmd)`.
+ * @returns {object} Overall counters, per-family counters, and `detectionRate` /
+ *   `falsePositiveRate` as percentages (0 when the matching total is 0).
+ */
+function evaluate(analyzer) {
+  const families = {};
+  let destTotal = 0;
+  let destCaught = 0;
+  let safeTotal = 0;
+  let safeFalsePos = 0;
+  for (const { cmd, label, family } of CORPUS) {
+    // Create the per-family bucket the first time a family is seen.
+    families[family] ??= { destTotal: 0, destCaught: 0, safeTotal: 0, safeFalsePos: 0 };
+    const isBlocked = blocked(analyzer, cmd);
+    if (label === "destructive") {
+      destTotal += 1;
+      families[family].destTotal += 1;
+      if (isBlocked) {
+        destCaught += 1;
+        families[family].destCaught += 1;
+      }
+    } else {
+      safeTotal += 1;
+      families[family].safeTotal += 1;
+      if (isBlocked) {
+        safeFalsePos += 1;
+        families[family].safeFalsePos += 1;
+      }
+    }
+  }
+  return {
+    detectionRate: destTotal ? (destCaught / destTotal) * 100 : 0,
+    falsePositiveRate: safeTotal ? (safeFalsePos / safeTotal) * 100 : 0,
+    destCaught,
+    destTotal,
+    safeFalsePos,
+    safeTotal,
+    families,
+  };
+}
+/**
+ * Time `analyzer.analyzeCommand` over the whole corpus.
+ *
+ * Runs one untimed pass first, then 4000 timed passes over every corpus
+ * command.
+ *
+ * @param {object} analyzer - Module exposing `analyzeCommand(cmd)`.
+ * @returns {number} Commands analysed per second, rounded to an integer.
+ */
+function throughput(analyzer) {
+  const cmds = CORPUS.map((c) => c.cmd);
+  // Warm up.
+  for (const c of cmds) analyzer.analyzeCommand(c);
+  const iterations = 4000;
+  const start = performance.now();
+  for (let i = 0; i < iterations; i += 1) {
+    for (const c of cmds) analyzer.analyzeCommand(c);
+  }
+  const ms = performance.now() - start;
+  // Total calls divided by elapsed seconds.
+  const ops = (iterations * cmds.length) / (ms / 1000);
+  return Math.round(ops);
+}
+/**
+ * Locale-independent thousands grouping (the host locale may use '.' as separator).
+ *
+ * @param {number} n - Value to format; it is rounded to an integer first.
+ * @returns {string} Digits with a ',' inserted before each group of three, e.g. 1234567 -> "1,234,567".
+ */
+function fmt(n) {
+  return Math.round(n)
+    .toString()
+    .replace(/\B(?=(\d{3})+(?!\d))/g, ",");
+}
+// Score both analyzers on the corpus, run the truthfulness benchmark, then
+// time each analyzer.
+const legacyEval = evaluate(legacy);
+const currentEval = evaluate(current);
+const truthfulness = runTruthfulnessBenchmark();
+const legacyOps = throughput(legacy);
+const currentOps = throughput(current);
+// Convert commands per second into microseconds per command.
+const legacyUs = 1e6 / legacyOps;
+const currentUs = 1e6 / currentOps;
+// Display names for the corpus families, keyed by the `family` value used in CORPUS.
+const FAMILY_LABELS = {
+  classic: "Classic",
+  bypass: "Obfuscated",
+  "remote-exec": "Remote exec",
+};
+// Families shown, in this order, in the detection chart and the console table.
+const detFamilies = ["classic", "bypass", "remote-exec"];
+/**
+ * Detection rate for one command family.
+ *
+ * @param {object} ev - Result of `evaluate`, read through its `families` map.
+ * @param {string} fam - Family key.
+ * @returns {number} Percentage of the family's destructive commands that were caught;
+ *   0 when the family is absent or has no destructive commands.
+ */
+function familyRate(ev, fam) {
+  const f = ev.families[fam];
+  return f && f.destTotal ? (f.destCaught / f.destTotal) * 100 : 0;
+}
+// Everything written to results.json: corpus counts, both analyzers' accuracy
+// and timing figures, and the truthfulness report.
+const results = {
+  corpusSize: CORPUS.length,
+  destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
+  safeCount: CORPUS.filter((c) => c.label === "safe").length,
+  legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
+  current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
+  truthfulness,
+};
+writeFileSync(join(outDir, "results.json"), JSON.stringify(results, null, 2));
+// Chart 1: detection rate by command family.
+writeFileSync(
+  join(outDir, "detection-by-family.svg"),
+  groupedBarChart({
+    title: "Destructive-command detection rate by family",
+    subtitle: `Higher is better. Corpus: ${results.destructiveCount} destructive commands.`,
+    groups: detFamilies.map((f) => FAMILY_LABELS[f]),
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: detFamilies.map((f) => familyRate(legacyEval, f)) },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: detFamilies.map((f) => familyRate(currentEval, f)) },
+    ],
+  }),
+);
+// Chart 2: overall scorecard (detection up, false positives down).
+writeFileSync(
+  join(outDir, "overall-scorecard.svg"),
+  groupedBarChart({
+    title: "Overall guard accuracy",
+    subtitle: "Detection rate (higher better) vs false-positive rate (lower better).",
+    groups: ["Detection rate", "False-positive rate"],
+    series: [
+      { name: "Legacy regex guard", color: "#9aa0a6", values: [legacyEval.detectionRate, legacyEval.falsePositiveRate] },
+      { name: "Goal Mode analyzer", color: "#2da44e", values: [currentEval.detectionRate, currentEval.falsePositiveRate] },
+    ],
+  }),
+);
+// Chart 3: per-command latency — the deeper analysis costs a few microseconds,
+// which is negligible for a tool-call guard. Shown for honesty, not as a "win".
+writeFileSync(
+  join(outDir, "latency.svg"),
+  horizontalBarChart({
+    title: "Per-command analysis latency",
+    subtitle: "Microseconds to classify one command. Both are negligible for a tool-call guard.",
+    unit: " µs",
+    max: Math.max(legacyUs, currentUs) * 1.4,
+    rows: [
+      { label: "Legacy regex guard", value: legacyUs, display: `${legacyUs.toFixed(2)} µs`, color: "#9aa0a6" },
+      { label: "Goal Mode analyzer", value: currentUs, display: `${currentUs.toFixed(2)} µs`, color: "#2da44e" },
+    ],
+  }),
+);
+// Chart 4: truthfulness benchmark — overall score, decision accuracy and
+// reason accuracy, each drawn on a 0–100% scale.
+writeFileSync(
+  join(outDir, "truthfulness-score.svg"),
+  horizontalBarChart({
+    title: "Benchmark Truthfulness Score",
+    subtitle: `False Completion Dataset: ${truthfulness.corpusSize} labeled completion-claim cases.`,
+    unit: "%",
+    max: 100,
+    rows: [
+      { label: "Truthfulness score", value: truthfulness.score, display: `${truthfulness.score.toFixed(1)}%`, color: "#2da44e" },
+      { label: "Decision accuracy", value: truthfulness.decisionAccuracy, display: `${truthfulness.decisionAccuracy.toFixed(1)}%`, color: "#0969da" },
+      { label: "Reason accuracy", value: truthfulness.reasonAccuracy, display: `${truthfulness.reasonAccuracy.toFixed(1)}%`, color: "#bf8700" },
+    ],
+  }),
+);
+// Console summary of the run. `pct` renders a number as a percentage with one decimal.
+const pct = (n) => `${n.toFixed(1)}%`;
+console.log("Goal Mode shell-guard benchmark");
+console.log("================================");
+console.log(`Corpus: ${results.corpusSize} commands (${results.destructiveCount} destructive, ${results.safeCount} safe)`);
+console.log("");
+console.log(`Detection rate   legacy ${pct(legacyEval.detectionRate)}   →   Goal Mode ${pct(currentEval.detectionRate)}`);
+console.log(`False positives  legacy ${pct(legacyEval.falsePositiveRate)}   →   Goal Mode ${pct(currentEval.falsePositiveRate)}`);
+console.log(`Latency          legacy ${legacyUs.toFixed(2)} µs/cmd   →   Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
+console.log(`Truthfulness    False Completion Dataset score ${truthfulness.score.toFixed(1)}% (${truthfulness.corpusSize} cases)`);
+console.log("");
+console.log("By family (detection rate):");
+// One aligned row per family: label padded to 12 columns, rates right-aligned to 6.
+for (const f of detFamilies) {
+  console.log(`  ${FAMILY_LABELS[f].padEnd(12)} legacy ${pct(familyRate(legacyEval, f)).padStart(6)}  →  Goal Mode ${pct(familyRate(currentEval, f)).padStart(6)}`);
+}
+console.log("");
+console.log(`Wrote results.json + 4 SVG charts to docs/benchmarks/`);

package/benchmarks/truthfulness.mjs ADDED Viewed

@@ -0,0 +1,64 @@
+import { BASE_GATES } from "../plugins/goal-guard/agents.js";
+import { DEFAULT_CONFIG } from "../plugins/goal-guard/config.js";
+import { evaluateCompletionClaim } from "../plugins/goal-guard/completion.js";
+import { refreshStickyGates } from "../plugins/goal-guard/gates.js";
+import { createState } from "../plugins/goal-guard/state.js";
+import { FALSE_COMPLETION_CORPUS } from "./completion-corpus.mjs";
+import { fileURLToPath } from "node:url";
+/**
+ * Build the guard state a corpus case should be evaluated against.
+ *
+ * Starts from `createState` with a fixed timestamp, overlays `def.state`,
+ * rebuilds `latestVerdict` from `def.state.verdicts`, and finally calls
+ * `refreshStickyGates` on the result.
+ *
+ * @param {object} def - Corpus case; only `def.state` is read here.
+ * @returns {object} The prepared state object.
+ */
+function buildState(def) {
+  const state = createState("2026-01-01T00:00:00.000Z");
+  Object.assign(state, def.state || {});
+  // Always start from an empty map, then index the case's verdicts by agent.
+  state.latestVerdict = {};
+  for (const gate of def.state?.verdicts || []) {
+    state.latestVerdict[gate.agent] = { verdict: gate.verdict, at: "2026-01-01T00:00:01.000Z", seq: gate.seq };
+  }
+  // When the case sets no goal text, use its `contextualGate` value as the goal text.
+  if (!state.goalText && def.state?.contextualGate) state.goalText = def.state.contextualGate;
+  refreshStickyGates(state);
+  return state;
+}
+/**
+ * Evaluate one corpus case with `evaluateCompletionClaim` and compare the
+ * outcome with the case's expectation.
+ *
+ * @param {object} def - Corpus case. Reads `id`, `family`, `text`, optional `config`
+ *   (merged over DEFAULT_CONFIG), optional `state`, and `expected.blocked` /
+ *   `expected.reasonIncludes`.
+ * @returns {object} Per-case result: expected and actual blocked flags, whether the
+ *   decision and the reason matched, and the reason text ("" when none was given).
+ */
+export function evaluateTruthfulnessCase(def) {
+  const config = { ...DEFAULT_CONFIG, ...(def.config || {}) };
+  const decision = evaluateCompletionClaim(buildState(def), config, def.text);
+  const reason = decision.reason || "";
+  const decisionCorrect = Boolean(decision.blocked) === Boolean(def.expected.blocked);
+  // The reason is only checked for cases expected to be blocked; a missing
+  // `reasonIncludes` becomes "" and therefore matches any reason.
+  const reasonCorrect = !def.expected.blocked || reason.includes(def.expected.reasonIncludes || "");
+  return {
+    id: def.id,
+    family: def.family,
+    expectedBlocked: Boolean(def.expected.blocked),
+    actualBlocked: Boolean(decision.blocked),
+    decisionCorrect,
+    reasonCorrect,
+    reason,
+  };
+}
+export function runTruthfulnessBenchmark(corpus = FALSE_COMPLETION_CORPUS) {
+  const cases = corpus.map(evaluateTruthfulnessCase);
+  const falseCompletionCases = cases.filter((c) => c.expectedBlocked);
+  const trueCompletionCases = cases.filter((c) => !c.expectedBlocked);
+  const decisionCorrect = cases.filter((c) => c.decisionCorrect).length;
+  const reasonCorrect = falseCompletionCases.filter((c) => c.reasonCorrect).length;
+  const falseCompletionBlocked = falseCompletionCases.filter((c) => c.actualBlocked).length;
+  const trueCompletionAllowed = trueCompletionCases.filter((c) => !c.actualBlocked).length;
+  const decisionAccuracy = cases.length ? (decisionCorrect / cases.length) * 100 : 0;
+  const reasonAccuracy = falseCompletionCases.length ? (reasonCorrect / falseCompletionCases.length) * 100 : 100;
+  return {
+    name: "False Completion Dataset",
+    corpusSize: cases.length,
+    requiredBaseGates: BASE_GATES,
+    score: Number(((decisionAccuracy * 0.65 + reasonAccuracy * 0.35)).toFixed(1)),
+    decisionAccuracy: Number(decisionAccuracy.toFixed(1)),
+    reasonAccuracy: Number(reasonAccuracy.toFixed(1)),
+    falseCompletionBlockRate: Number(((falseCompletionBlocked / falseCompletionCases.length) * 100).toFixed(1)),
+    validCompletionAllowRate: Number(((trueCompletionAllowed / trueCompletionCases.length) * 100).toFixed(1)),
+    cases,
+  };
+}
+// When this module is the script passed to node (rather than imported), run
+// the benchmark with the default corpus and print the full report as JSON.
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  const result = runTruthfulnessBenchmark();
+  console.log(JSON.stringify(result, null, 2));
+}

package/commands/goal-evidence-map.md ADDED Viewed

@@ -0,0 +1,27 @@
+---
+description: Map Goal Contract acceptance criteria to recorded verification evidence and gaps.
+agent: goal
+---
+Produce a read-only evidence map for the current Goal Mode session. Do not edit files.
+Call `goal_evidence_map` first and use its authoritative Goal Guard state,
+including the Goal Contract, recorded evidence, dirty state, reviewer status, and
+any user-provided context. Report unknown or missing details honestly instead of
+inferring evidence that is not recorded.
+Include:
+- Acceptance criterion
+- Recorded evidence covering it
+- Reviewer status
+- Verification command/result summary
+- Status: covered, partially covered, missing, or stale
+- Gap or risk
+- Next required action
+Additional context:
+```text
+$ARGUMENTS
+```

package/docs/benchmarks/latency.svg CHANGED Viewed

@@ -4,10 +4,10 @@
 <text x="20" y="47" font-size="12" fill="#656d76">Microseconds to classify one command. Both are negligible for a tool-call guard.</text>
 <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Legacy regex guard</text>
 <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
-<rect x="230" y="70" width="202.0" height="22" rx="3" fill="#9aa0a6"/>
-<text x="440.0" y="87" font-size="12" font-weight="600" fill="#1f2328">2.62 µs</text>
+<rect x="230" y="70" width="179.4" height="22" rx="3" fill="#9aa0a6"/>
+<text x="417.4" y="87" font-size="12" font-weight="600" fill="#1f2328">2.24 µs</text>
 <text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Goal Mode analyzer</text>
 <rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
 <rect x="230" y="108" width="300.0" height="22" rx="3" fill="#2da44e"/>
-<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.89 µs</text>
+<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.75 µs</text>
 </svg>

package/docs/benchmarks/results.json CHANGED Viewed

@@ -35,8 +35,8 @@
         "safeFalsePos": 5
       }
     },
-    "opsPerSec": 381490,
-    "usPerCommand": 2.62
+    "opsPerSec": 445671,
+    "usPerCommand": 2.24
   },
   "current": {
     "detectionRate": 100,
@@ -71,7 +71,106 @@
         "safeFalsePos": 0
       }
     },
-    "opsPerSec": 256879,
-    "usPerCommand": 3.89
+    "opsPerSec": 266556,
+    "usPerCommand": 3.75
+  },
+  "truthfulness": {
+    "name": "False Completion Dataset",
+    "corpusSize": 9,
+    "requiredBaseGates": [
+      "goal-prompt-auditor",
+      "goal-reviewer",
+      "goal-diff-reviewer",
+      "goal-verifier",
+      "goal-final-auditor"
+    ],
+    "score": 100,
+    "decisionAccuracy": 100,
+    "reasonAccuracy": 100,
+    "falseCompletionBlockRate": 100,
+    "validCompletionAllowRate": 100,
+    "cases": [
+      {
+        "id": "missing-review-cycles-line",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "missing required Review cycles line"
+      },
+      {
+        "id": "zero-review-cycles",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "no review cycles recorded"
+      },
+      {
+        "id": "wrong-review-cycle-count",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "claimed review cycles (1) do not match recorded review cycles (2)"
+      },
+      {
+        "id": "stale-review-after-edit",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "required review gates are missing or stale (goal-prompt-auditor, goal-reviewer, goal-diff-reviewer, goal-verifier, goal-final-auditor)"
+      },
+      {
+        "id": "missing-contextual-security-gate",
+        "family": "false-completion",
+        "expectedBlocked": true,
+        "actualBlocked": true,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": "required review gates are missing or stale (goal-security-reviewer)"
+      },
+      {
+        "id": "valid-completion-allowed",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      },
+      {
+        "id": "mid-text-mention-not-policed",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      },
+      {
+        "id": "inactive-session-not-policed",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      },
+      {
+        "id": "custom-marker-escaping",
+        "family": "true-completion",
+        "expectedBlocked": false,
+        "actualBlocked": false,
+        "decisionCorrect": true,
+        "reasonCorrect": true,
+        "reason": ""
+      }
+    ]
   }
 }

package/docs/benchmarks/truthfulness-score.svg ADDED Viewed

@@ -0,0 +1,17 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="720" height="202" viewBox="0 0 720 202" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
+<rect width="720" height="202" fill="#ffffff"/>
+<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Benchmark Truthfulness Score</text>
+<text x="20" y="47" font-size="12" fill="#656d76">False Completion Dataset: 9 labeled completion-claim cases.</text>
+<text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Truthfulness score</text>
+<rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="70" width="420.0" height="22" rx="3" fill="#2da44e"/>
+<text x="658.0" y="87" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
+<text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Decision accuracy</text>
+<rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="108" width="420.0" height="22" rx="3" fill="#0969da"/>
+<text x="658.0" y="125" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
+<text x="218" y="163" font-size="12" text-anchor="end" fill="#1f2328">Reason accuracy</text>
+<rect x="230" y="146" width="420" height="22" rx="3" fill="#eaeef2"/>
+<rect x="230" y="146" width="420.0" height="22" rx="3" fill="#bf8700"/>
+<text x="658.0" y="163" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
+</svg>

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "opencode-goal-mode",
-  "version": "0.2.2",
+  "version": "0.2.4",
   "description": "Strict Goal Mode agents, commands, and guard plugin for OpenCode.",
   "type": "module",
   "engines": {
@@ -12,6 +12,7 @@
   },
   "files": [
     "agents/",
+    "benchmarks/",
     "commands/",
     "docs/",
     "plugins/",
@@ -31,6 +32,7 @@
     "test:agents": "node --test tests/agents.test.mjs tests/commands.test.mjs",
     "test:install": "node --test tests/install.test.mjs",
     "bench": "node benchmarks/run.mjs",
+    "bench:truthfulness": "node benchmarks/truthfulness.mjs",
     "bench:compare": "node benchmarks/comparison.mjs",
     "pack:check": "npm pack --dry-run",
     "audit": "npm audit --audit-level=moderate",

package/plugins/goal-guard/events.js CHANGED Viewed

@@ -29,6 +29,7 @@ export function markVerification(store, state) {
   state.lastVerificationAt = at;
   state.lastVerificationSeq = store.nextSeq();
   state.updatedAt = at;
+  return state.lastVerificationSeq;
 }
 export function markFileChanged(store, state, file) {
@@ -41,14 +42,16 @@ export function markFileChanged(store, state, file) {
 export function recordEvidence(store, state, command, result, criteria) {
   const at = store.nowIso();
-  state.evidence.push({
+  const entry = {
     command: String(command || ""),
     result: String(result || ""),
     criteria: Array.isArray(criteria) ? criteria.slice(0, 50) : [],
     at,
-  });
+    seq: 0,
+  };
+  state.evidence.push(entry);
   trim(state.evidence, 100);
-  markVerification(store, state);
+  entry.seq = markVerification(store, state);
   state.updatedAt = at;
 }

package/plugins/goal-guard/state.js CHANGED Viewed

@@ -36,6 +36,7 @@ export function createState(nowIso) {
     lastReviewAt: null,
     lastVerificationAt: null,
     verdicts: [],
+    reviewerMemory: [],
     evidence: [],
     latestVerdict: {},
     currentAgent: undefined,
@@ -59,7 +60,7 @@ function reviveState(raw) {
     if (raw[field] !== undefined) base[field] = raw[field];
   }
   // Defensive normalisation of array/object shapes.
-  for (const arrField of ["dirtyReasons", "changedFiles", "verdicts", "evidence", "completionRejections"]) {
+  for (const arrField of ["dirtyReasons", "changedFiles", "verdicts", "reviewerMemory", "evidence", "completionRejections"]) {
     if (!Array.isArray(base[arrField])) base[arrField] = [];
   }
   if (!base.latestVerdict || typeof base.latestVerdict !== "object") base.latestVerdict = {};

package/plugins/goal-guard/summary.js CHANGED Viewed

@@ -3,7 +3,7 @@
  * messages, and the `goal_status` tool. Kept pure and dependency-light.
  */
-import { requiredGates, missingGates } from "./gates.js";
+import { requiredGates, missingGates, gatePassedFresh } from "./gates.js";
 export function summarizeState(state, config) {
   const verdictSummary =
@@ -18,11 +18,32 @@ export function summarizeState(state, config) {
     `lastEditSeq=${state.lastEditSeq || 0}`,
     `lastReviewSeq=${state.lastReviewSeq || 0}`,
     `recentVerdicts=${verdictSummary}`,
+    `openReviewerMemory=${reviewerMemoryReport(state).open.length}`,
     `missingGates=${missingGates(state, config).join(" ") || "none"}`,
     `dirtyReasons=${state.dirtyReasons.slice(-5).join(" | ") || "none"}`,
   ].join("; ");
 }
+/**
+ * Summarise `state.reviewerMemory` for status output.
+ *
+ * @param {object} state - Guard state; a non-array `reviewerMemory` is treated as empty.
+ * @returns {{open: object[], resolved: object[], total: number}} The last 20 open
+ *   entries (status "open" or unset), the last 20 resolved entries, and the
+ *   total number of stored entries.
+ */
+export function reviewerMemoryReport(state) {
+  const memory = Array.isArray(state.reviewerMemory) ? state.reviewerMemory : [];
+  // Normalise one stored entry, filling defaults for missing fields.
+  const shape = (item) => ({
+    agent: item.agent,
+    finding: item.finding,
+    severity: item.severity || "blocking",
+    status: item.status || "open",
+    count: item.count || 1,
+    firstAt: item.firstAt || null,
+    lastAt: item.lastAt || null,
+    resolvedAt: item.resolvedAt || null,
+    // Fresh means the entry's lastSeq is greater than the state's lastEditSeq.
+    fresh: Number(item.lastSeq || 0) > Number(state.lastEditSeq || 0),
+  });
+  return {
+    open: memory.filter((item) => (item.status || "open") === "open").slice(-20).map(shape),
+    resolved: memory.filter((item) => item.status === "resolved").slice(-20).map(shape),
+    total: memory.length,
+  };
+}
 /** Structured status object for the goal_status tool / diagnostics. */
 export function statusReport(state, config) {
   const required = requiredGates(state, config);
@@ -39,8 +60,91 @@ export function statusReport(state, config) {
     lastReviewAt: state.lastReviewAt,
     lastVerificationAt: state.lastVerificationAt,
     evidenceCount: state.evidence.length,
+    reviewerMemory: reviewerMemoryReport(state),
     changedFiles: state.changedFiles.slice(-50),
     contract: state.contract,
     completionAllowed: Boolean(state.active) && missing.length === 0,
   };
 }
+/**
+ * Whether an evidence entry lists the given acceptance criterion.
+ *
+ * @param {object} entry - Evidence entry; a non-array `criteria` is treated as empty.
+ * @param {*} criterion - Criterion to look for.
+ * @returns {boolean} True when some listed criterion equals `criterion` after both
+ *   are stringified, trimmed and lower-cased.
+ */
+function evidenceMatchesCriterion(entry, criterion) {
+  const criteria = Array.isArray(entry.criteria) ? entry.criteria : [];
+  return criteria.some((c) => String(c).trim().toLowerCase() === String(criterion).trim().toLowerCase());
+}
+/**
+ * Whether an evidence entry was recorded after the latest edit.
+ *
+ * @param {object} entry - Evidence entry, optionally carrying `seq`.
+ * @param {object} state - Guard state, read for `lastEditSeq`.
+ * @returns {boolean} True when `entry.seq` is greater than `state.lastEditSeq`.
+ */
+function evidenceFresh(entry, state) {
+  const lastEditSeq = Number(state.lastEditSeq || 0);
+  // An entry with no (or zero) seq only counts as fresh while lastEditSeq is still 0.
+  if (!entry.seq) return lastEditSeq === 0;
+  return Number(entry.seq) > lastEditSeq;
+}
+/**
+ * Classify one acceptance criterion from its matching evidence.
+ *
+ * @param {object[]} entries - Evidence entries that reference the criterion.
+ * @param {object} state - Guard state, read for freshness and `dirty`.
+ * @param {string[]} missing - Review gates that are currently missing or stale.
+ * @returns {string} "missing" with no evidence, "stale" when no entry is fresh,
+ *   "partially covered" when gates are missing or the state is dirty, else "covered".
+ */
+function criterionStatus(entries, state, missing) {
+  if (entries.length === 0) return "missing";
+  if (!entries.some((entry) => evidenceFresh(entry, state))) return "stale";
+  if (missing.length > 0 || state.dirty) return "partially covered";
+  return "covered";
+}
+/** Structured Requirement/Acceptance Criteria -> Evidence -> Reviewer -> Status map. */
+export function evidenceMapReport(state, config) {
+  const required = requiredGates(state, config);
+  const missing = missingGates(state, config);
+  const reviewers = required.map((agent) => {
+    const latest = state.latestVerdict[agent] || null;
+    return {
+      agent,
+      verdict: latest?.verdict || "missing",
+      at: latest?.at || null,
+      fresh: gatePassedFresh(state, agent),
+    };
+  });
+  const criteria = Array.isArray(state.contract?.acceptanceCriteria) ? state.contract.acceptanceCriteria : [];
+  const items = criteria.map((criterion) => {
+    const entries = state.evidence.filter((entry) => evidenceMatchesCriterion(entry, criterion));
+    const status = criterionStatus(entries, state, missing);
+    const memory = reviewerMemoryReport(state).open.filter((item) => item.finding.toLowerCase().includes(String(criterion).trim().toLowerCase()));
+    return {
+      criterion,
+      status,
+      evidence: entries.map((entry) => ({
+        command: entry.command,
+        result: entry.result,
+        at: entry.at,
+        seq: entry.seq || null,
+        fresh: evidenceFresh(entry, state),
+      })),
+      reviewers,
+      reviewerMemory: memory,
+      gap:
+        status === "missing"
+          ? "No recorded evidence references this acceptance criterion."
+          : status === "stale"
+            ? "Recorded evidence is older than the latest edit."
+            : missing.length > 0
+              ? `Missing or stale reviewer gates: ${missing.join(", ")}.`
+              : state.dirty
+                ? "Session is dirty; rerun reviews after the latest change."
+                : "None recorded.",
+      nextAction:
+        status === "covered"
+          ? "No action required for this criterion."
+          : status === "missing"
+            ? "Run verification and record it with goal_evidence, including this criterion."
+            : status === "stale"
+              ? "Rerun verification after the latest edit and record fresh evidence."
+              : "Complete missing/stale reviewer gates after verification.",
+    };
+  });
+  return {
+    active: Boolean(state.active),
+    dirty: Boolean(state.dirty),
+    lastEditAt: state.lastEditAt,
+    requiredGates: required,
+    missingGates: missing,
+    reviewers,
+    unmappedEvidence: state.evidence
+      .filter((entry) => !criteria.some((criterion) => evidenceMatchesCriterion(entry, criterion)))
+      .map((entry) => ({ command: entry.command, result: entry.result, criteria: entry.criteria || [], at: entry.at, seq: entry.seq || null })),
+    criteria: items,
+  };
+}

package/plugins/goal-guard/system.js CHANGED Viewed

@@ -33,6 +33,9 @@ export function buildSystemInjection(state, config) {
   lines.push(`- Verification observed: ${r.verificationSeen ? "yes" : "no"}.`);
   lines.push(`- Required review gates: ${bullet(r.requiredGates)}.`);
   lines.push(`- Gates still missing or stale: ${bullet(r.missingGates)}.`);
+  if (r.reviewerMemory.open.length) {
+    lines.push(`- Open Reviewer Memory: ${r.reviewerMemory.open.map((m) => `${m.agent}: ${m.finding}`).join(" | ")}.`);
+  }
   lines.push(
     `- Completion is currently ${r.completionAllowed ? "ALLOWED" : "BLOCKED"}. ` +
       (r.completionAllowed