opencode-goal-mode 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,198 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Reproducible benchmark of the shell-command guard.
4
+ *
5
+ * Compares the original regex classifier (benchmarks/legacy-analyzer.mjs,
6
+ * preserved verbatim from the first release) against the current quote-aware
7
+ * analyzer (plugins/goal-guard/shell.js) on a labeled corpus, and measures
8
+ * detection rate, false-positive rate, and throughput. Writes results.json and
9
+ * the SVG charts embedded in the README.
10
+ *
11
+ * node benchmarks/run.mjs
12
+ */
13
+
14
+ import { writeFileSync, mkdirSync } from "node:fs";
15
+ import { join, dirname } from "node:path";
16
+ import { fileURLToPath } from "node:url";
17
+ import { performance } from "node:perf_hooks";
18
+ import { CORPUS } from "./corpus.mjs";
19
+ import * as current from "../plugins/goal-guard/shell.js";
20
+ import * as legacy from "./legacy-analyzer.mjs";
21
+ import { groupedBarChart, horizontalBarChart } from "./charts.mjs";
22
+ import { runTruthfulnessBenchmark } from "./truthfulness.mjs";
23
+
24
+ const root = fileURLToPath(new URL("..", import.meta.url));
25
+ const outDir = join(root, "docs", "benchmarks");
26
+ mkdirSync(outDir, { recursive: true });
27
+
28
/**
 * Decide whether an analyzer would stop a command.
 *
 * A command counts as "blocked" when the analysis object returned by
 * `analyzer.analyzeCommand(cmd)` has a truthy `destructive` or `networkExec`
 * field (destructive action or remote execution).
 *
 * @param {{ analyzeCommand: (cmd: string) => object }} analyzer - Module or object exposing `analyzeCommand`.
 * @param {string} cmd - Shell command text to classify.
 * @returns {boolean} `true` when either flag is truthy, otherwise `false`.
 */
function blocked(analyzer, cmd) {
  const analysis = analyzer.analyzeCommand(cmd);
  if (analysis.destructive) return true;
  return Boolean(analysis.networkExec);
}
33
+
34
/**
 * Score one analyzer against every entry of CORPUS.
 *
 * Entries labelled "destructive" feed the detection counters; every other
 * label is treated as safe and feeds the false-positive counters. The same
 * four counters are kept overall and per `family`.
 *
 * @param {{ analyzeCommand: (cmd: string) => object }} analyzer - Analyzer under test.
 * @returns {{
 *   detectionRate: number, falsePositiveRate: number,
 *   destCaught: number, destTotal: number,
 *   safeFalsePos: number, safeTotal: number,
 *   families: Record<string, { destTotal: number, destCaught: number, safeTotal: number, safeFalsePos: number }>
 * }} Rates are percentages (0-100); a rate is 0 when its denominator is 0.
 */
function evaluate(analyzer) {
  const newTally = () => ({ destTotal: 0, destCaught: 0, safeTotal: 0, safeFalsePos: 0 });
  const overall = newTally();
  const families = {};

  for (const entry of CORPUS) {
    const bucket = (families[entry.family] ??= newTally());
    const hit = blocked(analyzer, entry.cmd);
    // Pick which pair of counters this entry contributes to.
    const [totalKey, hitKey] =
      entry.label === "destructive" ? ["destTotal", "destCaught"] : ["safeTotal", "safeFalsePos"];
    for (const counters of [overall, bucket]) {
      counters[totalKey] += 1;
      if (hit) counters[hitKey] += 1;
    }
  }

  const percentage = (part, whole) => (whole ? (part / whole) * 100 : 0);
  return {
    detectionRate: percentage(overall.destCaught, overall.destTotal),
    falsePositiveRate: percentage(overall.safeFalsePos, overall.safeTotal),
    destCaught: overall.destCaught,
    destTotal: overall.destTotal,
    safeFalsePos: overall.safeFalsePos,
    safeTotal: overall.safeTotal,
    families,
  };
}
71
+
72
/**
 * Measure how many commands per second an analyzer classifies.
 *
 * Runs one untimed warm-up pass over every command in CORPUS, then times
 * `iterations` further passes with `performance.now()`.
 *
 * @param {{ analyzeCommand: (cmd: string) => unknown }} analyzer - Analyzer under test.
 * @param {number} [iterations=4000] - Number of timed passes over the corpus.
 * @returns {number} Operations per second rounded to an integer. Returns 0
 *   when the figure is not measurable (no timed work, or a zero elapsed time),
 *   so callers never receive NaN or Infinity, which JSON would store as null.
 */
function throughput(analyzer, iterations = 4000) {
  const cmds = CORPUS.map((c) => c.cmd);
  // Warm up.
  for (const c of cmds) analyzer.analyzeCommand(c);
  const start = performance.now();
  for (let i = 0; i < iterations; i += 1) {
    for (const c of cmds) analyzer.analyzeCommand(c);
  }
  const ms = performance.now() - start;
  const ops = (iterations * cmds.length) / (ms / 1000);
  // 0/0 yields NaN and x/0 yields Infinity; neither is a usable measurement.
  return Number.isFinite(ops) ? Math.round(ops) : 0;
}
85
+
86
/**
 * Render a number as a rounded integer with "," between each group of three
 * digits. Grouping is done by hand so the output does not depend on the host
 * locale (which may use '.' as the separator).
 *
 * @param {number} n - Value to format; it is rounded with `Math.round` first.
 * @returns {string} For example `1234567.4` becomes `"1,234,567"`.
 */
function fmt(n) {
  // Matches each position inside a digit run that is followed by a whole number of 3-digit groups.
  const groupBoundary = /\B(?=(\d{3})+(?!\d))/g;
  const digits = String(Math.round(n));
  return digits.replace(groupBoundary, ",");
}
92
+
93
+ const legacyEval = evaluate(legacy);
94
+ const currentEval = evaluate(current);
95
+ const truthfulness = runTruthfulnessBenchmark();
96
+ const legacyOps = throughput(legacy);
97
+ const currentOps = throughput(current);
98
+ const legacyUs = 1e6 / legacyOps;
99
+ const currentUs = 1e6 / currentOps;
100
+
101
+ const FAMILY_LABELS = {
102
+ classic: "Classic",
103
+ bypass: "Obfuscated",
104
+ "remote-exec": "Remote exec",
105
+ };
106
+ const detFamilies = ["classic", "bypass", "remote-exec"];
107
+
108
/**
 * Detection percentage for one command family of an evaluation result.
 *
 * @param {{ families: Record<string, { destTotal: number, destCaught: number }> }} ev - Result with per-family counters.
 * @param {string} fam - Family key to look up in `ev.families`.
 * @returns {number} `destCaught / destTotal` as a percentage, or 0 when the
 *   family is absent or has no destructive commands.
 */
function familyRate(ev, fam) {
  const stats = ev.families[fam];
  if (!stats || !stats.destTotal) return 0;
  return (stats.destCaught / stats.destTotal) * 100;
}
112
+
113
+ const results = {
114
+ corpusSize: CORPUS.length,
115
+ destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
116
+ safeCount: CORPUS.filter((c) => c.label === "safe").length,
117
+ legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
118
+ current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
119
+ truthfulness,
120
+ };
121
+
122
+ writeFileSync(join(outDir, "results.json"), JSON.stringify(results, null, 2));
123
+
124
+ // Chart 1: detection rate by command family.
125
+ writeFileSync(
126
+ join(outDir, "detection-by-family.svg"),
127
+ groupedBarChart({
128
+ title: "Destructive-command detection rate by family",
129
+ subtitle: `Higher is better. Corpus: ${results.destructiveCount} destructive commands.`,
130
+ groups: detFamilies.map((f) => FAMILY_LABELS[f]),
131
+ series: [
132
+ { name: "Legacy regex guard", color: "#9aa0a6", values: detFamilies.map((f) => familyRate(legacyEval, f)) },
133
+ { name: "Goal Mode analyzer", color: "#2da44e", values: detFamilies.map((f) => familyRate(currentEval, f)) },
134
+ ],
135
+ }),
136
+ );
137
+
138
+ // Chart 2: overall scorecard (detection up, false positives down).
139
+ writeFileSync(
140
+ join(outDir, "overall-scorecard.svg"),
141
+ groupedBarChart({
142
+ title: "Overall guard accuracy",
143
+ subtitle: "Detection rate (higher better) vs false-positive rate (lower better).",
144
+ groups: ["Detection rate", "False-positive rate"],
145
+ series: [
146
+ { name: "Legacy regex guard", color: "#9aa0a6", values: [legacyEval.detectionRate, legacyEval.falsePositiveRate] },
147
+ { name: "Goal Mode analyzer", color: "#2da44e", values: [currentEval.detectionRate, currentEval.falsePositiveRate] },
148
+ ],
149
+ }),
150
+ );
151
+
152
+ // Chart 3: per-command latency — the deeper analysis costs a few microseconds,
153
+ // which is negligible for a tool-call guard. Shown for honesty, not as a "win".
154
+ writeFileSync(
155
+ join(outDir, "latency.svg"),
156
+ horizontalBarChart({
157
+ title: "Per-command analysis latency",
158
+ subtitle: "Microseconds to classify one command. Both are negligible for a tool-call guard.",
159
+ unit: " µs",
160
+ max: Math.max(legacyUs, currentUs) * 1.4,
161
+ rows: [
162
+ { label: "Legacy regex guard", value: legacyUs, display: `${legacyUs.toFixed(2)} µs`, color: "#9aa0a6" },
163
+ { label: "Goal Mode analyzer", value: currentUs, display: `${currentUs.toFixed(2)} µs`, color: "#2da44e" },
164
+ ],
165
+ }),
166
+ );
167
+
168
+ writeFileSync(
169
+ join(outDir, "truthfulness-score.svg"),
170
+ horizontalBarChart({
171
+ title: "Benchmark Truthfulness Score",
172
+ subtitle: `False Completion Dataset: ${truthfulness.corpusSize} labeled completion-claim cases.`,
173
+ unit: "%",
174
+ max: 100,
175
+ rows: [
176
+ { label: "Truthfulness score", value: truthfulness.score, display: `${truthfulness.score.toFixed(1)}%`, color: "#2da44e" },
177
+ { label: "Decision accuracy", value: truthfulness.decisionAccuracy, display: `${truthfulness.decisionAccuracy.toFixed(1)}%`, color: "#0969da" },
178
+ { label: "Reason accuracy", value: truthfulness.reasonAccuracy, display: `${truthfulness.reasonAccuracy.toFixed(1)}%`, color: "#bf8700" },
179
+ ],
180
+ }),
181
+ );
182
+
183
+ const pct = (n) => `${n.toFixed(1)}%`;
184
+ console.log("Goal Mode shell-guard benchmark");
185
+ console.log("================================");
186
+ console.log(`Corpus: ${results.corpusSize} commands (${results.destructiveCount} destructive, ${results.safeCount} safe)`);
187
+ console.log("");
188
+ console.log(`Detection rate legacy ${pct(legacyEval.detectionRate)} → Goal Mode ${pct(currentEval.detectionRate)}`);
189
+ console.log(`False positives legacy ${pct(legacyEval.falsePositiveRate)} → Goal Mode ${pct(currentEval.falsePositiveRate)}`);
190
+ console.log(`Latency legacy ${legacyUs.toFixed(2)} µs/cmd → Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
191
+ console.log(`Truthfulness False Completion Dataset score ${truthfulness.score.toFixed(1)}% (${truthfulness.corpusSize} cases)`);
192
+ console.log("");
193
+ console.log("By family (detection rate):");
194
+ for (const f of detFamilies) {
195
+ console.log(` ${FAMILY_LABELS[f].padEnd(12)} legacy ${pct(familyRate(legacyEval, f)).padStart(6)} → Goal Mode ${pct(familyRate(currentEval, f)).padStart(6)}`);
196
+ }
197
+ console.log("");
198
+ console.log(`Wrote results.json + 4 SVG charts to docs/benchmarks/`);
@@ -0,0 +1,64 @@
1
+ import { BASE_GATES } from "../plugins/goal-guard/agents.js";
2
+ import { DEFAULT_CONFIG } from "../plugins/goal-guard/config.js";
3
+ import { evaluateCompletionClaim } from "../plugins/goal-guard/completion.js";
4
+ import { refreshStickyGates } from "../plugins/goal-guard/gates.js";
5
+ import { createState } from "../plugins/goal-guard/state.js";
6
+ import { FALSE_COMPLETION_CORPUS } from "./completion-corpus.mjs";
7
+ import { fileURLToPath } from "node:url";
8
+
9
/**
 * Build the guard state for one corpus case.
 *
 * Starts from `createState` with a fixed timestamp, copies every field of
 * `def.state` onto it, rebuilds `latestVerdict` from `def.state.verdicts`
 * (one entry per agent, later entries overwrite earlier ones), falls back to
 * `def.state.contextualGate` as `goalText` when no goal text is set, and then
 * calls `refreshStickyGates` on the result.
 *
 * @param {{ state?: object }} def - Corpus case definition.
 * @returns {object} The prepared state object.
 */
function buildState(def) {
  const overrides = def.state || {};
  const state = createState("2026-01-01T00:00:00.000Z");
  Object.assign(state, overrides);

  // Verdicts all share one fixed timestamp, one second after state creation.
  const latest = {};
  for (const { agent, verdict, seq } of overrides.verdicts || []) {
    latest[agent] = { verdict, at: "2026-01-01T00:00:01.000Z", seq };
  }
  state.latestVerdict = latest;

  const { contextualGate } = overrides;
  if (contextualGate && !state.goalText) state.goalText = contextualGate;

  refreshStickyGates(state);
  return state;
}
20
+
21
/**
 * Run one labelled completion-claim case through `evaluateCompletionClaim`
 * and compare the outcome with the case's expectation.
 *
 * The config is `DEFAULT_CONFIG` overlaid with `def.config`. The reason text
 * is only checked for cases that expect a block; cases that expect the claim
 * to be allowed always report `reasonCorrect: true`.
 *
 * @param {{ id: string, family: string, text: string, config?: object, state?: object,
 *   expected: { blocked?: boolean, reasonIncludes?: string } }} def - Corpus case.
 * @returns {{ id: string, family: string, expectedBlocked: boolean, actualBlocked: boolean,
 *   decisionCorrect: boolean, reasonCorrect: boolean, reason: string }} Per-case result.
 */
export function evaluateTruthfulnessCase(def) {
  const overrides = def.config || {};
  const config = { ...DEFAULT_CONFIG, ...overrides };
  const decision = evaluateCompletionClaim(buildState(def), config, def.text);
  const reason = decision.reason || "";

  const expectedBlocked = Boolean(def.expected.blocked);
  const actualBlocked = Boolean(decision.blocked);

  let reasonCorrect = true;
  if (expectedBlocked) {
    reasonCorrect = reason.includes(def.expected.reasonIncludes || "");
  }

  return {
    id: def.id,
    family: def.family,
    expectedBlocked,
    actualBlocked,
    decisionCorrect: actualBlocked === expectedBlocked,
    reasonCorrect,
    reason,
  };
}
37
+
38
/**
 * Evaluate every case of a completion-claim corpus and aggregate the results.
 *
 * - `decisionAccuracy`: share of all cases whose block/allow decision matched.
 * - `reasonAccuracy`: share of expected-blocked cases whose reason matched.
 * - `score`: `decisionAccuracy * 0.65 + reasonAccuracy * 0.35`.
 * - `falseCompletionBlockRate`: share of expected-blocked cases actually blocked.
 * - `validCompletionAllowRate`: share of expected-allowed cases actually allowed.
 *
 * All values are percentages rounded to one decimal. Every ratio has a
 * defined value for an empty denominator, so the result never contains NaN
 * (which `JSON.stringify` would write as `null`): `decisionAccuracy` is 0 for
 * an empty corpus, and the three subset ratios are 100 when their subset is
 * empty, because there was nothing to get wrong.
 *
 * @param {Array<object>} [corpus=FALSE_COMPLETION_CORPUS] - Case definitions
 *   accepted by `evaluateTruthfulnessCase`.
 * @returns {{ name: string, corpusSize: number, requiredBaseGates: unknown, score: number,
 *   decisionAccuracy: number, reasonAccuracy: number, falseCompletionBlockRate: number,
 *   validCompletionAllowRate: number, cases: Array<object> }} Aggregated benchmark result.
 */
export function runTruthfulnessBenchmark(corpus = FALSE_COMPLETION_CORPUS) {
  const percent = (part, whole, whenEmpty) => (whole ? (part / whole) * 100 : whenEmpty);
  const round1 = (value) => Number(value.toFixed(1));

  const cases = corpus.map((def) => evaluateTruthfulnessCase(def));
  const falseCompletionCases = cases.filter((c) => c.expectedBlocked);
  const trueCompletionCases = cases.filter((c) => !c.expectedBlocked);

  const decisionCorrect = cases.filter((c) => c.decisionCorrect).length;
  const reasonCorrect = falseCompletionCases.filter((c) => c.reasonCorrect).length;
  const falseCompletionBlocked = falseCompletionCases.filter((c) => c.actualBlocked).length;
  const trueCompletionAllowed = trueCompletionCases.filter((c) => !c.actualBlocked).length;

  const decisionAccuracy = percent(decisionCorrect, cases.length, 0);
  const reasonAccuracy = percent(reasonCorrect, falseCompletionCases.length, 100);

  return {
    name: "False Completion Dataset",
    corpusSize: cases.length,
    requiredBaseGates: BASE_GATES,
    score: round1(decisionAccuracy * 0.65 + reasonAccuracy * 0.35),
    decisionAccuracy: round1(decisionAccuracy),
    reasonAccuracy: round1(reasonAccuracy),
    // Same empty-subset convention as reasonAccuracy; previously these divided by zero.
    falseCompletionBlockRate: round1(percent(falseCompletionBlocked, falseCompletionCases.length, 100)),
    validCompletionAllowRate: round1(percent(trueCompletionAllowed, trueCompletionCases.length, 100)),
    cases,
  };
}
60
+
61
+ if (process.argv[1] === fileURLToPath(import.meta.url)) {
62
+ const result = runTruthfulnessBenchmark();
63
+ console.log(JSON.stringify(result, null, 2));
64
+ }
@@ -0,0 +1,27 @@
1
+ ---
2
+ description: Map Goal Contract acceptance criteria to recorded verification evidence and gaps.
3
+ agent: goal
4
+ ---
5
+
6
+ Produce a read-only evidence map for the current Goal Mode session. Do not edit files.
7
+
8
+ Call `goal_evidence_map` first and use its authoritative Goal Guard state,
9
+ including the Goal Contract, recorded evidence, dirty state, reviewer status, and
10
+ any user-provided context. Report unknown or missing details honestly instead of
11
+ inferring evidence that is not recorded.
12
+
13
+ Include:
14
+
15
+ - Acceptance criterion
16
+ - Recorded evidence covering it
17
+ - Reviewer status
18
+ - Verification command/result summary
19
+ - Status: covered, partially covered, missing, or stale
20
+ - Gap or risk
21
+ - Next required action
22
+
23
+ Additional context:
24
+
25
+ ```text
26
+ $ARGUMENTS
27
+ ```
@@ -4,10 +4,10 @@
4
4
  <text x="20" y="47" font-size="12" fill="#656d76">Microseconds to classify one command. Both are negligible for a tool-call guard.</text>
5
5
  <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Legacy regex guard</text>
6
6
  <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
7
- <rect x="230" y="70" width="202.0" height="22" rx="3" fill="#9aa0a6"/>
8
- <text x="440.0" y="87" font-size="12" font-weight="600" fill="#1f2328">2.62 µs</text>
7
+ <rect x="230" y="70" width="179.4" height="22" rx="3" fill="#9aa0a6"/>
8
+ <text x="417.4" y="87" font-size="12" font-weight="600" fill="#1f2328">2.24 µs</text>
9
9
  <text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Goal Mode analyzer</text>
10
10
  <rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
11
11
  <rect x="230" y="108" width="300.0" height="22" rx="3" fill="#2da44e"/>
12
- <text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.89 µs</text>
12
+ <text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">3.75 µs</text>
13
13
  </svg>
@@ -35,8 +35,8 @@
35
35
  "safeFalsePos": 5
36
36
  }
37
37
  },
38
- "opsPerSec": 381490,
39
- "usPerCommand": 2.62
38
+ "opsPerSec": 445671,
39
+ "usPerCommand": 2.24
40
40
  },
41
41
  "current": {
42
42
  "detectionRate": 100,
@@ -71,7 +71,106 @@
71
71
  "safeFalsePos": 0
72
72
  }
73
73
  },
74
- "opsPerSec": 256879,
75
- "usPerCommand": 3.89
74
+ "opsPerSec": 266556,
75
+ "usPerCommand": 3.75
76
+ },
77
+ "truthfulness": {
78
+ "name": "False Completion Dataset",
79
+ "corpusSize": 9,
80
+ "requiredBaseGates": [
81
+ "goal-prompt-auditor",
82
+ "goal-reviewer",
83
+ "goal-diff-reviewer",
84
+ "goal-verifier",
85
+ "goal-final-auditor"
86
+ ],
87
+ "score": 100,
88
+ "decisionAccuracy": 100,
89
+ "reasonAccuracy": 100,
90
+ "falseCompletionBlockRate": 100,
91
+ "validCompletionAllowRate": 100,
92
+ "cases": [
93
+ {
94
+ "id": "missing-review-cycles-line",
95
+ "family": "false-completion",
96
+ "expectedBlocked": true,
97
+ "actualBlocked": true,
98
+ "decisionCorrect": true,
99
+ "reasonCorrect": true,
100
+ "reason": "missing required Review cycles line"
101
+ },
102
+ {
103
+ "id": "zero-review-cycles",
104
+ "family": "false-completion",
105
+ "expectedBlocked": true,
106
+ "actualBlocked": true,
107
+ "decisionCorrect": true,
108
+ "reasonCorrect": true,
109
+ "reason": "no review cycles recorded"
110
+ },
111
+ {
112
+ "id": "wrong-review-cycle-count",
113
+ "family": "false-completion",
114
+ "expectedBlocked": true,
115
+ "actualBlocked": true,
116
+ "decisionCorrect": true,
117
+ "reasonCorrect": true,
118
+ "reason": "claimed review cycles (1) do not match recorded review cycles (2)"
119
+ },
120
+ {
121
+ "id": "stale-review-after-edit",
122
+ "family": "false-completion",
123
+ "expectedBlocked": true,
124
+ "actualBlocked": true,
125
+ "decisionCorrect": true,
126
+ "reasonCorrect": true,
127
+ "reason": "required review gates are missing or stale (goal-prompt-auditor, goal-reviewer, goal-diff-reviewer, goal-verifier, goal-final-auditor)"
128
+ },
129
+ {
130
+ "id": "missing-contextual-security-gate",
131
+ "family": "false-completion",
132
+ "expectedBlocked": true,
133
+ "actualBlocked": true,
134
+ "decisionCorrect": true,
135
+ "reasonCorrect": true,
136
+ "reason": "required review gates are missing or stale (goal-security-reviewer)"
137
+ },
138
+ {
139
+ "id": "valid-completion-allowed",
140
+ "family": "true-completion",
141
+ "expectedBlocked": false,
142
+ "actualBlocked": false,
143
+ "decisionCorrect": true,
144
+ "reasonCorrect": true,
145
+ "reason": ""
146
+ },
147
+ {
148
+ "id": "mid-text-mention-not-policed",
149
+ "family": "true-completion",
150
+ "expectedBlocked": false,
151
+ "actualBlocked": false,
152
+ "decisionCorrect": true,
153
+ "reasonCorrect": true,
154
+ "reason": ""
155
+ },
156
+ {
157
+ "id": "inactive-session-not-policed",
158
+ "family": "true-completion",
159
+ "expectedBlocked": false,
160
+ "actualBlocked": false,
161
+ "decisionCorrect": true,
162
+ "reasonCorrect": true,
163
+ "reason": ""
164
+ },
165
+ {
166
+ "id": "custom-marker-escaping",
167
+ "family": "true-completion",
168
+ "expectedBlocked": false,
169
+ "actualBlocked": false,
170
+ "decisionCorrect": true,
171
+ "reasonCorrect": true,
172
+ "reason": ""
173
+ }
174
+ ]
76
175
  }
77
176
  }
@@ -0,0 +1,17 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="720" height="202" viewBox="0 0 720 202" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
2
+ <rect width="720" height="202" fill="#ffffff"/>
3
+ <text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Benchmark Truthfulness Score</text>
4
+ <text x="20" y="47" font-size="12" fill="#656d76">False Completion Dataset: 9 labeled completion-claim cases.</text>
5
+ <text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Truthfulness score</text>
6
+ <rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
7
+ <rect x="230" y="70" width="420.0" height="22" rx="3" fill="#2da44e"/>
8
+ <text x="658.0" y="87" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
9
+ <text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Decision accuracy</text>
10
+ <rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
11
+ <rect x="230" y="108" width="420.0" height="22" rx="3" fill="#0969da"/>
12
+ <text x="658.0" y="125" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
13
+ <text x="218" y="163" font-size="12" text-anchor="end" fill="#1f2328">Reason accuracy</text>
14
+ <rect x="230" y="146" width="420" height="22" rx="3" fill="#eaeef2"/>
15
+ <rect x="230" y="146" width="420.0" height="22" rx="3" fill="#bf8700"/>
16
+ <text x="658.0" y="163" font-size="12" font-weight="600" fill="#1f2328">100.0%</text>
17
+ </svg>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-goal-mode",
3
- "version": "0.2.2",
3
+ "version": "0.2.4",
4
4
  "description": "Strict Goal Mode agents, commands, and guard plugin for OpenCode.",
5
5
  "type": "module",
6
6
  "engines": {
@@ -12,6 +12,7 @@
12
12
  },
13
13
  "files": [
14
14
  "agents/",
15
+ "benchmarks/",
15
16
  "commands/",
16
17
  "docs/",
17
18
  "plugins/",
@@ -31,6 +32,7 @@
31
32
  "test:agents": "node --test tests/agents.test.mjs tests/commands.test.mjs",
32
33
  "test:install": "node --test tests/install.test.mjs",
33
34
  "bench": "node benchmarks/run.mjs",
35
+ "bench:truthfulness": "node benchmarks/truthfulness.mjs",
34
36
  "bench:compare": "node benchmarks/comparison.mjs",
35
37
  "pack:check": "npm pack --dry-run",
36
38
  "audit": "npm audit --audit-level=moderate",
@@ -29,6 +29,7 @@ export function markVerification(store, state) {
29
29
  state.lastVerificationAt = at;
30
30
  state.lastVerificationSeq = store.nextSeq();
31
31
  state.updatedAt = at;
32
+ return state.lastVerificationSeq;
32
33
  }
33
34
 
34
35
  export function markFileChanged(store, state, file) {
@@ -41,14 +42,16 @@ export function markFileChanged(store, state, file) {
41
42
 
42
43
  export function recordEvidence(store, state, command, result, criteria) {
43
44
  const at = store.nowIso();
44
- state.evidence.push({
45
+ const entry = {
45
46
  command: String(command || ""),
46
47
  result: String(result || ""),
47
48
  criteria: Array.isArray(criteria) ? criteria.slice(0, 50) : [],
48
49
  at,
49
- });
50
+ seq: 0,
51
+ };
52
+ state.evidence.push(entry);
50
53
  trim(state.evidence, 100);
51
- markVerification(store, state);
54
+ entry.seq = markVerification(store, state);
52
55
  state.updatedAt = at;
53
56
  }
54
57
 
@@ -36,6 +36,7 @@ export function createState(nowIso) {
36
36
  lastReviewAt: null,
37
37
  lastVerificationAt: null,
38
38
  verdicts: [],
39
+ reviewerMemory: [],
39
40
  evidence: [],
40
41
  latestVerdict: {},
41
42
  currentAgent: undefined,
@@ -59,7 +60,7 @@ function reviveState(raw) {
59
60
  if (raw[field] !== undefined) base[field] = raw[field];
60
61
  }
61
62
  // Defensive normalisation of array/object shapes.
62
- for (const arrField of ["dirtyReasons", "changedFiles", "verdicts", "evidence", "completionRejections"]) {
63
+ for (const arrField of ["dirtyReasons", "changedFiles", "verdicts", "reviewerMemory", "evidence", "completionRejections"]) {
63
64
  if (!Array.isArray(base[arrField])) base[arrField] = [];
64
65
  }
65
66
  if (!base.latestVerdict || typeof base.latestVerdict !== "object") base.latestVerdict = {};
@@ -3,7 +3,7 @@
3
3
  * messages, and the `goal_status` tool. Kept pure and dependency-light.
4
4
  */
5
5
 
6
- import { requiredGates, missingGates } from "./gates.js";
6
+ import { requiredGates, missingGates, gatePassedFresh } from "./gates.js";
7
7
 
8
8
  export function summarizeState(state, config) {
9
9
  const verdictSummary =
@@ -18,11 +18,32 @@ export function summarizeState(state, config) {
18
18
  `lastEditSeq=${state.lastEditSeq || 0}`,
19
19
  `lastReviewSeq=${state.lastReviewSeq || 0}`,
20
20
  `recentVerdicts=${verdictSummary}`,
21
+ `openReviewerMemory=${reviewerMemoryReport(state).open.length}`,
21
22
  `missingGates=${missingGates(state, config).join(" ") || "none"}`,
22
23
  `dirtyReasons=${state.dirtyReasons.slice(-5).join(" | ") || "none"}`,
23
24
  ].join("; ");
24
25
  }
25
26
 
27
/**
 * Summarise `state.reviewerMemory` for status output.
 *
 * Items without a `status` count as "open". At most the last 20 open and the
 * last 20 resolved items are returned, each normalised to a fixed shape with
 * defaults filled in. An item is `fresh` when its `lastSeq` is greater than
 * `state.lastEditSeq` (both default to 0).
 *
 * @param {{ reviewerMemory?: Array<object>, lastEditSeq?: number }} state - Session state.
 * @returns {{ open: Array<object>, resolved: Array<object>, total: number }}
 *   `total` counts every memory item, whatever its status; a non-array
 *   `reviewerMemory` is treated as empty.
 */
export function reviewerMemoryReport(state) {
  const memory = Array.isArray(state.reviewerMemory) ? state.reviewerMemory : [];
  const statusOf = (item) => item.status || "open";

  const describe = (item) => ({
    agent: item.agent,
    finding: item.finding,
    severity: item.severity || "blocking",
    status: statusOf(item),
    count: item.count || 1,
    firstAt: item.firstAt || null,
    lastAt: item.lastAt || null,
    resolvedAt: item.resolvedAt || null,
    fresh: Number(item.lastSeq || 0) > Number(state.lastEditSeq || 0),
  });

  const lastTwenty = (wanted) =>
    memory
      .filter((item) => statusOf(item) === wanted)
      .slice(-20)
      .map((item) => describe(item));

  return {
    open: lastTwenty("open"),
    resolved: lastTwenty("resolved"),
    total: memory.length,
  };
}
46
+
26
47
  /** Structured status object for the goal_status tool / diagnostics. */
27
48
  export function statusReport(state, config) {
28
49
  const required = requiredGates(state, config);
@@ -39,8 +60,91 @@ export function statusReport(state, config) {
39
60
  lastReviewAt: state.lastReviewAt,
40
61
  lastVerificationAt: state.lastVerificationAt,
41
62
  evidenceCount: state.evidence.length,
63
+ reviewerMemory: reviewerMemoryReport(state),
42
64
  changedFiles: state.changedFiles.slice(-50),
43
65
  contract: state.contract,
44
66
  completionAllowed: Boolean(state.active) && missing.length === 0,
45
67
  };
46
68
  }
69
+
70
/**
 * Check whether an evidence entry names a given acceptance criterion.
 *
 * Both sides are stringified, trimmed and lower-cased before comparing, so
 * the match ignores case and surrounding whitespace but is otherwise exact.
 *
 * @param {{ criteria?: unknown }} entry - Evidence entry; a non-array `criteria` never matches.
 * @param {unknown} criterion - Acceptance criterion to look for.
 * @returns {boolean} `true` when any listed criterion equals `criterion` after normalisation.
 */
function evidenceMatchesCriterion(entry, criterion) {
  const { criteria } = entry;
  if (!Array.isArray(criteria)) return false;
  const fold = (value) => String(value).trim().toLowerCase();
  return criteria.some((candidate) => fold(candidate) === fold(criterion));
}
74
+
75
/**
 * Tell whether an evidence entry was recorded after the latest edit.
 *
 * Entries with a sequence number are fresh when that number is greater than
 * `state.lastEditSeq`. Entries without one (falsy `seq`) are only fresh while
 * no edit has been recorded at all.
 *
 * @param {{ seq?: number }} entry - Evidence entry.
 * @param {{ lastEditSeq?: number }} state - Session state; a missing `lastEditSeq` counts as 0.
 * @returns {boolean} Whether the entry is newer than the last edit.
 */
function evidenceFresh(entry, state) {
  const editSeq = Number(state.lastEditSeq || 0);
  return entry.seq ? Number(entry.seq) > editSeq : editSeq === 0;
}
80
+
81
/**
 * Classify how well one acceptance criterion is covered by evidence.
 *
 * Checked in order: no evidence gives "missing"; no fresh evidence gives
 * "stale"; fresh evidence while gates are missing or the session is dirty
 * gives "partially covered"; anything else is "covered".
 *
 * @param {Array<object>} entries - Evidence entries that reference the criterion.
 * @param {{ dirty?: boolean }} state - Session state, also passed to `evidenceFresh`.
 * @param {Array<string>} missing - Reviewer gates that are missing or stale.
 * @returns {"missing" | "stale" | "partially covered" | "covered"} Coverage status.
 */
function criterionStatus(entries, state, missing) {
  if (entries.length === 0) return "missing";

  const allStale = entries.every((entry) => !evidenceFresh(entry, state));
  if (allStale) return "stale";

  const blockedByReview = missing.length > 0 || state.dirty;
  return blockedByReview ? "partially covered" : "covered";
}
87
+
88
/**
 * Structured Requirement/Acceptance Criteria -> Evidence -> Reviewer -> Status map.
 *
 * For every entry of `state.contract.acceptanceCriteria` the report lists the
 * matching evidence, a coverage status, the reviewer gate summary, open
 * reviewer-memory findings whose text mentions the criterion, the current gap
 * and the next action. Evidence that matches no criterion is returned
 * separately as `unmappedEvidence`.
 *
 * Reviewer-memory matching is a case-insensitive substring test. A criterion
 * that is blank after trimming matches no findings (an empty needle would
 * otherwise match all of them), and findings without text are skipped rather
 * than raising a TypeError.
 *
 * @param {object} state - Session state; reads `latestVerdict`, `contract`,
 *   `evidence`, `active`, `dirty` and `lastEditAt`.
 * @param {object} config - Guard configuration, forwarded to `requiredGates` and `missingGates`.
 * @returns {{ active: boolean, dirty: boolean, lastEditAt: unknown, requiredGates: Array<string>,
 *   missingGates: Array<string>, reviewers: Array<object>, unmappedEvidence: Array<object>,
 *   criteria: Array<object> }} The evidence map.
 */
export function evidenceMapReport(state, config) {
  const required = requiredGates(state, config);
  const missing = missingGates(state, config);
  const reviewers = required.map((agent) => {
    const latest = state.latestVerdict[agent] || null;
    return {
      agent,
      verdict: latest?.verdict || "missing",
      at: latest?.at || null,
      fresh: gatePassedFresh(state, agent),
    };
  });
  const criteria = Array.isArray(state.contract?.acceptanceCriteria) ? state.contract.acceptanceCriteria : [];
  // The open findings do not depend on the criterion, so build the list once.
  const openMemory = reviewerMemoryReport(state).open;
  const items = criteria.map((criterion) => {
    const entries = state.evidence.filter((entry) => evidenceMatchesCriterion(entry, criterion));
    const status = criterionStatus(entries, state, missing);
    const needle = String(criterion).trim().toLowerCase();
    const memory = needle
      ? openMemory.filter((item) => String(item.finding ?? "").toLowerCase().includes(needle))
      : [];
    return {
      criterion,
      status,
      evidence: entries.map((entry) => ({
        command: entry.command,
        result: entry.result,
        at: entry.at,
        seq: entry.seq || null,
        fresh: evidenceFresh(entry, state),
      })),
      reviewers,
      reviewerMemory: memory,
      gap: describeGap(status, missing, state.dirty),
      nextAction: describeNextAction(status),
    };
  });
  return {
    active: Boolean(state.active),
    dirty: Boolean(state.dirty),
    lastEditAt: state.lastEditAt,
    requiredGates: required,
    missingGates: missing,
    reviewers,
    unmappedEvidence: state.evidence
      .filter((entry) => !criteria.some((criterion) => evidenceMatchesCriterion(entry, criterion)))
      .map((entry) => ({ command: entry.command, result: entry.result, criteria: entry.criteria || [], at: entry.at, seq: entry.seq || null })),
    criteria: items,
  };
}

/** Explain what currently stops a criterion from counting as covered. */
function describeGap(status, missing, dirty) {
  if (status === "missing") return "No recorded evidence references this acceptance criterion.";
  if (status === "stale") return "Recorded evidence is older than the latest edit.";
  if (missing.length > 0) return `Missing or stale reviewer gates: ${missing.join(", ")}.`;
  if (dirty) return "Session is dirty; rerun reviews after the latest change.";
  return "None recorded.";
}

/** Suggest the next step for a criterion based on its coverage status. */
function describeNextAction(status) {
  if (status === "covered") return "No action required for this criterion.";
  if (status === "missing") return "Run verification and record it with goal_evidence, including this criterion.";
  if (status === "stale") return "Rerun verification after the latest edit and record fresh evidence.";
  return "Complete missing/stale reviewer gates after verification.";
}
@@ -33,6 +33,9 @@ export function buildSystemInjection(state, config) {
33
33
  lines.push(`- Verification observed: ${r.verificationSeen ? "yes" : "no"}.`);
34
34
  lines.push(`- Required review gates: ${bullet(r.requiredGates)}.`);
35
35
  lines.push(`- Gates still missing or stale: ${bullet(r.missingGates)}.`);
36
+ if (r.reviewerMemory.open.length) {
37
+ lines.push(`- Open Reviewer Memory: ${r.reviewerMemory.open.map((m) => `${m.agent}: ${m.finding}`).join(" | ")}.`);
38
+ }
36
39
  lines.push(
37
40
  `- Completion is currently ${r.completionAllowed ? "ALLOWED" : "BLOCKED"}. ` +
38
41
  (r.completionAllowed