opencode-goal-mode 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@
12
12
  */
13
13
 
14
14
  import { tool } from "@opencode-ai/plugin";
15
- import { statusReport } from "./summary.js";
15
+ import { evidenceMapReport, reviewerMemoryReport, statusReport } from "./summary.js";
16
16
  import { recordEvidence } from "./events.js";
17
17
  import { refreshStickyGates } from "./gates.js";
18
18
  import { createState } from "./state.js";
@@ -46,6 +46,40 @@ export function createGoalTools({ store, config, persist }) {
46
46
  },
47
47
  }),
48
48
 
49
+ goal_evidence_map: tool({
50
+ description:
51
+ "Return an authoritative read-only evidence map for this session: each acceptance " +
52
+ "criterion, matching recorded evidence, required reviewer gate status, coverage status, " +
53
+ "gaps, and next action.",
54
+ args: {},
55
+ async execute(_args, ctx) {
56
+ const state = store.stateFor(ctx.sessionID);
57
+ const report = evidenceMapReport(state, config);
58
+ const covered = report.criteria.filter((item) => item.status === "covered").length;
59
+ return {
60
+ title: `Evidence map: ${covered}/${report.criteria.length} criteria covered`,
61
+ output: JSON.stringify(report, null, 2),
62
+ metadata: { criteriaCount: report.criteria.length, coveredCount: covered, missingGates: report.missingGates },
63
+ };
64
+ },
65
+ }),
66
+
67
+ goal_reviewer_memory: tool({
68
+ description:
69
+ "Return durable Reviewer Memory for this session: unresolved and recently resolved " +
70
+ "reviewer findings carried across cycles. Read-only.",
71
+ args: {},
72
+ async execute(_args, ctx) {
73
+ const state = store.stateFor(ctx.sessionID);
74
+ const report = reviewerMemoryReport(state);
75
+ return {
76
+ title: `Reviewer Memory: ${report.open.length} open findings`,
77
+ output: JSON.stringify(report, null, 2),
78
+ metadata: { openCount: report.open.length, total: report.total },
79
+ };
80
+ },
81
+ }),
82
+
49
83
  goal_contract: tool({
50
84
  description:
51
85
  "Record or update the Goal Contract for this session (the explicit requirements, " +
@@ -68,17 +68,54 @@ export function latestVerdictFor(state, agent) {
68
68
  return state.latestVerdict[agent] || null;
69
69
  }
70
70
 
71
/**
 * Condense a reviewer's free-form output into a one-line finding summary.
 *
 * Every line is stripped of leading Markdown decoration (quote markers,
 * heading hashes, bullets, ordered-list numbers, emphasis markers). Lines that
 * are only a section heading, and the verdict line itself, are then discarded.
 * The first remaining line that mentions a problem keyword is returned; if
 * none does, the first remaining line is used; if nothing remains, a generic
 * message is returned.
 *
 * @param {unknown} text - Raw reviewer output; falsy values are treated as empty.
 * @returns {string} The summary, truncated to at most 240 characters.
 */
function summarizeFinding(text) {
  const headingRe = /^(blocking findings?|findings?|non-blocking findings?|open questions?|summary|verdict|blocking|issues?)[:\s]*$/i;
  const verdictRe = /^verdict:?\s*(pass|fail)\b/i;
  const keywordRe = /block|fail|finding|risk|missing|gap|regression/i;
  // Leading decoration: whitespace, ">", "#", "*", "_", "-" and "1." / "1)" list numbers.
  // Without "#" and list numbers, "## Blocking findings" survived the heading
  // filter and was itself picked as the finding.
  const decorationRe = /^(?:[\s>#*_-]|\d+[.)]\s)+/;
  // Emphasis markers are ignored only while classifying a line, so that
  // "**Blocking findings:**" and "Verdict: **FAIL**" are still recognized
  // without altering the text that is eventually returned.
  const withoutEmphasis = (line) => line.replace(/[*_]/g, "").trim();

  const lines = String(text || "")
    .split(/\r?\n/)
    .map((line) => line.replace(decorationRe, "").trim())
    .filter(Boolean)
    .filter((line) => {
      const bare = withoutEmphasis(line);
      return !headingRe.test(bare) && !verdictRe.test(bare);
    });

  const blocking = lines.find((line) => keywordRe.test(line));
  return String(blocking || lines[0] || "Reviewer reported a blocking finding.").slice(0, 240);
}
82
+
83
/**
 * Fold a review verdict into the session's Reviewer Memory
 * (`state.reviewerMemory`), creating the list when it is missing or not an array.
 *
 * - A `"PASS"` verdict marks every open finding from `agent` as resolved and
 *   stamps it with `resolvedAt` / `resolvedSeq`.
 * - Any other verdict records a blocking finding summarized from `text`. If the
 *   same agent already has an identical open finding, that entry's `count`,
 *   `lastAt` and `lastSeq` are updated instead of adding a duplicate.
 *
 * The list is capped at `maxEntries`. When the cap is exceeded, the oldest
 * entries that are no longer open are evicted first, so unresolved findings are
 * not dropped while resolved ones are still being kept; only if that is not
 * enough are the oldest remaining entries removed.
 *
 * @param {object} state - Session state; mutated in place.
 * @param {string} agent - Reviewing agent the verdict belongs to.
 * @param {string} verdict - Parsed verdict; only `"PASS"` resolves findings.
 * @param {string} at - Timestamp stored on the affected entries.
 * @param {number} seq - Sequence number stored on the affected entries.
 * @param {string} text - Raw reviewer output used to derive the finding summary.
 * @param {number} [maxEntries=100] - Maximum number of entries to retain.
 * @returns {void}
 */
function updateReviewerMemory(state, agent, verdict, at, seq, text, maxEntries = 100) {
  if (!Array.isArray(state.reviewerMemory)) state.reviewerMemory = [];
  const memory = state.reviewerMemory;

  if (verdict === "PASS") {
    for (const item of memory) {
      if (item.agent === agent && item.status === "open") {
        item.status = "resolved";
        item.resolvedAt = at;
        item.resolvedSeq = seq;
      }
    }
    return;
  }

  const finding = summarizeFinding(text);
  const existing = memory.find(
    (item) => item.agent === agent && item.status === "open" && item.finding === finding,
  );
  if (existing) {
    existing.lastAt = at;
    existing.lastSeq = seq;
    existing.count += 1;
  } else {
    memory.push({
      agent,
      finding,
      severity: "blocking",
      status: "open",
      firstAt: at,
      firstSeq: seq,
      lastAt: at,
      lastSeq: seq,
      count: 1,
    });
  }

  // Enforce the cap in place (other code may hold a reference to the array).
  let overflow = memory.length - maxEntries;
  for (let index = 0; overflow > 0 && index < memory.length; ) {
    if (memory[index].status !== "open") {
      memory.splice(index, 1);
      overflow -= 1;
    } else {
      index += 1;
    }
  }
  if (overflow > 0) memory.splice(0, overflow);
}
106
+
71
107
  /**
72
108
  * Record a review verdict for `agent`, stamping it with the next monotonic seq.
73
109
  * Increments the review-cycle count when the cycle-closing agent reports.
74
110
  */
75
- export function recordVerdict(store, state, agent, verdict) {
111
+ export function recordVerdict(store, state, agent, verdict, text = "") {
76
112
  const at = store.nowIso();
77
113
  const seq = store.nextSeq();
78
114
  const entry = { agent, verdict, at, seq };
79
115
  state.verdicts.push(entry);
80
116
  if (state.verdicts.length > 200) state.verdicts.splice(0, state.verdicts.length - 200);
81
117
  state.latestVerdict[agent] = { verdict, at, seq };
118
+ updateReviewerMemory(state, agent, verdict, at, seq, text);
82
119
  state.lastReviewAt = at;
83
120
  state.lastReviewSeq = seq;
84
121
  state.updatedAt = at;
@@ -183,16 +183,18 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
183
183
  if (tool === "task") {
184
184
  const sub = normalizedSubagent(inp);
185
185
  if (isReviewAgent(sub)) {
186
- const verdict = parseVerdict(textOf(out));
186
+ const text = textOf(out);
187
+ const verdict = parseVerdict(text);
187
188
  if (verdict) {
188
- recordVerdict(store, state, sub, verdict);
189
+ recordVerdict(store, state, sub, verdict, text);
189
190
  recordedAgent = sub;
190
191
  }
191
192
  }
192
193
  } else if (isReviewAgent(state.currentAgent)) {
193
- const verdict = parseVerdict(textOf(out));
194
+ const text = textOf(out);
195
+ const verdict = parseVerdict(text);
194
196
  if (verdict) {
195
- recordVerdict(store, state, state.currentAgent, verdict);
197
+ recordVerdict(store, state, state.currentAgent, verdict, text);
196
198
  recordedAgent = state.currentAgent;
197
199
  }
198
200
  }
@@ -231,7 +233,7 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
231
233
  const state = store.stateFor(inp.sessionID);
232
234
  out.context.push(
233
235
  `Goal Guard state: ${summarizeState(state, config)}. Preserve Goal Contract, Verification Ledger, ` +
234
- `Review Ledger, review cycle count, dirty state, and open findings across compaction.`,
236
+ `Review Ledger, Reviewer Memory, review cycle count, dirty state, and open findings across compaction.`,
235
237
  );
236
238
  } catch {
237
239
  /* ignore */
@@ -10,7 +10,7 @@ links resolve in the npm package, but they are not runtime files.
10
10
  | [opencode-plugin-platform.md](opencode-plugin-platform.md) | Verified OpenCode plugin-runtime facts (hooks, discovery, permissions, tools) from `@opencode-ai/plugin@1.15.13` source. The pinned runtime reference the plugin is built against. |
11
11
  | [goal-mode-comparison.md](goal-mode-comparison.md) | How Goal Mode's mechanical enforcement compares to Claude Code and OpenAI Codex, with citations and honest caveats. |
12
12
  | [shell-hardening.md](shell-hardening.md) | The shell-analyzer threat model: the bypass classes the old regex guard missed and how the tokenizer closes each. |
13
- | [benchmarks.md](benchmarks.md) | Benchmark methodology and results (detection rate, false positives, latency). Reproduce with `npm run bench`. |
13
+ | [benchmarks.md](benchmarks.md) | Benchmark methodology and results (shell guard accuracy plus completion truthfulness). Reproduce charts with `npm run bench` and JSON with `npm run bench:truthfulness`. |
14
14
 
15
15
  Every non-obvious platform claim in these documents was verified against the
16
16
  installed `@opencode-ai/plugin` type definitions and/or the `sst/opencode`
@@ -5,6 +5,7 @@ checkout. Run:
5
5
 
6
6
  ```bash
7
7
  npm run bench # detection / false-positive / latency benchmark
8
+ npm run bench:truthfulness # print the completion truthfulness benchmark JSON
8
9
  npm run bench:compare # regenerate the capability-comparison chart
9
10
  ```
10
11
 
@@ -28,6 +29,14 @@ README embeds.
28
29
  marks the session dirty but does not block, so it is not counted here.
29
30
  - **Metrics**: detection rate (recall over destructive commands),
30
31
  false-positive rate (safe commands wrongly blocked), and per-command latency.
32
+ - **False Completion Dataset** (`benchmarks/completion-corpus.mjs`): labeled final
33
+ answer scenarios for premature and valid completion claims. It checks whether
34
+ `completion.js` blocks missing review-cycle lines, zero cycles, stale reviews,
35
+ mismatched cycle counts, missing contextual gates, and allows inactive or valid
36
+ completions.
37
+ - **Truthfulness Score** (`benchmarks/truthfulness.mjs`): weighted score over the
38
+ dataset, combining 65% decision accuracy (blocked vs. allowed) with 35% reason accuracy for
39
+ blocked false-completion claims.
31
40
 
32
41
  ## Results
33
42
 
@@ -43,6 +52,16 @@ accuracy figures do not):
43
52
  | Detection — remote-exec | 0% (0/3) | 100% (3/3) |
44
53
  | Latency per command | ~2.3 µs | ~3.8 µs |
45
54
 
55
+ False Completion Dataset run:
56
+
57
+ | Metric | Goal Mode |
58
+ | --- | --- |
59
+ | Truthfulness score | **100.0%** |
60
+ | Decision accuracy | **100.0%** |
61
+ | Reason accuracy | **100.0%** |
62
+ | False-completion block rate | **100.0%** |
63
+ | Valid-completion allow rate | **100.0%** |
64
+
46
65
  The legacy guard catches only the *classic* family and misses every obfuscated
47
66
  and remote-execution command, while wrongly blocking 1-in-5 benign commands. The
48
67
  tokenizer catches the entire corpus with zero false positives, for an extra
@@ -61,3 +80,5 @@ hundreds of thousands of classifications per second).
61
80
  - "100% on this corpus" means 100% of the labeled set; new bypass classes that
62
81
  are discovered get added to the corpus and fixed (that is how the second-wave
63
82
  findings — `sudo -u`, `pnpm dlx`, interpreter shell-out — entered it).
83
+ - The Truthfulness Score measures accuracy on this corpus of mechanical completion claims;
84
+ it is not a global claim that an LLM's prose is semantically true in every domain.