opencode-goal-mode 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +16 -7
- package/CHANGELOG.md +9 -0
- package/README.md +26 -8
- package/benchmarks/charts.mjs +176 -0
- package/benchmarks/comparison.mjs +48 -0
- package/benchmarks/completion-corpus.mjs +70 -0
- package/benchmarks/corpus.mjs +92 -0
- package/benchmarks/legacy-analyzer.mjs +54 -0
- package/benchmarks/run.mjs +198 -0
- package/benchmarks/truthfulness.mjs +64 -0
- package/commands/goal-evidence-map.md +27 -0
- package/docs/benchmarks/latency.svg +3 -3
- package/docs/benchmarks/results.json +103 -4
- package/docs/benchmarks/truthfulness-score.svg +17 -0
- package/package.json +3 -1
- package/plugins/goal-guard/events.js +6 -3
- package/plugins/goal-guard/state.js +2 -1
- package/plugins/goal-guard/summary.js +105 -1
- package/plugins/goal-guard/system.js +3 -0
- package/plugins/goal-guard/tools.js +35 -1
- package/plugins/goal-guard/verdicts.js +38 -1
- package/plugins/goal-guard.js +7 -5
- package/research/README.md +1 -1
- package/research/benchmarks.md +21 -0
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
import { tool } from "@opencode-ai/plugin";
|
|
15
|
-
import { statusReport } from "./summary.js";
|
|
15
|
+
import { evidenceMapReport, reviewerMemoryReport, statusReport } from "./summary.js";
|
|
16
16
|
import { recordEvidence } from "./events.js";
|
|
17
17
|
import { refreshStickyGates } from "./gates.js";
|
|
18
18
|
import { createState } from "./state.js";
|
|
@@ -46,6 +46,40 @@ export function createGoalTools({ store, config, persist }) {
|
|
|
46
46
|
},
|
|
47
47
|
}),
|
|
48
48
|
|
|
49
|
+
goal_evidence_map: tool({
|
|
50
|
+
description:
|
|
51
|
+
"Return an authoritative read-only evidence map for this session: each acceptance " +
|
|
52
|
+
"criterion, matching recorded evidence, required reviewer gate status, coverage status, " +
|
|
53
|
+
"gaps, and next action.",
|
|
54
|
+
args: {},
|
|
55
|
+
async execute(_args, ctx) {
|
|
56
|
+
const state = store.stateFor(ctx.sessionID);
|
|
57
|
+
const report = evidenceMapReport(state, config);
|
|
58
|
+
const covered = report.criteria.filter((item) => item.status === "covered").length;
|
|
59
|
+
return {
|
|
60
|
+
title: `Evidence map: ${covered}/${report.criteria.length} criteria covered`,
|
|
61
|
+
output: JSON.stringify(report, null, 2),
|
|
62
|
+
metadata: { criteriaCount: report.criteria.length, coveredCount: covered, missingGates: report.missingGates },
|
|
63
|
+
};
|
|
64
|
+
},
|
|
65
|
+
}),
|
|
66
|
+
|
|
67
|
+
goal_reviewer_memory: tool({
|
|
68
|
+
description:
|
|
69
|
+
"Return durable Reviewer Memory for this session: unresolved and recently resolved " +
|
|
70
|
+
"reviewer findings carried across cycles. Read-only.",
|
|
71
|
+
args: {},
|
|
72
|
+
async execute(_args, ctx) {
|
|
73
|
+
const state = store.stateFor(ctx.sessionID);
|
|
74
|
+
const report = reviewerMemoryReport(state);
|
|
75
|
+
return {
|
|
76
|
+
title: `Reviewer Memory: ${report.open.length} open findings`,
|
|
77
|
+
output: JSON.stringify(report, null, 2),
|
|
78
|
+
metadata: { openCount: report.open.length, total: report.total },
|
|
79
|
+
};
|
|
80
|
+
},
|
|
81
|
+
}),
|
|
82
|
+
|
|
49
83
|
goal_contract: tool({
|
|
50
84
|
description:
|
|
51
85
|
"Record or update the Goal Contract for this session (the explicit requirements, " +
|
|
@@ -68,17 +68,54 @@ export function latestVerdictFor(state, agent) {
|
|
|
68
68
|
return state.latestVerdict[agent] || null;
|
|
69
69
|
}
|
|
70
70
|
|
|
71
|
+
/**
 * Reduce a reviewer's free-form output to one short finding line.
 *
 * Every line is stripped of leading Markdown decoration (quote markers, bullets,
 * emphasis, ATX heading hashes) and closing `*` emphasis. Blank lines, bare
 * section headings ("Findings:", "## Blocking findings", "**Summary**") and
 * verdict lines ("Verdict: FAIL", "**Verdict:** FAIL") are then discarded.
 * The first remaining line that mentions a problem keyword wins; otherwise the
 * first remaining line is used; otherwise a generic fallback is returned.
 *
 * @param {unknown} text - Raw reviewer output; falsy values are treated as "".
 * @returns {string} The finding summary, truncated to at most 240 characters.
 */
function summarizeFinding(text) {
  const headingRe = /^(blocking findings?|findings?|non-blocking findings?|open questions?|summary|verdict|blocking|issues?)[:\s]*$/i;
  // Tolerates emphasis markers between the label and the value ("Verdict:** FAIL").
  const verdictRe = /^verdict[\s:*_]*(pass|fail)\b/i;
  const keywordRe = /block|fail|finding|risk|missing|gap|regression/i;

  const normalize = (line) =>
    line
      .replace(/^[\s>*_-]+/, "")
      // ATX heading hashes; whitespace is required so "#123 ..." references stay intact.
      .replace(/^#{1,6}\s+/, "")
      // Emphasis that followed the hashes ("## **Findings**").
      .replace(/^[\s*_]+/, "")
      // Closing asterisk emphasis, keeping a trailing colon ("Findings**:" -> "Findings:").
      .replace(/\*+(:?)\s*$/, "$1")
      .trim();

  const candidates = String(text || "")
    .split(/\r?\n/)
    .map(normalize)
    .filter((line) => line && !headingRe.test(line) && !verdictRe.test(line));

  const flagged = candidates.find((line) => keywordRe.test(line));
  return String(flagged || candidates[0] || "Reviewer reported a blocking finding.").slice(0, 240);
}
|
|
82
|
+
|
|
83
|
+
/**
 * Fold one review verdict into `state.reviewerMemory` (created on first use).
 *
 * - A "PASS" verdict marks every open entry belonging to `agent` as resolved,
 *   stamping it with `at` / `seq`, and records nothing new.
 * - Any other verdict is summarized via `summarizeFinding(text)`. If the same
 *   agent already has an open entry with that exact finding, its last-seen
 *   stamps and `count` are bumped; otherwise a new open entry is appended.
 *
 * The list is capped at 100 entries. When over the cap, resolved entries are
 * evicted first (oldest first) so unresolved findings are not lost while
 * already-resolved history is still being kept; only if that is not enough are
 * the oldest remaining entries dropped.
 *
 * @param {object} state - Session state; `state.reviewerMemory` is mutated in place.
 * @param {string} agent - Reviewer agent the verdict belongs to.
 * @param {string} verdict - Parsed verdict; only the exact value "PASS" resolves findings.
 * @param {string} at - Timestamp to stamp on the affected entries.
 * @param {number} seq - Sequence number to stamp on the affected entries.
 * @param {string} text - Raw reviewer output used to derive the finding summary.
 * @returns {void}
 */
function updateReviewerMemory(state, agent, verdict, at, seq, text) {
  const maxEntries = 100;
  state.reviewerMemory ||= [];
  const memory = state.reviewerMemory;

  if (verdict === "PASS") {
    for (const item of memory) {
      if (item.agent === agent && item.status === "open") {
        item.status = "resolved";
        item.resolvedAt = at;
        item.resolvedSeq = seq;
      }
    }
    return;
  }

  const finding = summarizeFinding(text);
  const existing = memory.find(
    (item) => item.agent === agent && item.status === "open" && item.finding === finding,
  );
  if (existing) {
    existing.lastAt = at;
    existing.lastSeq = seq;
    existing.count += 1;
  } else {
    memory.push({
      agent,
      finding,
      severity: "blocking",
      status: "open",
      firstAt: at,
      firstSeq: seq,
      lastAt: at,
      lastSeq: seq,
      count: 1,
    });
  }

  // Enforce the cap in place: resolved entries go first, oldest first.
  let excess = memory.length - maxEntries;
  for (let i = 0; excess > 0 && i < memory.length; ) {
    if (memory[i].status === "open") {
      i += 1;
    } else {
      memory.splice(i, 1);
      excess -= 1;
    }
  }
  // Everything left is open; fall back to dropping the oldest entries.
  if (excess > 0) memory.splice(0, excess);
}
|
|
106
|
+
|
|
71
107
|
/**
|
|
72
108
|
* Record a review verdict for `agent`, stamping it with the next monotonic seq.
|
|
73
109
|
* Increments the review-cycle count when the cycle-closing agent reports.
|
|
74
110
|
*/
|
|
75
|
-
export function recordVerdict(store, state, agent, verdict) {
|
|
111
|
+
export function recordVerdict(store, state, agent, verdict, text = "") {
|
|
76
112
|
const at = store.nowIso();
|
|
77
113
|
const seq = store.nextSeq();
|
|
78
114
|
const entry = { agent, verdict, at, seq };
|
|
79
115
|
state.verdicts.push(entry);
|
|
80
116
|
if (state.verdicts.length > 200) state.verdicts.splice(0, state.verdicts.length - 200);
|
|
81
117
|
state.latestVerdict[agent] = { verdict, at, seq };
|
|
118
|
+
updateReviewerMemory(state, agent, verdict, at, seq, text);
|
|
82
119
|
state.lastReviewAt = at;
|
|
83
120
|
state.lastReviewSeq = seq;
|
|
84
121
|
state.updatedAt = at;
|
package/plugins/goal-guard.js
CHANGED
|
@@ -183,16 +183,18 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
|
|
|
183
183
|
if (tool === "task") {
|
|
184
184
|
const sub = normalizedSubagent(inp);
|
|
185
185
|
if (isReviewAgent(sub)) {
|
|
186
|
-
const
|
|
186
|
+
const text = textOf(out);
|
|
187
|
+
const verdict = parseVerdict(text);
|
|
187
188
|
if (verdict) {
|
|
188
|
-
recordVerdict(store, state, sub, verdict);
|
|
189
|
+
recordVerdict(store, state, sub, verdict, text);
|
|
189
190
|
recordedAgent = sub;
|
|
190
191
|
}
|
|
191
192
|
}
|
|
192
193
|
} else if (isReviewAgent(state.currentAgent)) {
|
|
193
|
-
const
|
|
194
|
+
const text = textOf(out);
|
|
195
|
+
const verdict = parseVerdict(text);
|
|
194
196
|
if (verdict) {
|
|
195
|
-
recordVerdict(store, state, state.currentAgent, verdict);
|
|
197
|
+
recordVerdict(store, state, state.currentAgent, verdict, text);
|
|
196
198
|
recordedAgent = state.currentAgent;
|
|
197
199
|
}
|
|
198
200
|
}
|
|
@@ -231,7 +233,7 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
|
|
|
231
233
|
const state = store.stateFor(inp.sessionID);
|
|
232
234
|
out.context.push(
|
|
233
235
|
`Goal Guard state: ${summarizeState(state, config)}. Preserve Goal Contract, Verification Ledger, ` +
|
|
234
|
-
`Review Ledger, review cycle count, dirty state, and open findings across compaction.`,
|
|
236
|
+
`Review Ledger, Reviewer Memory, review cycle count, dirty state, and open findings across compaction.`,
|
|
235
237
|
);
|
|
236
238
|
} catch {
|
|
237
239
|
/* ignore */
|
package/research/README.md
CHANGED
|
@@ -10,7 +10,7 @@ links resolve in the npm package, but they are not runtime files.
|
|
|
10
10
|
| [opencode-plugin-platform.md](opencode-plugin-platform.md) | Verified OpenCode plugin-runtime facts (hooks, discovery, permissions, tools) from `@opencode-ai/plugin@1.15.13` source. The pinned runtime reference the plugin is built against. |
|
|
11
11
|
| [goal-mode-comparison.md](goal-mode-comparison.md) | How Goal Mode's mechanical enforcement compares to Claude Code and OpenAI Codex, with citations and honest caveats. |
|
|
12
12
|
| [shell-hardening.md](shell-hardening.md) | The shell-analyzer threat model: the bypass classes the old regex guard missed and how the tokenizer closes each. |
|
|
13
|
-
| [benchmarks.md](benchmarks.md) | Benchmark methodology and results (
|
|
13
|
+
| [benchmarks.md](benchmarks.md) | Benchmark methodology and results (shell guard accuracy plus completion truthfulness). Reproduce charts with `npm run bench` and JSON with `npm run bench:truthfulness`. |
|
|
14
14
|
|
|
15
15
|
Every non-obvious platform claim in these documents was verified against the
|
|
16
16
|
installed `@opencode-ai/plugin` type definitions and/or the `sst/opencode`
|
package/research/benchmarks.md
CHANGED
|
@@ -5,6 +5,7 @@ checkout. Run:
|
|
|
5
5
|
|
|
6
6
|
```bash
|
|
7
7
|
npm run bench # detection / false-positive / latency benchmark
|
|
8
|
+
npm run bench:truthfulness # print the completion truthfulness benchmark JSON
|
|
8
9
|
npm run bench:compare # regenerate the capability-comparison chart
|
|
9
10
|
```
|
|
10
11
|
|
|
@@ -28,6 +29,14 @@ README embeds.
|
|
|
28
29
|
marks the session dirty but does not block, so it is not counted here.
|
|
29
30
|
- **Metrics**: detection rate (recall over destructive commands),
|
|
30
31
|
false-positive rate (safe commands wrongly blocked), and per-command latency.
|
|
32
|
+
- **False Completion Dataset** (`benchmarks/completion-corpus.mjs`): labeled final
|
|
33
|
+
answer scenarios for premature and valid completion claims. It checks whether
|
|
34
|
+
`completion.js` blocks missing review-cycle lines, zero cycles, stale reviews,
|
|
35
|
+
mismatched cycle counts, missing contextual gates, and allows inactive or valid
|
|
36
|
+
completions.
|
|
37
|
+
- **Truthfulness Score** (`benchmarks/truthfulness.mjs`): weighted score over the
|
|
38
|
+
dataset: 65% decision accuracy (blocked vs allowed) and 35% reason accuracy for
|
|
39
|
+
blocked false-completion claims.
|
|
31
40
|
|
|
32
41
|
## Results
|
|
33
42
|
|
|
@@ -43,6 +52,16 @@ accuracy figures do not):
|
|
|
43
52
|
| Detection — remote-exec | 0% (0/3) | 100% (3/3) |
|
|
44
53
|
| Latency per command | ~2.3 µs | ~3.8 µs |
|
|
45
54
|
|
|
55
|
+
False Completion Dataset run:
|
|
56
|
+
|
|
57
|
+
| Metric | Goal Mode |
|
|
58
|
+
| --- | --- |
|
|
59
|
+
| Truthfulness score | **100.0%** |
|
|
60
|
+
| Decision accuracy | **100.0%** |
|
|
61
|
+
| Reason accuracy | **100.0%** |
|
|
62
|
+
| False-completion block rate | **100.0%** |
|
|
63
|
+
| Valid-completion allow rate | **100.0%** |
|
|
64
|
+
|
|
46
65
|
The legacy guard catches only the *classic* family and misses every obfuscated
|
|
47
66
|
and remote-execution command, while wrongly blocking 1-in-5 benign commands. The
|
|
48
67
|
tokenizer catches the entire corpus with zero false positives, for an extra
|
|
@@ -61,3 +80,5 @@ hundreds of thousands of classifications per second).
|
|
|
61
80
|
- "100% on this corpus" means 100% of the labeled set; new bypass classes that
|
|
62
81
|
are discovered get added to the corpus and fixed (that is how the second-wave
|
|
63
82
|
findings — `sudo -u`, `pnpm dlx`, interpreter shell-out — entered it).
|
|
83
|
+
- The Truthfulness Score is a corpus-level accuracy measure for mechanical completion claims,
|
|
84
|
+
not a global claim that an LLM's prose is semantically true in every domain.
|