opencode-goal-mode 0.2.4 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +39 -0
- package/CHANGELOG.md +31 -0
- package/README.md +67 -24
- package/benchmarks/build-external-corpus.mjs +177 -0
- package/benchmarks/external-corpus.json +3540 -0
- package/benchmarks/external.mjs +110 -0
- package/benchmarks/run.mjs +78 -24
- package/commands/goal.md +16 -1
- package/docs/benchmarks/detection-by-family.svg +2 -2
- package/docs/benchmarks/external-scorecard.svg +32 -0
- package/docs/benchmarks/latency.svg +3 -3
- package/docs/benchmarks/overall-scorecard.svg +2 -2
- package/docs/benchmarks/results.json +112 -71
- package/docs/benchmarks/truthfulness-score.svg +2 -2
- package/package.json +4 -1
- package/plugins/goal-guard/config.js +12 -0
- package/plugins/goal-guard/shell.js +4 -3
- package/plugins/goal-guard/sidebar-data.js +73 -0
- package/plugins/goal-guard/summary.js +38 -0
- package/plugins/goal-guard/tools.js +8 -2
- package/plugins/goal-guard.js +13 -0
- package/plugins/goal-sidebar.js +144 -0
- package/research/benchmarks.md +75 -69
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Run the shell guard against the EXTERNAL, third-party-authored corpus built by
|
|
4
|
+
* build-external-corpus.mjs (real tldr-pages commands). This is the honest
|
|
5
|
+
* benchmark: the analyzer authors did not write or curate these commands, so the
|
|
6
|
+
* detection / false-positive numbers reflect real-world behavior, warts and all.
|
|
7
|
+
*
|
|
8
|
+
* It deliberately also reports DISAGREEMENTS between the analyzer and the
|
|
9
|
+
* independent ground-truth labeler, so misses and false positives are auditable
|
|
10
|
+
* rather than averaged away.
|
|
11
|
+
*
|
|
12
|
+
* node benchmarks/external.mjs # summary
|
|
13
|
+
* node benchmarks/external.mjs --json # full machine-readable result
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { readFileSync } from "node:fs";
|
|
17
|
+
import { join, dirname } from "node:path";
|
|
18
|
+
import { fileURLToPath } from "node:url";
|
|
19
|
+
import * as current from "../plugins/goal-guard/shell.js";
|
|
20
|
+
import * as legacy from "./legacy-analyzer.mjs";
|
|
21
|
+
|
|
22
|
+
const here = dirname(fileURLToPath(import.meta.url));
|
|
23
|
+
|
|
24
|
+
/**
 * Read and parse the external corpus JSON stored in this module's directory.
 *
 * @returns {object} The parsed contents of `external-corpus.json`.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
export function loadExternalCorpus() {
  const corpusPath = join(here, "external-corpus.json");
  const rawJson = readFileSync(corpusPath, "utf8");
  return JSON.parse(rawJson);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Report whether an analyzer would block a command.
 *
 * @param {object} analyzer - Object exposing an `analyzeCommand(cmd)` function.
 * @param {*} cmd - Command passed straight through to `analyzeCommand`.
 * @returns {boolean} True when the analysis result has a truthy `destructive`
 *   or `networkExec` property, false otherwise.
 */
function blocked(analyzer, cmd) {
  const verdict = analyzer.analyzeCommand(cmd);
  if (verdict.destructive) {
    return true;
  }
  return Boolean(verdict.networkExec);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Score one analyzer against labeled entries (each `{cmd, page, destructive}`).
 *
 * Destructive entries the analyzer does not block are recorded as misses;
 * non-destructive entries it does block are recorded as false positives.
 *
 * @param {object} analyzer - Analyzer handed to `blocked` for every entry.
 * @param {Iterable<object>} labeled - Labeled corpus entries.
 * @returns {object} `detectionRate` / `falsePositiveRate` as percentages (0 when
 *   the corresponding total is 0), the raw counts, and the `{cmd, page}` lists
 *   of `misses` and `falsePositives`.
 */
function score(analyzer, labeled) {
  const misses = [];
  const falsePositives = [];
  let destTotal = 0;
  let safeTotal = 0;

  for (const entry of labeled) {
    const flagged = blocked(analyzer, entry.cmd);
    if (entry.destructive) {
      destTotal += 1;
      if (!flagged) misses.push({ cmd: entry.cmd, page: entry.page });
      continue;
    }
    safeTotal += 1;
    if (flagged) falsePositives.push({ cmd: entry.cmd, page: entry.page });
  }

  // Every destructive entry is either caught or listed as a miss, and every
  // blocked safe entry is listed as a false positive, so the counters follow
  // directly from the list lengths.
  const destCaught = destTotal - misses.length;
  const safeFalsePos = falsePositives.length;
  const asPercent = (hits, total) => (total ? (hits / total) * 100 : 0);

  return {
    detectionRate: asPercent(destCaught, destTotal),
    falsePositiveRate: asPercent(safeFalsePos, safeTotal),
    destCaught,
    destTotal,
    safeFalsePos,
    safeTotal,
    misses,
    falsePositives,
  };
}
|
|
66
|
+
|
|
67
|
+
/**
 * Run the legacy and current analyzers over the external corpus.
 *
 * Labels are positional: the corpus is written destructive-first then safe
 * (see build-external-corpus.mjs), so `totals.destructiveFound` is the index at
 * which the safe entries begin and the labeler does not need to be re-run.
 * Because every reported rate depends on that boundary, it is validated up
 * front instead of silently mislabeling entries when the corpus file is stale
 * or hand-edited.
 *
 * @returns {object} `{source, commit, totals, sampleSize, legacy, current}`,
 *   where `legacy` and `current` are the `score` results for each analyzer.
 * @throws {Error} If `totals.destructiveFound` is not an integer between 0 and
 *   the number of corpus entries.
 */
export function runExternalBenchmark() {
  const corpus = loadExternalCorpus();
  const boundary = corpus.totals.destructiveFound;
  const entryCount = corpus.entries.length;
  if (!Number.isInteger(boundary) || boundary < 0 || boundary > entryCount) {
    throw new Error(
      `external-corpus.json is inconsistent: totals.destructiveFound (${boundary}) ` +
        `must be an integer between 0 and entries.length (${entryCount})`,
    );
  }
  const labeled = corpus.entries.map((e, i) => ({ ...e, destructive: i < boundary }));
  return {
    source: corpus.source,
    commit: corpus.commit,
    totals: corpus.totals,
    sampleSize: labeled.length,
    legacy: score(legacy, labeled),
    current: score(current, labeled),
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Format a number as a percentage string with one decimal place.
 *
 * @param {number} n - Value already expressed in percent.
 * @returns {string} For example `93.3%`.
 */
function pct(n) {
  const oneDecimal = n.toFixed(1);
  return `${oneDecimal}%`;
}
|
|
85
|
+
|
|
86
|
+
// CLI entry point: runs only when process.argv[1] is this module's own file
// path, so importing the module does not print anything.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  const r = runExternalBenchmark();
  const wantsJson = process.argv.includes("--json");

  if (wantsJson) {
    // Full machine-readable result, including the complete disagreement lists.
    console.log(JSON.stringify(r, null, 2));
  } else {
    // Print a heading with the total count, at most PREVIEW_LIMIT entries, and
    // a trailer with the number of entries that were left out.
    const PREVIEW_LIMIT = 20;
    const printCapped = (heading, items) => {
      console.log(`${heading} (${items.length}):`);
      items.slice(0, PREVIEW_LIMIT).forEach((item) => {
        console.log(` - ${item.cmd} [${item.page}]`);
      });
      if (items.length > PREVIEW_LIMIT) {
        console.log(` … ${items.length - PREVIEW_LIMIT} more`);
      }
    };

    const { legacy: before, current: after, totals } = r;

    console.log("External shell-guard benchmark (third-party tldr-pages commands)");
    console.log("================================================================");
    console.log(`Source: ${r.source} @ ${r.commit.slice(0, 12)}`);
    console.log(
      `Sample: ${r.sampleSize} commands (${totals.destructiveFound} destructive [all found], ${totals.safeSampled}/${totals.safeFound} safe sampled)`,
    );
    console.log("");
    console.log(`Detection (destructive caught) legacy ${pct(before.detectionRate)} → current ${pct(after.detectionRate)}`);
    console.log(`False positives on safe commands legacy ${pct(before.falsePositiveRate)} → current ${pct(after.falsePositiveRate)}`);
    console.log("");
    printCapped("Current analyzer misses", after.misses);
    printCapped("Current analyzer false positives", after.falsePositives);
  }
}
|
package/benchmarks/run.mjs
CHANGED
|
@@ -20,6 +20,7 @@ import * as current from "../plugins/goal-guard/shell.js";
|
|
|
20
20
|
import * as legacy from "./legacy-analyzer.mjs";
|
|
21
21
|
import { groupedBarChart, horizontalBarChart } from "./charts.mjs";
|
|
22
22
|
import { runTruthfulnessBenchmark } from "./truthfulness.mjs";
|
|
23
|
+
import { runExternalBenchmark } from "./external.mjs";
|
|
23
24
|
|
|
24
25
|
const root = fileURLToPath(new URL("..", import.meta.url));
|
|
25
26
|
const outDir = join(root, "docs", "benchmarks");
|
|
@@ -92,6 +93,7 @@ function fmt(n) {
|
|
|
92
93
|
|
|
93
94
|
const legacyEval = evaluate(legacy);
|
|
94
95
|
const currentEval = evaluate(current);
|
|
96
|
+
const external = runExternalBenchmark();
|
|
95
97
|
const truthfulness = runTruthfulnessBenchmark();
|
|
96
98
|
const legacyOps = throughput(legacy);
|
|
97
99
|
const currentOps = throughput(current);
|
|
@@ -110,23 +112,74 @@ function familyRate(ev, fam) {
|
|
|
110
112
|
return f && f.destTotal ? (f.destCaught / f.destTotal) * 100 : 0;
|
|
111
113
|
}
|
|
112
114
|
|
|
115
|
+
// Compact view of the external benchmark for results.json: rates are rounded to
// one decimal and the per-command miss/false-positive lists are reduced to the
// bare command strings to keep the file readable. The full lists are always
// available via `node benchmarks/external.mjs --json`.
const externalSummary = (() => {
  // Fields reported identically for both analyzers, in the order they are written.
  const headlineNumbers = (scored) => ({
    detectionRate: Number(scored.detectionRate.toFixed(1)),
    falsePositiveRate: Number(scored.falsePositiveRate.toFixed(1)),
    destCaught: scored.destCaught,
    destTotal: scored.destTotal,
    safeFalsePos: scored.safeFalsePos,
    safeTotal: scored.safeTotal,
  });
  const { source, commit, totals, sampleSize } = external;
  return {
    source,
    commit,
    totals,
    sampleSize,
    legacy: headlineNumbers(external.legacy),
    current: {
      ...headlineNumbers(external.current),
      // Only the current analyzer's disagreements are listed.
      misses: external.current.misses.map((m) => m.cmd),
      falsePositives: external.current.falsePositives.map((f) => f.cmd),
    },
  };
})();
|
|
141
|
+
|
|
113
142
|
const results = {
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
143
|
+
// The honest, third-party benchmark: real commands the analyzer was never
|
|
144
|
+
// fitted to. This is the headline number.
|
|
145
|
+
external: externalSummary,
|
|
146
|
+
// Curated REGRESSION FIXTURES: a hand-authored set of known destructive
|
|
147
|
+
// patterns and their safe look-alikes. These define the patterns the analyzer
|
|
148
|
+
// is built to catch and guard against regressions — they are NOT an unbiased
|
|
149
|
+
// sample, so the 100%/0% here is "passes its own spec", not measured accuracy.
|
|
150
|
+
fixtures: {
|
|
151
|
+
corpusSize: CORPUS.length,
|
|
152
|
+
destructiveCount: CORPUS.filter((c) => c.label === "destructive").length,
|
|
153
|
+
safeCount: CORPUS.filter((c) => c.label === "safe").length,
|
|
154
|
+
legacy: { ...legacyEval, opsPerSec: legacyOps, usPerCommand: Number(legacyUs.toFixed(2)) },
|
|
155
|
+
current: { ...currentEval, opsPerSec: currentOps, usPerCommand: Number(currentUs.toFixed(2)) },
|
|
156
|
+
},
|
|
157
|
+
// Completion-enforcement fixtures (hand-authored policy cases), not a survey.
|
|
158
|
+
completionFixtures: truthfulness,
|
|
120
159
|
};
|
|
121
160
|
|
|
122
161
|
writeFileSync(join(outDir, "results.json"), JSON.stringify(results, null, 2));
|
|
123
162
|
|
|
124
|
-
//
|
|
163
|
+
// Headline chart: detection + false positives on the EXTERNAL third-party corpus.
|
|
164
|
+
writeFileSync(
|
|
165
|
+
join(outDir, "external-scorecard.svg"),
|
|
166
|
+
groupedBarChart({
|
|
167
|
+
title: "Guard accuracy on real third-party commands",
|
|
168
|
+
subtitle: `${external.sampleSize} tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.`,
|
|
169
|
+
groups: ["Detection rate", "False-positive rate"],
|
|
170
|
+
series: [
|
|
171
|
+
{ name: "Legacy regex guard", color: "#9aa0a6", values: [external.legacy.detectionRate, external.legacy.falsePositiveRate] },
|
|
172
|
+
{ name: "Goal Mode analyzer", color: "#2da44e", values: [external.current.detectionRate, external.current.falsePositiveRate] },
|
|
173
|
+
],
|
|
174
|
+
}),
|
|
175
|
+
);
|
|
176
|
+
|
|
177
|
+
// Chart 1: detection rate by command family (CURATED regression fixtures).
|
|
125
178
|
writeFileSync(
|
|
126
179
|
join(outDir, "detection-by-family.svg"),
|
|
127
180
|
groupedBarChart({
|
|
128
|
-
title: "
|
|
129
|
-
subtitle: `
|
|
181
|
+
title: "Detection by family — curated regression fixtures",
|
|
182
|
+
subtitle: `Curated patterns the analyzer is built to catch (not an unbiased sample). ${results.fixtures.destructiveCount} destructive fixtures.`,
|
|
130
183
|
groups: detFamilies.map((f) => FAMILY_LABELS[f]),
|
|
131
184
|
series: [
|
|
132
185
|
{ name: "Legacy regex guard", color: "#9aa0a6", values: detFamilies.map((f) => familyRate(legacyEval, f)) },
|
|
@@ -135,12 +188,12 @@ writeFileSync(
|
|
|
135
188
|
}),
|
|
136
189
|
);
|
|
137
190
|
|
|
138
|
-
// Chart 2: overall scorecard (
|
|
191
|
+
// Chart 2: overall scorecard on the CURATED fixtures (passes its own spec).
|
|
139
192
|
writeFileSync(
|
|
140
193
|
join(outDir, "overall-scorecard.svg"),
|
|
141
194
|
groupedBarChart({
|
|
142
|
-
title: "
|
|
143
|
-
subtitle: "
|
|
195
|
+
title: "Curated fixtures — passes its own spec",
|
|
196
|
+
subtitle: "Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.",
|
|
144
197
|
groups: ["Detection rate", "False-positive rate"],
|
|
145
198
|
series: [
|
|
146
199
|
{ name: "Legacy regex guard", color: "#9aa0a6", values: [legacyEval.detectionRate, legacyEval.falsePositiveRate] },
|
|
@@ -168,8 +221,8 @@ writeFileSync(
|
|
|
168
221
|
writeFileSync(
|
|
169
222
|
join(outDir, "truthfulness-score.svg"),
|
|
170
223
|
horizontalBarChart({
|
|
171
|
-
title: "
|
|
172
|
-
subtitle:
|
|
224
|
+
title: "Completion-enforcement fixtures",
|
|
225
|
+
subtitle: `${truthfulness.corpusSize} hand-authored policy cases (a spec, not a survey): premature claims blocked, valid ones allowed.`,
|
|
173
226
|
unit: "%",
|
|
174
227
|
max: 100,
|
|
175
228
|
rows: [
|
|
@@ -183,16 +236,17 @@ writeFileSync(
|
|
|
183
236
|
const pct = (n) => `${n.toFixed(1)}%`;
|
|
184
237
|
console.log("Goal Mode shell-guard benchmark");
|
|
185
238
|
console.log("================================");
|
|
186
|
-
console.log(`Corpus: ${results.corpusSize} commands (${results.destructiveCount} destructive, ${results.safeCount} safe)`);
|
|
187
239
|
console.log("");
|
|
188
|
-
console.log(`
|
|
189
|
-
console.log(`
|
|
190
|
-
console.log(`
|
|
191
|
-
console.log(`
|
|
240
|
+
console.log(`HEADLINE — external corpus: ${external.sampleSize} real tldr-pages commands @ ${external.commit.slice(0, 12)}`);
|
|
241
|
+
console.log(` (${external.totals.destructiveFound} destructive [all found] + ${external.totals.safeSampled}/${external.totals.safeFound} safe sampled)`);
|
|
242
|
+
console.log(` Detection legacy ${pct(external.legacy.detectionRate)} → Goal Mode ${pct(external.current.detectionRate)}`);
|
|
243
|
+
console.log(` False positives legacy ${pct(external.legacy.falsePositiveRate)} → Goal Mode ${pct(external.current.falsePositiveRate)}`);
|
|
244
|
+
console.log(` Remaining Goal Mode misses: ${external.current.misses.length} (mostly un-flagged single-target rm — see external.mjs --json)`);
|
|
192
245
|
console.log("");
|
|
193
|
-
console.log(
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
}
|
|
246
|
+
console.log(`Curated regression fixtures: ${results.fixtures.corpusSize} commands (defines patterns to catch; not an unbiased sample)`);
|
|
247
|
+
console.log(` Detection legacy ${pct(legacyEval.detectionRate)} → Goal Mode ${pct(currentEval.detectionRate)} (passes its own spec)`);
|
|
248
|
+
console.log(` False pos legacy ${pct(legacyEval.falsePositiveRate)} → Goal Mode ${pct(currentEval.falsePositiveRate)}`);
|
|
249
|
+
console.log(`Completion-enforcement fixtures: ${truthfulness.corpusSize} hand-authored policy cases, all pass (a spec, not a survey)`);
|
|
250
|
+
console.log(`Latency: Goal Mode ${currentUs.toFixed(2)} µs/cmd (${fmt(currentOps)}/s)`);
|
|
197
251
|
console.log("");
|
|
198
|
-
console.log(`Wrote results.json +
|
|
252
|
+
console.log(`Wrote results.json + 5 SVG charts to docs/benchmarks/`);
|
package/commands/goal.md
CHANGED
|
@@ -9,4 +9,19 @@ Start Goal Mode for this request:
|
|
|
9
9
|
$ARGUMENTS
|
|
10
10
|
```
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
Run this sequence:
|
|
13
|
+
|
|
14
|
+
1. **Seed the contract first.** Call the `goal_contract` tool with the original
|
|
15
|
+
request, explicit/inferred requirements, non-goals, and concrete acceptance
|
|
16
|
+
criteria. This activates enforcement, fixes the required specialist review
|
|
17
|
+
gates, and lights up the goal banner in the sidebar. Ask only essential
|
|
18
|
+
clarifying questions before recording it.
|
|
19
|
+
2. Delegate discovery and research to subagents; implement in the main agent.
|
|
20
|
+
3. Verify, and record each verification with the `goal_evidence` tool so it maps
|
|
21
|
+
to your acceptance criteria.
|
|
22
|
+
4. Run the required review cycles. Consult `goal_status` / `goal_evidence_map`
|
|
23
|
+
for the authoritative list of missing or stale gates rather than relying on
|
|
24
|
+
memory.
|
|
25
|
+
5. Only finish with `Goal Completed` (plus an accurate `Review cycles: N` line)
|
|
26
|
+
once every required gate has a fresh PASS — the guard will rewrite a premature
|
|
27
|
+
claim to `Goal Not Completed`.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
<svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
|
|
2
2
|
<rect width="720" height="380" fill="#ffffff"/>
|
|
3
|
-
<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">
|
|
4
|
-
<text x="48" y="47" font-size="12" fill="#656d76">
|
|
3
|
+
<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Detection by family — curated regression fixtures</text>
|
|
4
|
+
<text x="48" y="47" font-size="12" fill="#656d76">Curated patterns the analyzer is built to catch (not an unbiased sample). 48 destructive fixtures.</text>
|
|
5
5
|
<line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
|
|
6
6
|
<text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
|
|
7
7
|
<line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
|
|
2
|
+
<rect width="720" height="380" fill="#ffffff"/>
|
|
3
|
+
<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Guard accuracy on real third-party commands</text>
|
|
4
|
+
<text x="48" y="47" font-size="12" fill="#656d76">704 tldr-pages commands the analyzer was never fitted to. Detection higher = better; false positives lower = better.</text>
|
|
5
|
+
<line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
|
|
6
|
+
<text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
|
|
7
|
+
<line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
|
|
8
|
+
<text x="40" y="253.6" font-size="11" text-anchor="end" fill="#656d76">20%</text>
|
|
9
|
+
<line x1="48" y1="203.2" x2="700" y2="203.2" stroke="#eaeef2" stroke-width="1"/>
|
|
10
|
+
<text x="40" y="207.2" font-size="11" text-anchor="end" fill="#656d76">40%</text>
|
|
11
|
+
<line x1="48" y1="156.8" x2="700" y2="156.8" stroke="#eaeef2" stroke-width="1"/>
|
|
12
|
+
<text x="40" y="160.8" font-size="11" text-anchor="end" fill="#656d76">60%</text>
|
|
13
|
+
<line x1="48" y1="110.4" x2="700" y2="110.4" stroke="#eaeef2" stroke-width="1"/>
|
|
14
|
+
<text x="40" y="114.4" font-size="11" text-anchor="end" fill="#656d76">80%</text>
|
|
15
|
+
<line x1="48" y1="64.0" x2="700" y2="64.0" stroke="#eaeef2" stroke-width="1"/>
|
|
16
|
+
<text x="40" y="68.0" font-size="11" text-anchor="end" fill="#656d76">100%</text>
|
|
17
|
+
<rect x="56.0" y="171.1" width="151.0" height="124.9" rx="3" fill="#9aa0a6"/>
|
|
18
|
+
<text x="131.5" y="166.1" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">54%</text>
|
|
19
|
+
<rect x="215.0" y="79.6" width="151.0" height="216.4" rx="3" fill="#2da44e"/>
|
|
20
|
+
<text x="290.5" y="74.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">93%</text>
|
|
21
|
+
<text x="211.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">Detection rate</text>
|
|
22
|
+
<rect x="382.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#9aa0a6"/>
|
|
23
|
+
<text x="457.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
|
|
24
|
+
<rect x="541.0" y="295.6" width="151.0" height="0.4" rx="3" fill="#2da44e"/>
|
|
25
|
+
<text x="616.5" y="290.6" font-size="11" font-weight="600" text-anchor="middle" fill="#1f2328">0%</text>
|
|
26
|
+
<text x="537.0" y="314.0" font-size="11" text-anchor="middle" fill="#1f2328">False-positive rate</text>
|
|
27
|
+
<line x1="48" y1="296" x2="700" y2="296" stroke="#d0d7de" stroke-width="1.5"/>
|
|
28
|
+
<rect x="48" y="344" width="12" height="12" rx="2" fill="#9aa0a6"/>
|
|
29
|
+
<text x="66" y="354" font-size="12" fill="#1f2328">Legacy regex guard</text>
|
|
30
|
+
<rect x="201.6" y="344" width="12" height="12" rx="2" fill="#2da44e"/>
|
|
31
|
+
<text x="219.6" y="354" font-size="12" fill="#1f2328">Goal Mode analyzer</text>
|
|
32
|
+
</svg>
|
|
@@ -4,10 +4,10 @@
|
|
|
4
4
|
<text x="20" y="47" font-size="12" fill="#656d76">Microseconds to classify one command. Both are negligible for a tool-call guard.</text>
|
|
5
5
|
<text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Legacy regex guard</text>
|
|
6
6
|
<rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
|
|
7
|
-
<rect x="230" y="70" width="
|
|
8
|
-
<text x="
|
|
7
|
+
<rect x="230" y="70" width="214.5" height="22" rx="3" fill="#9aa0a6"/>
|
|
8
|
+
<text x="452.5" y="87" font-size="12" font-weight="600" fill="#1f2328">0.79 µs</text>
|
|
9
9
|
<text x="218" y="125" font-size="12" text-anchor="end" fill="#1f2328">Goal Mode analyzer</text>
|
|
10
10
|
<rect x="230" y="108" width="420" height="22" rx="3" fill="#eaeef2"/>
|
|
11
11
|
<rect x="230" y="108" width="300.0" height="22" rx="3" fill="#2da44e"/>
|
|
12
|
-
<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">
|
|
12
|
+
<text x="538.0" y="125" font-size="12" font-weight="600" fill="#1f2328">1.11 µs</text>
|
|
13
13
|
</svg>
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
<svg xmlns="http://www.w3.org/2000/svg" width="720" height="380" viewBox="0 0 720 380" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
|
|
2
2
|
<rect width="720" height="380" fill="#ffffff"/>
|
|
3
|
-
<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">
|
|
4
|
-
<text x="48" y="47" font-size="12" fill="#656d76">
|
|
3
|
+
<text x="48" y="28" font-size="17" font-weight="700" fill="#1f2328">Curated fixtures — passes its own spec</text>
|
|
4
|
+
<text x="48" y="47" font-size="12" fill="#656d76">Curated regression fixtures, not measured accuracy. See external-scorecard.svg for the real-world number.</text>
|
|
5
5
|
<line x1="48" y1="296.0" x2="700" y2="296.0" stroke="#eaeef2" stroke-width="1"/>
|
|
6
6
|
<text x="40" y="300.0" font-size="11" text-anchor="end" fill="#656d76">0%</text>
|
|
7
7
|
<line x1="48" y1="249.6" x2="700" y2="249.6" stroke="#eaeef2" stroke-width="1"/>
|
|
@@ -1,80 +1,121 @@
|
|
|
1
1
|
{
|
|
2
|
-
"
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
"bypass": {
|
|
20
|
-
"destTotal": 35,
|
|
21
|
-
"destCaught": 0,
|
|
22
|
-
"safeTotal": 0,
|
|
23
|
-
"safeFalsePos": 0
|
|
24
|
-
},
|
|
25
|
-
"remote-exec": {
|
|
26
|
-
"destTotal": 3,
|
|
27
|
-
"destCaught": 0,
|
|
28
|
-
"safeTotal": 0,
|
|
29
|
-
"safeFalsePos": 0
|
|
30
|
-
},
|
|
31
|
-
"safe": {
|
|
32
|
-
"destTotal": 0,
|
|
33
|
-
"destCaught": 0,
|
|
34
|
-
"safeTotal": 23,
|
|
35
|
-
"safeFalsePos": 5
|
|
36
|
-
}
|
|
2
|
+
"external": {
|
|
3
|
+
"source": "tldr-pages",
|
|
4
|
+
"commit": "afc5c5409f0c9a94c66980c40cd8215b5f111021",
|
|
5
|
+
"totals": {
|
|
6
|
+
"uniqueCommandsScanned": 28713,
|
|
7
|
+
"destructiveFound": 104,
|
|
8
|
+
"safeFound": 28609,
|
|
9
|
+
"safeSampled": 600
|
|
10
|
+
},
|
|
11
|
+
"sampleSize": 704,
|
|
12
|
+
"legacy": {
|
|
13
|
+
"detectionRate": 53.8,
|
|
14
|
+
"falsePositiveRate": 0.2,
|
|
15
|
+
"destCaught": 56,
|
|
16
|
+
"destTotal": 104,
|
|
17
|
+
"safeFalsePos": 1,
|
|
18
|
+
"safeTotal": 600
|
|
37
19
|
},
|
|
38
|
-
"
|
|
39
|
-
|
|
20
|
+
"current": {
|
|
21
|
+
"detectionRate": 93.3,
|
|
22
|
+
"falsePositiveRate": 0.2,
|
|
23
|
+
"destCaught": 97,
|
|
24
|
+
"destTotal": 104,
|
|
25
|
+
"safeFalsePos": 1,
|
|
26
|
+
"safeTotal": 600,
|
|
27
|
+
"misses": [
|
|
28
|
+
"rm -d path/to/directory",
|
|
29
|
+
"rm -i path/to/file1 path/to/file2 ...",
|
|
30
|
+
"rm -v $HOME/.cache/fuzzel",
|
|
31
|
+
"rm -v path/to/file1 path/to/file2 ...",
|
|
32
|
+
"rm /dev/sdXN",
|
|
33
|
+
"rm path/to/file1 path/to/file2 ...",
|
|
34
|
+
"rm torrent_id"
|
|
35
|
+
],
|
|
36
|
+
"falsePositives": [
|
|
37
|
+
"git filter-repo --path-rename path/to/folder/:"
|
|
38
|
+
]
|
|
39
|
+
}
|
|
40
40
|
},
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
"
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
"
|
|
41
|
+
"fixtures": {
|
|
42
|
+
"corpusSize": 71,
|
|
43
|
+
"destructiveCount": 48,
|
|
44
|
+
"safeCount": 23,
|
|
45
|
+
"legacy": {
|
|
46
|
+
"detectionRate": 20.833333333333336,
|
|
47
|
+
"falsePositiveRate": 21.73913043478261,
|
|
48
|
+
"destCaught": 10,
|
|
49
|
+
"destTotal": 48,
|
|
50
|
+
"safeFalsePos": 5,
|
|
51
|
+
"safeTotal": 23,
|
|
52
|
+
"families": {
|
|
53
|
+
"classic": {
|
|
54
|
+
"destTotal": 10,
|
|
55
|
+
"destCaught": 10,
|
|
56
|
+
"safeTotal": 0,
|
|
57
|
+
"safeFalsePos": 0
|
|
58
|
+
},
|
|
59
|
+
"bypass": {
|
|
60
|
+
"destTotal": 35,
|
|
61
|
+
"destCaught": 0,
|
|
62
|
+
"safeTotal": 0,
|
|
63
|
+
"safeFalsePos": 0
|
|
64
|
+
},
|
|
65
|
+
"remote-exec": {
|
|
66
|
+
"destTotal": 3,
|
|
67
|
+
"destCaught": 0,
|
|
68
|
+
"safeTotal": 0,
|
|
69
|
+
"safeFalsePos": 0
|
|
70
|
+
},
|
|
71
|
+
"safe": {
|
|
72
|
+
"destTotal": 0,
|
|
73
|
+
"destCaught": 0,
|
|
74
|
+
"safeTotal": 23,
|
|
75
|
+
"safeFalsePos": 5
|
|
76
|
+
}
|
|
54
77
|
},
|
|
55
|
-
"
|
|
56
|
-
|
|
57
|
-
"destCaught": 35,
|
|
58
|
-
"safeTotal": 0,
|
|
59
|
-
"safeFalsePos": 0
|
|
60
|
-
},
|
|
61
|
-
"remote-exec": {
|
|
62
|
-
"destTotal": 3,
|
|
63
|
-
"destCaught": 3,
|
|
64
|
-
"safeTotal": 0,
|
|
65
|
-
"safeFalsePos": 0
|
|
66
|
-
},
|
|
67
|
-
"safe": {
|
|
68
|
-
"destTotal": 0,
|
|
69
|
-
"destCaught": 0,
|
|
70
|
-
"safeTotal": 23,
|
|
71
|
-
"safeFalsePos": 0
|
|
72
|
-
}
|
|
78
|
+
"opsPerSec": 1260371,
|
|
79
|
+
"usPerCommand": 0.79
|
|
73
80
|
},
|
|
74
|
-
"
|
|
75
|
-
|
|
81
|
+
"current": {
|
|
82
|
+
"detectionRate": 100,
|
|
83
|
+
"falsePositiveRate": 0,
|
|
84
|
+
"destCaught": 48,
|
|
85
|
+
"destTotal": 48,
|
|
86
|
+
"safeFalsePos": 0,
|
|
87
|
+
"safeTotal": 23,
|
|
88
|
+
"families": {
|
|
89
|
+
"classic": {
|
|
90
|
+
"destTotal": 10,
|
|
91
|
+
"destCaught": 10,
|
|
92
|
+
"safeTotal": 0,
|
|
93
|
+
"safeFalsePos": 0
|
|
94
|
+
},
|
|
95
|
+
"bypass": {
|
|
96
|
+
"destTotal": 35,
|
|
97
|
+
"destCaught": 35,
|
|
98
|
+
"safeTotal": 0,
|
|
99
|
+
"safeFalsePos": 0
|
|
100
|
+
},
|
|
101
|
+
"remote-exec": {
|
|
102
|
+
"destTotal": 3,
|
|
103
|
+
"destCaught": 3,
|
|
104
|
+
"safeTotal": 0,
|
|
105
|
+
"safeFalsePos": 0
|
|
106
|
+
},
|
|
107
|
+
"safe": {
|
|
108
|
+
"destTotal": 0,
|
|
109
|
+
"destCaught": 0,
|
|
110
|
+
"safeTotal": 23,
|
|
111
|
+
"safeFalsePos": 0
|
|
112
|
+
}
|
|
113
|
+
},
|
|
114
|
+
"opsPerSec": 901050,
|
|
115
|
+
"usPerCommand": 1.11
|
|
116
|
+
}
|
|
76
117
|
},
|
|
77
|
-
"
|
|
118
|
+
"completionFixtures": {
|
|
78
119
|
"name": "False Completion Dataset",
|
|
79
120
|
"corpusSize": 9,
|
|
80
121
|
"requiredBaseGates": [
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
<svg xmlns="http://www.w3.org/2000/svg" width="720" height="202" viewBox="0 0 720 202" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">
|
|
2
2
|
<rect width="720" height="202" fill="#ffffff"/>
|
|
3
|
-
<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">
|
|
4
|
-
<text x="20" y="47" font-size="12" fill="#656d76">
|
|
3
|
+
<text x="20" y="28" font-size="17" font-weight="700" fill="#1f2328">Completion-enforcement fixtures</text>
|
|
4
|
+
<text x="20" y="47" font-size="12" fill="#656d76">9 hand-authored policy cases (a spec, not a survey): premature claims blocked, valid ones allowed.</text>
|
|
5
5
|
<text x="218" y="87" font-size="12" text-anchor="end" fill="#1f2328">Truthfulness score</text>
|
|
6
6
|
<rect x="230" y="70" width="420" height="22" rx="3" fill="#eaeef2"/>
|
|
7
7
|
<rect x="230" y="70" width="420.0" height="22" rx="3" fill="#2da44e"/>
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "opencode-goal-mode",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Strict Goal Mode agents, commands, and guard plugin for OpenCode.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -31,7 +31,10 @@
|
|
|
31
31
|
"test:unit": "node --test tests/state.test.mjs tests/gates.test.mjs tests/verdicts.test.mjs tests/config.test.mjs tests/persistence.test.mjs",
|
|
32
32
|
"test:agents": "node --test tests/agents.test.mjs tests/commands.test.mjs",
|
|
33
33
|
"test:install": "node --test tests/install.test.mjs",
|
|
34
|
+
"test:visual": "bun tools/visual-test/sidebar-visual.jsx",
|
|
34
35
|
"bench": "node benchmarks/run.mjs",
|
|
36
|
+
"bench:external": "node benchmarks/external.mjs",
|
|
37
|
+
"bench:corpus": "node benchmarks/build-external-corpus.mjs",
|
|
35
38
|
"bench:truthfulness": "node benchmarks/truthfulness.mjs",
|
|
36
39
|
"bench:compare": "node benchmarks/comparison.mjs",
|
|
37
40
|
"pack:check": "npm pack --dry-run",
|
|
@@ -26,6 +26,14 @@ export const DEFAULT_CONFIG = Object.freeze({
|
|
|
26
26
|
sessionTtlMs: 24 * 60 * 60 * 1000,
|
|
27
27
|
/** Emit a TUI toast when completion is blocked. */
|
|
28
28
|
toastOnBlock: true,
|
|
29
|
+
/** Emit a TUI toast when a review gate records a PASS/FAIL, and when completion unlocks. */
|
|
30
|
+
toastOnReview: true,
|
|
31
|
+
/** Show the experimental yellow goal banner in the TUI sidebar (TUI-plugin-capable OpenCode only). */
|
|
32
|
+
sidebarBanner: true,
|
|
33
|
+
/** Foreground colour (hex) for the sidebar goal banner. */
|
|
34
|
+
sidebarColor: "#FFD700",
|
|
35
|
+
/** Foreground colour (hex) for the muted "No goal" sidebar line. */
|
|
36
|
+
sidebarMutedColor: "#808080",
|
|
29
37
|
/** Phrase that, at the start of an assistant message, claims completion. */
|
|
30
38
|
completionMarker: "Goal Completed",
|
|
31
39
|
/** Replacement marker when completion is blocked. */
|
|
@@ -59,6 +67,10 @@ function fromEnv(env) {
|
|
|
59
67
|
GOAL_GUARD_MAX_SESSIONS: ["maxSessions", coerceInt],
|
|
60
68
|
GOAL_GUARD_SESSION_TTL_MS: ["sessionTtlMs", coerceInt],
|
|
61
69
|
GOAL_GUARD_TOAST_ON_BLOCK: ["toastOnBlock", coerceBool],
|
|
70
|
+
GOAL_GUARD_TOAST_ON_REVIEW: ["toastOnReview", coerceBool],
|
|
71
|
+
GOAL_GUARD_SIDEBAR_BANNER: ["sidebarBanner", coerceBool],
|
|
72
|
+
GOAL_GUARD_SIDEBAR_COLOR: ["sidebarColor", (v) => (v == null ? undefined : String(v))],
|
|
73
|
+
GOAL_GUARD_SIDEBAR_MUTED_COLOR: ["sidebarMutedColor", (v) => (v == null ? undefined : String(v))],
|
|
62
74
|
};
|
|
63
75
|
for (const [key, [field, coerce]] of Object.entries(map)) {
|
|
64
76
|
if (env[key] !== undefined) out[field] = coerce(env[key], DEFAULT_CONFIG[field]);
|
|
@@ -415,7 +415,7 @@ const DIRECT_TEST_BINS = new Set(["jest", "mocha", "vitest", "ava", "tap", "tape
|
|
|
415
415
|
const FORMATTERS = new Set(["prettier", "eslint", "black", "ruff", "gofmt", "goimports", "rustfmt", "clang-format", "autopep8", "isort", "standard", "biome", "dprint", "yapf", "stylelint"]);
|
|
416
416
|
|
|
417
417
|
const MUTATING_BINS = new Set(["mkdir", "rmdir", "touch", "ln", "mv", "cp", "tee", "install", "patch", "rsync", "rename", "chmod", "chown", "chgrp", "git-apply"]);
|
|
418
|
-
const DESTRUCTIVE_BINS = new Set(["shred", "mkfs", "fdisk", "parted", "wipefs", "sgdisk", "blkdiscard", "unlink"]);
|
|
418
|
+
const DESTRUCTIVE_BINS = new Set(["shred", "srm", "mkfs", "mkswap", "fdisk", "parted", "wipefs", "sgdisk", "blkdiscard", "unlink"]);
|
|
419
419
|
|
|
420
420
|
/**
|
|
421
421
|
* Classify a single already-split simple command (array of words).
|
|
@@ -603,8 +603,9 @@ function classifyCommand(words, redirects, depth, acc, pipelineCmds, indexInPipe
|
|
|
603
603
|
return;
|
|
604
604
|
}
|
|
605
605
|
|
|
606
|
-
// Destructive disk/file utilities.
|
|
607
|
-
|
|
606
|
+
// Destructive disk/file utilities. `mkfs.<fstype>` (mkfs.ext4, mkfs.erofs, …)
|
|
607
|
+
// is the same irreversible filesystem-format operation as bare `mkfs`.
|
|
608
|
+
if (DESTRUCTIVE_BINS.has(bin) || /^mkfs\./.test(bin)) {
|
|
608
609
|
acc.destructive = true;
|
|
609
610
|
acc.reasons.push(bin);
|
|
610
611
|
return;
|