npm - opencode-goal-mode - Versions diffs - 0.2.2 → 0.2.4 - Mend

opencode-goal-mode 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/ARCHITECTURE.md +16 -7
package/CHANGELOG.md +9 -0
package/README.md +26 -8
package/benchmarks/charts.mjs +176 -0
package/benchmarks/comparison.mjs +48 -0
package/benchmarks/completion-corpus.mjs +70 -0
package/benchmarks/corpus.mjs +92 -0
package/benchmarks/legacy-analyzer.mjs +54 -0
package/benchmarks/run.mjs +198 -0
package/benchmarks/truthfulness.mjs +64 -0
package/commands/goal-evidence-map.md +27 -0
package/docs/benchmarks/latency.svg +3 -3
package/docs/benchmarks/results.json +103 -4
package/docs/benchmarks/truthfulness-score.svg +17 -0
package/package.json +3 -1
package/plugins/goal-guard/events.js +6 -3
package/plugins/goal-guard/state.js +2 -1
package/plugins/goal-guard/summary.js +105 -1
package/plugins/goal-guard/system.js +3 -0
package/plugins/goal-guard/tools.js +35 -1
package/plugins/goal-guard/verdicts.js +38 -1
package/plugins/goal-guard.js +7 -5
package/research/README.md +1 -1
package/research/benchmarks.md +21 -0

package/plugins/goal-guard/tools.js CHANGED Viewed

@@ -12,7 +12,7 @@
  */
 import { tool } from "@opencode-ai/plugin";
-import { statusReport } from "./summary.js";
+import { evidenceMapReport, reviewerMemoryReport, statusReport } from "./summary.js";
 import { recordEvidence } from "./events.js";
 import { refreshStickyGates } from "./gates.js";
 import { createState } from "./state.js";
@@ -46,6 +46,40 @@ export function createGoalTools({ store, config, persist }) {
       },
     }),
+    goal_evidence_map: tool({
+      description:
+        "Return an authoritative read-only evidence map for this session: each acceptance " +
+        "criterion, matching recorded evidence, required reviewer gate status, coverage status, " +
+        "gaps, and next action.",
+      args: {},
+      async execute(_args, ctx) {
+        const state = store.stateFor(ctx.sessionID);
+        const report = evidenceMapReport(state, config);
+        const covered = report.criteria.filter((item) => item.status === "covered").length;
+        return {
+          title: `Evidence map: ${covered}/${report.criteria.length} criteria covered`,
+          output: JSON.stringify(report, null, 2),
+          metadata: { criteriaCount: report.criteria.length, coveredCount: covered, missingGates: report.missingGates },
+        };
+      },
+    }),
+    goal_reviewer_memory: tool({
+      description:
+        "Return durable Reviewer Memory for this session: unresolved and recently resolved " +
+        "reviewer findings carried across cycles. Read-only.",
+      args: {},
+      async execute(_args, ctx) {
+        const state = store.stateFor(ctx.sessionID);
+        const report = reviewerMemoryReport(state);
+        return {
+          title: `Reviewer Memory: ${report.open.length} open findings`,
+          output: JSON.stringify(report, null, 2),
+          metadata: { openCount: report.open.length, total: report.total },
+        };
+      },
+    }),
     goal_contract: tool({
       description:
         "Record or update the Goal Contract for this session (the explicit requirements, " +

package/plugins/goal-guard/verdicts.js CHANGED Viewed

@@ -68,17 +68,54 @@ export function latestVerdictFor(state, agent) {
   return state.latestVerdict[agent] || null;
 }
/**
 * Reduce a reviewer's free-form output to a single short finding line.
 *
 * Each line is stripped of leading Markdown decoration (whitespace, `>`, `#`,
 * `*`, `_`, `-`). Lines that are only a section heading ("Findings:",
 * "Blocking findings", "Summary", ...) or only a verdict line
 * ("Verdict: FAIL") are discarded. The first remaining line that mentions a
 * blocking-style keyword wins; otherwise the first remaining line is used;
 * otherwise a generic fallback sentence is returned.
 *
 * @param {*} text - Reviewer output; falsy values are treated as empty text,
 *   anything else is coerced with `String()`.
 * @returns {string} The chosen finding, truncated to at most 240 UTF-16 units.
 */
function summarizeFinding(text) {
  const headingRe = /^(blocking findings?|findings?|non-blocking findings?|open questions?|summary|verdict|blocking|issues?)[:\s]*$/i;
  const verdictRe = /^verdict:?\s*(pass|fail)\b/i;
  const keywordRe = /block|fail|finding|risk|missing|gap|regression/i;

  const candidates = [];
  for (const raw of String(text || "").split(/\r?\n/)) {
    // `#` is stripped too, so "## Blocking findings" is seen as a heading
    // instead of being reported as the finding itself.
    const line = raw.replace(/^[\s>#*_-]+/, "").trim();
    if (!line) continue;

    // Heading/verdict detection runs on a copy without emphasis or code
    // markers, so "**Blocking findings:**" and "**Verdict:** FAIL" are
    // recognized. The stored finding keeps the line's own text.
    const probe = line.replace(/[*_`]+/g, "").trim();
    if (!probe || headingRe.test(probe) || verdictRe.test(probe)) continue;

    candidates.push(line);
  }

  const blocking = candidates.find((line) => keywordRe.test(line));
  return String(blocking || candidates[0] || "Reviewer reported a blocking finding.").slice(0, 240);
}
/**
 * Fold one review verdict into `state.reviewerMemory`.
 *
 * - `"PASS"`: every open entry belonging to `agent` is marked resolved and
 *   stamped with `at` / `seq`; nothing is added.
 * - Any other verdict: the text is summarized with `summarizeFinding`. An
 *   identical open finding from the same agent has its `lastAt` / `lastSeq`
 *   refreshed and its `count` incremented; otherwise a new open entry with
 *   severity "blocking" is appended.
 *
 * The list is capped at 100 entries and is always mutated in place.
 *
 * @param {object} state - Session state; `state.reviewerMemory` is created or
 *   repaired here if it is not an array.
 * @param {string} agent - Reviewer agent that produced the verdict.
 * @param {string} verdict - Parsed verdict; only `"PASS"` is treated specially.
 * @param {string} at - Timestamp recorded on the affected entries.
 * @param {number} seq - Sequence number recorded on the affected entries.
 * @param {string} text - Raw reviewer output used to derive the finding.
 * @returns {void}
 */
function updateReviewerMemory(state, agent, verdict, at, seq, text) {
  const limit = 100;

  // `||=` would keep a truthy non-array value and crash on `.find` below.
  if (!Array.isArray(state.reviewerMemory)) state.reviewerMemory = [];
  const memory = state.reviewerMemory;

  if (verdict === "PASS") {
    for (const item of memory) {
      if (item.agent === agent && item.status === "open") {
        item.status = "resolved";
        item.resolvedAt = at;
        item.resolvedSeq = seq;
      }
    }
    return;
  }

  const finding = summarizeFinding(text);
  const existing = memory.find(
    (item) => item.agent === agent && item.status === "open" && item.finding === finding,
  );
  if (existing) {
    existing.lastAt = at;
    existing.lastSeq = seq;
    // Guard against entries without a numeric count, which would turn into NaN.
    existing.count = (Number.isFinite(existing.count) ? existing.count : 1) + 1;
  } else {
    memory.push({
      agent,
      finding,
      severity: "blocking",
      status: "open",
      firstAt: at,
      firstSeq: seq,
      lastAt: at,
      lastSeq: seq,
      count: 1,
    });
  }

  // Enforce the cap. Evict the oldest non-open entries first so an unresolved
  // finding is not dropped while resolved history is still being kept; only
  // when everything left is open does the oldest entry go.
  let excess = memory.length - limit;
  for (let i = 0; excess > 0 && i < memory.length; ) {
    if (memory[i].status === "open") {
      i += 1;
    } else {
      memory.splice(i, 1);
      excess -= 1;
    }
  }
  if (excess > 0) memory.splice(0, excess);
}
 /**
  * Record a review verdict for `agent`, stamping it with the next monotonic seq.
  * Increments the review-cycle count when the cycle-closing agent reports.
  */
-export function recordVerdict(store, state, agent, verdict) {
+export function recordVerdict(store, state, agent, verdict, text = "") {
   const at = store.nowIso();
   const seq = store.nextSeq();
   const entry = { agent, verdict, at, seq };
   state.verdicts.push(entry);
   if (state.verdicts.length > 200) state.verdicts.splice(0, state.verdicts.length - 200);
   state.latestVerdict[agent] = { verdict, at, seq };
+  updateReviewerMemory(state, agent, verdict, at, seq, text);
   state.lastReviewAt = at;
   state.lastReviewSeq = seq;
   state.updatedAt = at;

package/plugins/goal-guard.js CHANGED Viewed

@@ -183,16 +183,18 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
         if (tool === "task") {
           const sub = normalizedSubagent(inp);
           if (isReviewAgent(sub)) {
-            const verdict = parseVerdict(textOf(out));
+            const text = textOf(out);
+            const verdict = parseVerdict(text);
             if (verdict) {
-              recordVerdict(store, state, sub, verdict);
+              recordVerdict(store, state, sub, verdict, text);
               recordedAgent = sub;
             }
           }
         } else if (isReviewAgent(state.currentAgent)) {
-          const verdict = parseVerdict(textOf(out));
+          const text = textOf(out);
+          const verdict = parseVerdict(text);
           if (verdict) {
-            recordVerdict(store, state, state.currentAgent, verdict);
+            recordVerdict(store, state, state.currentAgent, verdict, text);
             recordedAgent = state.currentAgent;
           }
         }
@@ -231,7 +233,7 @@ export function createGuard(input = {}, options = {}, overrides = {}) {
         const state = store.stateFor(inp.sessionID);
         out.context.push(
           `Goal Guard state: ${summarizeState(state, config)}. Preserve Goal Contract, Verification Ledger, ` +
-            `Review Ledger, review cycle count, dirty state, and open findings across compaction.`,
+            `Review Ledger, Reviewer Memory, review cycle count, dirty state, and open findings across compaction.`,
         );
       } catch {
         /* ignore */

package/research/README.md CHANGED Viewed

@@ -10,7 +10,7 @@ links resolve in the npm package, but they are not runtime files.
 | [opencode-plugin-platform.md](opencode-plugin-platform.md) | Verified OpenCode plugin-runtime facts (hooks, discovery, permissions, tools) from `@opencode-ai/plugin@1.15.13` source. The pinned runtime reference the plugin is built against. |
 | [goal-mode-comparison.md](goal-mode-comparison.md) | How Goal Mode's mechanical enforcement compares to Claude Code and OpenAI Codex, with citations and honest caveats. |
 | [shell-hardening.md](shell-hardening.md) | The shell-analyzer threat model: the bypass classes the old regex guard missed and how the tokenizer closes each. |
-| [benchmarks.md](benchmarks.md) | Benchmark methodology and results (detection rate, false positives, latency). Reproduce with `npm run bench`. |
+| [benchmarks.md](benchmarks.md) | Benchmark methodology and results (shell guard accuracy plus completion truthfulness). Reproduce charts with `npm run bench` and JSON with `npm run bench:truthfulness`. |
 Every non-obvious platform claim in these documents was verified against the
 installed `@opencode-ai/plugin` type definitions and/or the `sst/opencode`

package/research/benchmarks.md CHANGED Viewed

@@ -5,6 +5,7 @@ checkout. Run:
 ```bash
 npm run bench          # detection / false-positive / latency benchmark
+npm run bench:truthfulness  # print the completion truthfulness benchmark JSON
 npm run bench:compare  # regenerate the capability-comparison chart
 ```
@@ -28,6 +29,14 @@ README embeds.
   marks the session dirty but does not block, so it is not counted here.
 - **Metrics**: detection rate (recall over destructive commands),
   false-positive rate (safe commands wrongly blocked), and per-command latency.
+- **False Completion Dataset** (`benchmarks/completion-corpus.mjs`): labeled final
+  answer scenarios for premature and valid completion claims. It checks whether
+  `completion.js` blocks missing review-cycle lines, zero cycles, stale reviews,
+  mismatched cycle counts, missing contextual gates, and allows inactive or valid
+  completions.
+- **Truthfulness Score** (`benchmarks/truthfulness.mjs`): weighted score over the
+  dataset: 65% decision accuracy (blocked vs allowed) and 35% reason accuracy for
+  blocked false-completion claims.
 ## Results
@@ -43,6 +52,16 @@ accuracy figures do not):
 | Detection — remote-exec | 0% (0/3) | 100% (3/3) |
 | Latency per command | ~2.3 µs | ~3.8 µs |
+False Completion Dataset run:
+| Metric | Goal Mode |
+| --- | --- |
+| Truthfulness score | **100.0%** |
+| Decision accuracy | **100.0%** |
+| Reason accuracy | **100.0%** |
+| False-completion block rate | **100.0%** |
+| Valid-completion allow rate | **100.0%** |
 The legacy guard catches only the *classic* family and misses every obfuscated
 and remote-execution command, while wrongly blocking 1-in-5 benign commands. The
 tokenizer catches the entire corpus with zero false positives, for an extra
@@ -61,3 +80,5 @@ hundreds of thousands of classifications per second).
 - "100% on this corpus" means 100% of the labeled set; new bypass classes that
   are discovered get added to the corpus and fixed (that is how the second-wave
   findings — `sudo -u`, `pnpm dlx`, interpreter shell-out — entered it).
+- The Truthfulness Score is corpus truthfulness for mechanical completion claims,
+  not a global claim that an LLM's prose is semantically true in every domain.