npm - opencode-goal-mode - Versions diffs - 0.2.2 → 0.3.0 - Mend

opencode-goal-mode 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/ARCHITECTURE.md +47 -7
package/CHANGELOG.md +27 -0
package/README.md +81 -23
package/benchmarks/build-external-corpus.mjs +177 -0
package/benchmarks/charts.mjs +176 -0
package/benchmarks/comparison.mjs +48 -0
package/benchmarks/completion-corpus.mjs +70 -0
package/benchmarks/corpus.mjs +92 -0
package/benchmarks/external-corpus.json +3540 -0
package/benchmarks/external.mjs +110 -0
package/benchmarks/legacy-analyzer.mjs +54 -0
package/benchmarks/run.mjs +252 -0
package/benchmarks/truthfulness.mjs +64 -0
package/commands/goal-evidence-map.md +27 -0
package/commands/goal.md +16 -1
package/docs/benchmarks/detection-by-family.svg +2 -2
package/docs/benchmarks/external-scorecard.svg +32 -0
package/docs/benchmarks/latency.svg +3 -3
package/docs/benchmarks/overall-scorecard.svg +2 -2
package/docs/benchmarks/results.json +207 -67
package/docs/benchmarks/truthfulness-score.svg +17 -0
package/package.json +5 -1
package/plugins/goal-guard/config.js +9 -0
package/plugins/goal-guard/events.js +6 -3
package/plugins/goal-guard/shell.js +4 -3
package/plugins/goal-guard/sidebar-data.js +71 -0
package/plugins/goal-guard/state.js +2 -1
package/plugins/goal-guard/summary.js +139 -1
package/plugins/goal-guard/system.js +3 -0
package/plugins/goal-guard/tools.js +43 -3
package/plugins/goal-guard/verdicts.js +38 -1
package/plugins/goal-guard.js +20 -5
package/plugins/goal-sidebar.js +141 -0
package/research/README.md +1 -1
package/research/benchmarks.md +72 -45

package/ARCHITECTURE.md CHANGED Viewed

@@ -8,12 +8,17 @@ configuration directory:
    gates). Each is a Markdown file: YAML frontmatter (mode, permissions, color,
    temperature) over a system-prompt body.
 2. **Commands** (`commands/*.md`) — slash commands (`/goal`, `/goal-contract`,
-   `/goal-review`, `/goal-status`, `/goal-repair`, `/goal-final`) that bind a
-   prompt template to an agent, some forced to run as subtasks.
+   `/goal-review`, `/goal-evidence-map`, `/goal-status`, `/goal-repair`,
+   `/goal-final`) that bind a prompt template to an agent, some forced to run as
+   subtasks.
 3. **The `goal-guard` plugin** (`plugins/goal-guard.js` + `plugins/goal-guard/`)
    — a runtime guard that enforces review discipline, blocks destructive shell
    commands, preserves state across compaction and restarts, and exposes
    first-class `goal_*` tools.
+4. **An experimental TUI companion** (`plugins/goal-sidebar.js`) — a separate
+   `{ tui }` plugin module that renders the active goal as a yellow sidebar
+   banner. It is *paired* with the server plugin purely through the on-disk state
+   snapshot (no extra IPC) and no-ops on any runtime without the slot API.
 This document focuses on the plugin, where the engineering lives.
@@ -41,13 +46,15 @@ as plugins. Each module is independently unit-tested.
 | `goal-guard/config.js` | Config resolution (defaults < env vars < plugin options). |
 | `goal-guard/state.js` | Per-session state records + the store (monotonic seq, LRU, persistence hooks). |
 | `goal-guard/persistence.js` | Atomic, debounced JSON persistence under the XDG state dir. |
-| `goal-guard/verdicts.js` | Verdict extraction (last-wins, anchored) and recording. |
+| `goal-guard/verdicts.js` | Verdict extraction (last-wins, anchored), recording, and Reviewer Memory updates. |
 | `goal-guard/gates.js` | Required-gate computation and freshness. |
 | `goal-guard/completion.js` | `Goal Completed` claim evaluation. |
 | `goal-guard/events.js` | Shared edit/verification/evidence mutators. |
-| `goal-guard/summary.js` | State summaries and structured status reports. |
+| `goal-guard/summary.js` | State summaries, status reports, and evidence-map projections. |
 | `goal-guard/system.js` | Live state block injected into the system prompt. |
-| `goal-guard/tools.js` | The `goal_status` / `goal_contract` / `goal_evidence` / `goal_reset` tools. |
+| `goal-guard/summary.js` | Status/evidence projections, the short goal label, and the sidebar view. |
+| `goal-guard/tools.js` | The `goal_status` / `goal_evidence_map` / `goal_reviewer_memory` / `goal_contract` / `goal_evidence` / `goal_reset` tools. |
+| `goal-guard/sidebar-data.js` | Pure reader that projects the persisted snapshot into the sidebar banner model. |
 | `goal-guard/logger.js` | Best-effort logging/toasts over the OpenCode client. |
 ## Hooks used
@@ -88,7 +95,12 @@ re-running verification does not.
 A session record tracks: active flag, captured goal text, the Goal Contract,
 dirty flag and reasons, changed files, review-cycle count, the last edit/review/
 verification seq and timestamps, the verdict log and per-agent latest verdict,
-recorded evidence, and completion-rejection history.
+recorded evidence, Reviewer Memory, and completion-rejection history.
+Reviewer Memory stores bounded summaries of blocking reviewer findings. A fresh
+FAIL opens or refreshes a finding for that reviewer; a fresh PASS from the same
+reviewer marks its open findings resolved. The memory is injected into status and
+system context so recurring review issues survive long sessions and restarts.
 ### Persistence
@@ -137,11 +149,13 @@ or any required gate is missing/stale.
 ## Custom tools
-The `tool` hook registers four tools (names are verbatim object keys):
+The `tool` hook registers six tools (names are verbatim object keys):
 - `goal_contract` — record the Goal Contract; activates enforcement and fixes the
   required specialist gates.
 - `goal_evidence` — log a verification command + result into the ledger.
+- `goal_evidence_map` — return the acceptance-criteria evidence map with reviewer status and next actions.
+- `goal_reviewer_memory` — return open and recently resolved reviewer findings.
 - `goal_status` — return the authoritative gate/dirty/completion status.
 - `goal_reset` — clear the session's goal state (requires `confirm: true`).
@@ -149,6 +163,25 @@ The `@opencode-ai/plugin` import they need is isolated to `tools.js` and loaded
 via a guarded dynamic import, so if the host cannot resolve it the core guard
 hooks still load.
+## TUI companion (experimental)
+`plugins/goal-sidebar.js` is a TUI plugin module — `export const tui = async (api)
+=> …` — distinct from the server plugin (`@opencode-ai/plugin` types it as a
+`{ tui }` module, mutually exclusive with `{ server }`). It registers a
+`sidebar_content` slot via `api.slots.register({ slots: { sidebar_content } })`
+and renders, in the configured colour (`#FFD700` by default), the short goal
+label plus a `passing/total gates · dirty/ready` line.
+It is *paired* with the server plugin only through the persisted state file:
+`sidebar-data.js` recomputes the same `stateBaseDir`/`projectKey` path the guard
+writes to and projects the active session via `summary.sidebarView`. That keeps
+the pure projection logic Node-testable (`tests/sidebar.test.mjs`) even though the
+JSX renderer itself can only run inside OpenCode's (Bun) TUI runtime. Everything
+in the `tui` entry is wrapped so a missing slot API, missing JSX runtime, or read
+error degrades to rendering nothing — it can never break the TUI. The server plugin
+also emits review-verdict and completion-unlock toasts (`toastOnReview`) so review
+progress is visible even without the banner.
 ## Configuration
 `config.js` merges, in increasing precedence: built-in defaults, environment
@@ -172,9 +205,16 @@ manifest of the file hashes it wrote. On upgrade it distinguishes files it owns
 - `tests/shell.test.mjs` — the analyzer against the bypass and false-positive corpora.
 - `tests/plugin.test.mjs` — hook behavior, gating, verdicts, completion, tools, isolation.
+- `tests/truthfulness-benchmark.test.mjs` — false-completion corpus and truthfulness scoring.
 - `tests/state.test.mjs` — store, seq ordering, eviction, persistence round-trips.
+- `tests/sidebar.test.mjs` — short goal label, sidebar projection, snapshot reader, new destructive bins.
+- `tests/toast.test.mjs` — review-verdict and completion-unlock toasts.
 - `tests/agents.test.mjs` / `tests/commands.test.mjs` — frontmatter and contracts.
 - `tests/install.test.mjs` — recursive copy, manifest upgrades, uninstall.
+The shell guard's headline accuracy is measured on an external, third-party
+corpus (`benchmarks/external.mjs` over `external-corpus.json`), not on the curated
+fixtures — see [research/benchmarks.md](research/benchmarks.md).
 `npm run validate` runs the tests, the structural config validator, the publish
 readiness check, and an `npm pack --dry-run`.

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,32 @@
 # Changelog
+## v0.3.0
+- Honest benchmarks: add an EXTERNAL corpus of 704 real third-party commands from
+  tldr-pages (`benchmarks/external.mjs`, `npm run bench:external`) as the headline
+  detection/false-positive measure (93.3% vs 53.8% legacy; ~0% real false
+  positives). Reframe the curated 71-command set and 9 completion cases as
+  regression *fixtures*, not measured accuracy, and reword the README/charts to
+  stop overclaiming.
+- Stronger guard: block `mkfs.<fstype>` variants, `srm`, and `mkswap`
+  (genuine destructive commands the external corpus exposed as misses).
+- Deeper TUI embedding: toast on each review verdict (PASS/FAIL) and once when the
+  last required gate clears (`toastOnReview`); `goal_status` now surfaces the goal.
+- Experimental TUI sidebar banner (`plugins/goal-sidebar.js`): the active goal in
+  shining yellow with a live gate-status line, paired with the guard via persisted
+  state. No-ops on any runtime without the TUI slot API. New options
+  `sidebarBanner` / `sidebarColor` (`GOAL_GUARD_SIDEBAR_*`).
+- Tighter `/goal` flow that seeds the Goal Contract via the `goal_contract` tool.
+## v0.2.4
+- Add Reviewer Memory for unresolved/resolved reviewer findings across cycles.
+- Add a False Completion Dataset and Benchmark Truthfulness Score for completion-claim enforcement.
+## v0.2.3
+- Add `/goal-evidence-map` to map acceptance criteria to recorded verification evidence, gaps, and next actions.
 ## v0.2.2
 - Refresh source-backed research notes for OpenCode plugin/runtime facts and the Claude Code/Codex comparison.

package/README.md CHANGED Viewed

@@ -38,26 +38,47 @@ honest caveats, in [research/goal-mode-comparison.md](research/goal-mode-compari
 - **Destructive commands are blocked by a real shell tokenizer**, not a regex.
   Claude Code's own docs call Bash argument-matching *"fragile"*.
-### Benchmark: shell-guard accuracy
+### Benchmarks (honest edition)
-The guard replaced a boundary-anchored regex classifier. On a labeled corpus of
-71 real commands (`npm run bench` from a repository checkout, reproducible — see
-[research/benchmarks.md](research/benchmarks.md)):
+The headline number is measured on commands **the analyzer was never fitted to**:
+704 real example commands from [tldr-pages](https://github.com/tldr-pages/tldr)
+(common/linux/osx), authored by hundreds of contributors who have never seen
+this guard. Ground-truth labels come from a deliberately simple, analyzer-*independent*
+rule (see [build-external-corpus.mjs](benchmarks/build-external-corpus.mjs)).
+Reproduce with `npm run bench` or `node benchmarks/external.mjs`.
-![Destructive-command detection rate by family](docs/benchmarks/detection-by-family.svg)
+![Guard accuracy on real third-party commands](docs/benchmarks/external-scorecard.svg)
-![Overall guard accuracy: detection rate vs false-positive rate](docs/benchmarks/overall-scorecard.svg)
-| | Legacy regex guard | Goal Mode analyzer |
+| On 704 real third-party commands | Legacy regex guard | Goal Mode analyzer |
 | --- | --- | --- |
-| Destructive-command detection | **20.8%** | **100%** |
-| False positives on safe commands | **21.7%** | **0%** |
-| Obfuscated bypasses caught (`$(…)`, `bash -c`, `sudo -u`, interpreters) | 0% | 100% |
-| Remote exec (`curl \| sh`) caught | 0% | 100% |
-The deeper analysis costs a few microseconds per command on this machine
-(hundreds of thousands of classifications per second) — negligible for a
-per-tool-call guard:
+| Destructive-command detection | 53.8% | **93.3%** |
+| False positives on safe commands | 0.2% | **0.2%** |
+Honest caveats, because the point of this rewrite was to stop overclaiming:
+- The ~7 remaining "misses" are almost all un-flagged single-target `rm <file>`,
+  which the guard **intentionally permits** (plain `rm` is common and the guard
+  blocks `rm -r`/`rm -f`, `$(rm …)`, `bash -c`, interpreters, etc.). Under a
+  strict every-`rm`-is-destructive labeling those count against it.
+- The single counted false positive (`git filter-repo …`) actually *is* a
+  history-rewriting command, so the real-world false-positive rate is effectively
+  zero. `node benchmarks/external.mjs --json` lists every miss and false positive
+  so you can audit the disagreements yourself.
+Two **curated fixture sets** also ship — and they are explicitly *fixtures*, not
+an unbiased benchmark. They define the patterns the analyzer must catch and guard
+against regressions, so they pass by construction; do not read the 100%/0% there
+as measured accuracy:
+- `benchmarks/corpus.mjs` — 71 destructive patterns (incl. `$(…)`, `bash -c`,
+  `sudo -u`, `/bin/rm`, `git -C … reset --hard`, `curl | sh`, interpreter
+  deletes) and their safe look-alikes (`git checkout -b`, `echo "rm -rf /"`).
+- `benchmarks/completion-corpus.mjs` — 9 completion-claim policy cases (missing
+  review-cycle line, stale review after edit, missing contextual gate, inactive
+  session, custom marker). `npm run bench:truthfulness` prints them.
+The analysis costs ~1µs per command (hundreds of thousands of classifications per
+second) — negligible for a per-tool-call guard:
 ![Per-command analysis latency](docs/benchmarks/latency.svg)
@@ -72,8 +93,8 @@ per-tool-call guard:
   discovery, verification planning, and reviews to subagents.
 - Strict review gates for prompt compliance, diff review, verification, security,
   UX, operations, data, API, performance, tests, docs, quality, and final audit.
-- Slash commands: `/goal`, `/goal-contract`, `/goal-review`, `/goal-status`,
-  `/goal-repair`, `/goal-final`.
+- Slash commands: `/goal`, `/goal-contract`, `/goal-review`,
+  `/goal-evidence-map`, `/goal-status`, `/goal-repair`, `/goal-final`.
 - The `goal-guard` plugin:
   - **Quote-aware shell analysis** that blocks destructive and remote-exec
     commands (including ones that evade naive regexes — `$(rm -rf …)`,
@@ -83,14 +104,40 @@ per-tool-call guard:
     `Goal Not Completed` with the exact missing review gates.
   - **Contextual gating**: the goal text and changed files determine which
     specialist reviewers are required.
-  - **Disk persistence**: review ledgers survive OpenCode restarts.
-  - **Custom tools**: `goal_contract`, `goal_evidence`, `goal_status`,
-    `goal_reset`.
+  - **Reviewer Memory**: blocking reviewer findings are carried across cycles,
+    surfaced in status/system context, and marked resolved by fresh PASS verdicts.
+  - **Disk persistence**: review ledgers and Reviewer Memory survive OpenCode restarts.
+  - **Custom tools**: `goal_contract`, `goal_evidence`, `goal_evidence_map`,
+    `goal_reviewer_memory`, `goal_status`, `goal_reset`.
   - **Live state injection** into the system prompt so the model always knows
     what the guard requires.
+  - **TUI toasts**: a toast on each review verdict (PASS/FAIL) and a single
+    "completion unlocked" toast the moment the last required gate clears.
+- An **experimental** companion TUI plugin (`plugins/goal-sidebar.js`) that shows
+  the active goal as a shining-yellow banner in the sidebar with a compact gate
+  status line. See [TUI integration](#tui-integration).
 - A test suite validating the analyzer, plugin hooks, state store, install
   safety, and config compatibility.
+## TUI integration
+Goal Mode is a **plugin pair**: the server-side `goal-guard` plugin owns
+enforcement and writes its state to disk, and an experimental TUI plugin
+(`plugins/goal-sidebar.js`) reads that same state to render a live banner.
+- **Sidebar goal banner (experimental).** The current goal renders in shining
+  yellow in the sidebar (`sidebar_content` slot), with a `passing/total gates ·
+  dirty/ready` status line, and updates as reviews land. It requires a
+  TUI-plugin-capable OpenCode (one exposing `api.slots.register`); on any older
+  runtime it silently no-ops, so it can never break your TUI. Set
+  `sidebarBanner: false` (or `GOAL_GUARD_SIDEBAR_BANNER=0`) to disable, or
+  `sidebarColor` to recolour it. Because the banner's renderer can only run inside
+  OpenCode's TUI runtime (it cannot be exercised by the project's Node test suite),
+  it is shipped best-effort and should be verified in your own TUI.
+- **Toasts.** Review verdicts and completion-unlock events surface as toasts
+  (`toastOnReview`), and blocked destructive commands / premature completions
+  toast as before (`toastOnBlock`).
 ## Install globally
 ```bash
@@ -152,17 +199,28 @@ Or via environment variables (`GOAL_GUARD_*`):
 | `maxSessions` / `GOAL_GUARD_MAX_SESSIONS` | `200` | Session cache size. |
 | `sessionTtlMs` / `GOAL_GUARD_SESSION_TTL_MS` | `86400000` | Idle session TTL. |
 | `toastOnBlock` / `GOAL_GUARD_TOAST_ON_BLOCK` | `true` | Toast when something is blocked. |
+| `toastOnReview` / `GOAL_GUARD_TOAST_ON_REVIEW` | `true` | Toast on each review verdict and when completion unlocks. |
+| `sidebarBanner` / `GOAL_GUARD_SIDEBAR_BANNER` | `true` | Show the experimental yellow goal banner in the TUI sidebar. |
+| `sidebarColor` / `GOAL_GUARD_SIDEBAR_COLOR` | `#FFD700` | Foreground colour of the sidebar goal banner. |
 ## Custom tools
-The plugin registers four tools the model can call directly:
+The plugin registers six tools the model can call directly:
 - `goal_contract` — record the Goal Contract (requirements, non-goals,
   acceptance criteria). Activates enforcement and fixes the required gates.
 - `goal_evidence` — record a verification command and result.
+- `goal_evidence_map` — return the acceptance-criteria evidence map with
+  reviewer status, gaps, and next actions.
+- `goal_reviewer_memory` — return unresolved and recently resolved reviewer findings.
 - `goal_status` — return the authoritative gate/dirty/completion status.
 - `goal_reset` — clear the session's goal state (requires `confirm: true`).
+Use `/goal-evidence-map` when you need a read-only matrix of each acceptance
+criterion against recorded evidence, reviewer status, gaps, and the next
+required action. The command is backed by the `goal_evidence_map` tool, so it
+uses persisted Goal Guard state rather than relying on transcript memory.
 ## Validation
 ```bash
@@ -215,7 +273,7 @@ git push --follow-tags
 ```
 For a version that is already bumped and reviewed, commit the current tree, tag
-the reviewed version (for example `v0.2.2`), push the branch and tag, then create
+the reviewed version (for example `v0.2.4`), push the branch and tag, then create
 the GitHub Release. Ensure `NPM_TOKEN` has npm publish rights before publishing
 the release.

package/benchmarks/build-external-corpus.mjs ADDED Viewed

@@ -0,0 +1,177 @@
+#!/usr/bin/env node
+/**
+ * Build an EXTERNAL, third-party-authored shell-command corpus for the guard
+ * benchmark, so the reported detection / false-positive numbers measure
+ * real-world behavior instead of a self-authored set the analyzer was tuned on.
+ *
+ * Source: the tldr-pages project (https://github.com/tldr-pages/tldr, CC-BY).
+ * Every example command in the English `common`, `linux`, and `osx` pages is a
+ * real invocation documented by hundreds of contributors who have never seen
+ * this analyzer — so the analyzer cannot have been fitted to them.
+ *
+ * Ground-truth labels come from `labelDestructive()` below: a deliberately
+ * SIMPLE, transparent rule based on the primary utility and a fixed list of
+ * irreversible operations. It is intentionally independent of the analyzer's
+ * own classification logic. It is not perfect (no automatic labeler is) — the
+ * benchmark reports raw agreement and discloses the labeler so disagreements
+ * are auditable rather than hidden.
+ *
+ * Usage:
+ *   node benchmarks/build-external-corpus.mjs --tldr /path/to/tldr [--limit 600]
+ *   TLDR_DIR=/path/to/tldr node benchmarks/build-external-corpus.mjs
+ *
+ * Writes benchmarks/external-corpus.json (committed, so `npm run bench` is
+ * reproducible without a tldr checkout). Re-run this to regenerate it.
+ */
+import { readFileSync, readdirSync, writeFileSync, existsSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { parseArgs } from "node:util";
+const { values } = parseArgs({
+  options: {
+    tldr: { type: "string" }, // path to a tldr-pages checkout
+    limit: { type: "string", default: "600" }, // parsed to an integer below; caps the sampled SAFE commands
+  },
+});
+const here = dirname(fileURLToPath(import.meta.url)); // directory containing this script
+const tldrDir = values.tldr || process.env.TLDR_DIR; // --tldr wins over the TLDR_DIR environment variable
+const safeLimit = Math.max(50, Number.parseInt(values.limit, 10) || 600); // unparsable or 0 → 600; never below 50
+if (!tldrDir || !existsSync(tldrDir)) {
+  console.error(
+    "Need a tldr-pages checkout. Pass --tldr <dir> or set TLDR_DIR.\n" +
+      "  git clone --depth 1 https://github.com/tldr-pages/tldr.git",
+  );
+  process.exit(1); // nothing can be built without the checkout
+}
+/** Pinned provenance for reproducibility — resolves a symbolic HEAD to its SHA. */
+function tldrCommit() {
+  try {
+    const head = readFileSync(join(tldrDir, ".git", "HEAD"), "utf8").trim();
+    const ref = head.match(/^ref:\s*(.+)$/);
+    if (!ref) return head;
+    return readFileSync(join(tldrDir, ".git", ref[1]), "utf8").trim();
+  } catch {
+    return "unknown";
+  }
+}
+/**
+ * Turn a tldr example line into a real, literal shell command:
+ *  - `{{placeholder}}` → its inner text (a realistic argument).
+ *  - `[-f|--force]` / `[-r|--recursive]` alternative-flag notation → the first
+ *    form (`-f`, `-r`), so the result is a command a shell would actually accept
+ *    rather than tldr documentation syntax.
+ *
+ * Details visible in the implementation:
+ *  - A placeholder whose inner text is empty or whitespace-only becomes the
+ *    literal word `arg`.
+ *  - Placeholders are substituted first, so bracket notation that was wrapped
+ *    inside `{{…}}` is still reduced by the second pass.
+ *  - Only bracket groups that contain a `|` are rewritten; other `[...]` text is
+ *    left untouched.
+ *
+ * @param {string} cmd  One tldr example command (without the enclosing backticks).
+ * @returns {string} The command with placeholders and alternative flags resolved.
+ */
+function fillPlaceholders(cmd) {
+  return cmd
+    .replace(/\{\{(.*?)\}\}/g, (_, inner) => String(inner).trim() || "arg")
+    .replace(/\[([^\]|]+)\|[^\]]+\]/g, (_, first) => String(first).trim());
+}
+/**
+ * Independent, transparent destructive-intent labeler (NOT the analyzer).
+ *
+ * Rules, in the order they are applied:
+ *  1. A `curl` / `wget` / `fetch` invocation piped into a shell (optionally via
+ *     `sudo`) is destructive.
+ *  2. ONE leading wrapper (`sudo`, `time`, `nice`, `ionice`, `nohup`, `env`) and
+ *     the dash-prefixed flags that follow it are stripped; nested wrappers are
+ *     not unwrapped further.
+ *  3. The primary utility is the first word, with an optional absolute directory
+ *     prefix ignored (so `/bin/rm` is treated as `rm`).
+ *  4. Any `mkfs.<fstype>` utility is destructive.
+ *  5. Utilities in DESTRUCTIVE_BINS are destructive regardless of arguments,
+ *     except `dd` (only when writing to `of=/dev/…`) and `rmdir` (never).
+ *  6. For `git`, a fixed list of history- or worktree-destroying forms is matched
+ *     against the remaining arguments.
+ * Everything else is labeled safe.
+ *
+ * NOTE(review): rule 5 labels read-only uses such as listing partitions with
+ * `fdisk` or `parted` as destructive too — presumably an accepted cost of
+ * keeping the rule simple; confirm before relying on per-command labels.
+ *
+ * @param {string} cmd  A literal shell command.
+ * @returns {boolean} `true` when the command is labeled destructive.
+ */
+function labelDestructive(cmd) {
+  const c = cmd.trim();
+  // Remote code execution: fetch piped into a shell.
+  if (/\b(curl|wget|fetch)\b[^|]*\|\s*(sudo\s+)?(sh|bash|zsh|dash|ksh)\b/.test(c)) return true;
+  // Strip a leading wrapper so `sudo rm` / `time rm` resolve to their target.
+  const stripped = c.replace(/^(sudo|time|nice|ionice|nohup|env)\s+(-\S+\s+)*/, "");
+  const m = stripped.match(/^(\/[^\s]*\/)?([a-zA-Z0-9_.-]+)\b(.*)$/);
+  if (!m) return false;
+  const bin = m[2]; // utility name without any directory prefix
+  const rest = m[3] || ""; // everything after the utility name
+  const DESTRUCTIVE_BINS = new Set([
+    "rm", "rmdir", "shred", "srm", "dd", "mkfs", "fdisk", "parted",
+    "wipefs", "mkswap", "blkdiscard", "sgdisk", "unlink",
+  ]);
+  if (/^mkfs\./.test(bin)) return true;
+  if (DESTRUCTIVE_BINS.has(bin)) {
+    if (bin === "dd") return /\bof=\/dev\//.test(rest);
+    if (bin === "rmdir") return false; // only removes empty dirs
+    return true;
+  }
+  if (bin === "git") {
+    if (/\breset\s+--hard\b/.test(rest)) return true;
+    if (/\bclean\b.*\s-\S*f/.test(rest)) return true;
+    if (/\bpush\b.*(--force\b|\s-f\b)/.test(rest)) return true;
+    if (/\bbranch\b.*\s-D\b/.test(rest)) return true;
+    if (/\breflog\s+expire\b/.test(rest)) return true;
+    if (/\bgc\b.*--prune/.test(rest)) return true;
+    if (/\bfilter-branch\b/.test(rest)) return true;
+  }
+  return false;
+}
+const dirs = ["common", "linux", "osx"]
+  .map((d) => join(tldrDir, "pages", d))
+  .filter((d) => existsSync(d));
+const seen = new Set();
+const destructive = [];
+const safe = [];
+for (const dir of dirs) {
+  const family = dir.split("/").slice(-1)[0];
+  for (const file of readdirSync(dir)) {
+    if (!file.endsWith(".md")) continue;
+    const page = file.replace(/\.md$/, "");
+    const text = readFileSync(join(dir, file), "utf8");
+    for (const line of text.split("\n")) {
+      const trimmed = line.trim();
+      // tldr example commands are fenced in single backticks on their own line.
+      if (!trimmed.startsWith("`") || !trimmed.endsWith("`") || trimmed.length < 4) continue;
+      const raw = fillPlaceholders(trimmed.slice(1, -1)).trim();
+      if (!raw || raw.length > 240) continue;
+      if (!/^[a-zA-Z/.~$]/.test(raw)) continue; // must start like a command
+      if (seen.has(raw)) continue;
+      seen.add(raw);
+      const entry = { cmd: raw, page, family };
+      if (labelDestructive(raw)) destructive.push(entry);
+      else safe.push(entry);
+    }
+  }
+}
+/**
+ * Deterministic evenly-spaced stride sample (no RNG, so the build is stable).
+ *
+ * When the list already fits within `target`, a shallow copy of the whole list
+ * is returned. Otherwise exactly `target` items are taken at indices
+ * `floor(i * list.length / target)` for `i = 0 … target - 1`, preserving the
+ * input order.
+ *
+ * @param {Array} list     Items to sample from (not modified).
+ * @param {number} target  Maximum number of items to return.
+ * @returns {Array} A new array with at most `target` items.
+ */
+function stride(list, target) {
+  if (list.length <= target) return list.slice();
+  const step = list.length / target;
+  const out = [];
+  for (let i = 0; i < target; i += 1) out.push(list[Math.floor(i * step)]);
+  return out;
+}
+// Enrich ALL destructive examples (they are rare in real docs) and stride-sample
+// safe ones up to the limit. This is disclosed in the report so the imbalance is
+// not mistaken for the natural base rate.
+destructive.sort((a, b) => a.cmd.localeCompare(b.cmd));
+safe.sort((a, b) => a.cmd.localeCompare(b.cmd));
+const sampledSafe = stride(safe, safeLimit);
+const corpus = {
+  source: "tldr-pages",
+  url: "https://github.com/tldr-pages/tldr",
+  license: "CC-BY-4.0",
+  commit: tldrCommit(),
+  pages: dirs.map((d) => d.split("/").slice(-2).join("/")),
+  labeler: "benchmarks/build-external-corpus.mjs labelDestructive() — independent of the analyzer",
+  totals: {
+    uniqueCommandsScanned: seen.size,
+    destructiveFound: destructive.length,
+    safeFound: safe.length,
+    safeSampled: sampledSafe.length,
+  },
+  entries: [...destructive, ...sampledSafe],
+};
+const outPath = join(here, "external-corpus.json");
+writeFileSync(outPath, JSON.stringify(corpus, null, 2));
+console.log(
+  `Wrote ${corpus.entries.length} external commands ` +
+    `(${destructive.length} destructive + ${sampledSafe.length}/${safe.length} safe sampled) ` +
+    `from ${seen.size} unique tldr examples @ ${corpus.commit.slice(0, 12)} → ${outPath}`,
+);

package/benchmarks/charts.mjs ADDED Viewed

@@ -0,0 +1,176 @@
+/**
+ * Minimal dependency-free SVG chart generator for the benchmark report.
+ * Produces grouped bar charts that GitHub renders inline in the README.
+ */
+const PALETTE = {
+  legacy: "#9aa0a6", // not referenced in the visible code; presumably the legacy-guard series colour chosen by callers — confirm
+  current: "#2da44e", // default bar fill in horizontalBarChart
+  axis: "#d0d7de", // baseline under the grouped bars
+  text: "#1f2328", // titles, labels, and value text
+  subtext: "#656d76", // subtitles and y-axis tick labels
+  grid: "#eaeef2", // gridlines and the horizontal bar track
+  bg: "#ffffff", // full-canvas background rect
+};
+function esc(s) { // Stringifies `s` and escapes &, <, > for use as SVG text content (quotes are NOT escaped).
+  return String(s).replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;"); // & first, so the entities added next are not re-escaped
+}
+/**
+ * Grouped vertical bar chart.
+ * @param {object} opts
+ * @param {string} opts.title
+ * @param {string} opts.subtitle
+ * @param {string[]} opts.groups        x-axis group labels
+ * @param {Array<{name:string,color:string,values:number[]}>} opts.series
+ * @param {string} [opts.unit]          appended to value labels (e.g. "%")
+ * @param {number} [opts.max]           y-axis max (default 100)
+ */
+export function groupedBarChart({ title, subtitle, groups, series, unit = "%", max = 100 }) {
+  const W = 720;
+  const H = 380;
+  const padL = 48;
+  const padR = 20;
+  const padT = 64;
+  const padB = 84;
+  const plotW = W - padL - padR;
+  const plotH = H - padT - padB;
+  const groupW = plotW / groups.length;
+  const barGap = 8;
+  const barW = (groupW - barGap * (series.length + 1)) / series.length;
+  const parts = [];
+  parts.push(`<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}" viewBox="0 0 ${W} ${H}" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">`);
+  parts.push(`<rect width="${W}" height="${H}" fill="${PALETTE.bg}"/>`);
+  parts.push(`<text x="${padL}" y="28" font-size="17" font-weight="700" fill="${PALETTE.text}">${esc(title)}</text>`);
+  if (subtitle) parts.push(`<text x="${padL}" y="47" font-size="12" fill="${PALETTE.subtext}">${esc(subtitle)}</text>`);
+  // Gridlines + y labels.
+  const ticks = 5;
+  for (let t = 0; t <= ticks; t += 1) {
+    const v = (max / ticks) * t;
+    const y = padT + plotH - (v / max) * plotH;
+    parts.push(`<line x1="${padL}" y1="${y.toFixed(1)}" x2="${W - padR}" y2="${y.toFixed(1)}" stroke="${PALETTE.grid}" stroke-width="1"/>`);
+    parts.push(`<text x="${padL - 8}" y="${(y + 4).toFixed(1)}" font-size="11" text-anchor="end" fill="${PALETTE.subtext}">${v}${unit}</text>`);
+  }
+  // Bars.
+  groups.forEach((g, gi) => {
+    const gx = padL + gi * groupW;
+    series.forEach((s, si) => {
+      const v = Math.max(0, Math.min(max, s.values[gi] ?? 0));
+      const bh = (v / max) * plotH;
+      const x = gx + barGap + si * (barW + barGap);
+      const y = padT + plotH - bh;
+      parts.push(`<rect x="${x.toFixed(1)}" y="${y.toFixed(1)}" width="${barW.toFixed(1)}" height="${bh.toFixed(1)}" rx="3" fill="${s.color}"/>`);
+      parts.push(`<text x="${(x + barW / 2).toFixed(1)}" y="${(y - 5).toFixed(1)}" font-size="11" font-weight="600" text-anchor="middle" fill="${PALETTE.text}">${Math.round(v)}${unit}</text>`);
+    });
+    parts.push(`<text x="${(gx + groupW / 2).toFixed(1)}" y="${(padT + plotH + 18).toFixed(1)}" font-size="11" text-anchor="middle" fill="${PALETTE.text}">${esc(g)}</text>`);
+  });
+  // Axis line.
+  parts.push(`<line x1="${padL}" y1="${padT + plotH}" x2="${W - padR}" y2="${padT + plotH}" stroke="${PALETTE.axis}" stroke-width="1.5"/>`);
+  // Legend.
+  const legendY = H - 26;
+  let lx = padL;
+  series.forEach((s) => {
+    parts.push(`<rect x="${lx}" y="${legendY - 10}" width="12" height="12" rx="2" fill="${s.color}"/>`);
+    parts.push(`<text x="${lx + 18}" y="${legendY}" font-size="12" fill="${PALETTE.text}">${esc(s.name)}</text>`);
+    lx += 24 + s.name.length * 7.2;
+  });
+  parts.push("</svg>");
+  return parts.join("\n");
+}
+/**
+ * Categorical capability matrix: rows = capabilities, columns = platforms,
+ * each cell colored by enforcement level. Honest, citable comparison.
+ *
+ * The SVG is 760px wide; its height grows with the number of rows. A cell value
+ * that is not one of the `levels` keys is drawn as "None". The legend always
+ * lists all four levels, whether or not they occur in `rows`.
+ *
+ * @param {object} opts
+ * @param {string} opts.title
+ * @param {string} opts.subtitle   omitted from the output when falsy
+ * @param {string[]} opts.columns
+ * @param {Array<{capability:string, cells:string[]}>} opts.rows  cell ∈ levels keys
+ * @returns {string} The SVG document as a newline-joined string.
+ */
+export function capabilityMatrix({ title, subtitle, columns, rows }) {
+  const levels = {
+    Enforced: { fill: "#2da44e", text: "#ffffff", label: "Enforced" },
+    Partial: { fill: "#d4a72c", text: "#1f2328", label: "Partial" },
+    "Prompt-only": { fill: "#dbe9d5", text: "#1f2328", label: "Prompt-only" },
+    None: { fill: "#eaeef2", text: "#656d76", label: "None" },
+  };
+  const W = 760;
+  const padL = 300;
+  const padT = 70;
+  const rowH = 38;
+  const colW = (W - padL - 16) / columns.length;
+  const legendH = 30;
+  const H = padT + rows.length * rowH + legendH + 16;
+  const parts = [];
+  parts.push(`<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}" viewBox="0 0 ${W} ${H}" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">`);
+  parts.push(`<rect width="${W}" height="${H}" fill="${PALETTE.bg}"/>`);
+  parts.push(`<text x="20" y="28" font-size="17" font-weight="700" fill="${PALETTE.text}">${esc(title)}</text>`);
+  if (subtitle) parts.push(`<text x="20" y="47" font-size="12" fill="${PALETTE.subtext}">${esc(subtitle)}</text>`);
+  // Column headers.
+  columns.forEach((c, ci) => {
+    const x = padL + ci * colW + colW / 2;
+    parts.push(`<text x="${x.toFixed(1)}" y="${padT - 8}" font-size="12.5" font-weight="700" text-anchor="middle" fill="${PALETTE.text}">${esc(c)}</text>`);
+  });
+  rows.forEach((r, ri) => {
+    const y = padT + ri * rowH;
+    parts.push(`<text x="${padL - 14}" y="${y + rowH / 2 + 4}" font-size="12" text-anchor="end" fill="${PALETTE.text}">${esc(r.capability)}</text>`);
+    r.cells.forEach((cell, ci) => {
+      const lv = levels[cell] || levels.None; // unknown level names fall back to "None"
+      const x = padL + ci * colW + 4;
+      parts.push(`<rect x="${x.toFixed(1)}" y="${y + 4}" width="${(colW - 8).toFixed(1)}" height="${rowH - 8}" rx="4" fill="${lv.fill}"/>`);
+      parts.push(`<text x="${(x + (colW - 8) / 2).toFixed(1)}" y="${y + rowH / 2 + 4}" font-size="11" font-weight="600" text-anchor="middle" fill="${lv.text}">${lv.label}</text>`);
+    });
+  });
+  // Legend.
+  const ly = padT + rows.length * rowH + 22;
+  let lx = padL - 14;
+  for (const key of ["Enforced", "Partial", "Prompt-only", "None"]) {
+    const lv = levels[key];
+    parts.push(`<rect x="${lx}" y="${ly - 11}" width="12" height="12" rx="2" fill="${lv.fill}"/>`);
+    parts.push(`<text x="${lx + 17}" y="${ly}" font-size="11.5" fill="${PALETTE.text}">${esc(key)}</text>`);
+    lx += 30 + key.length * 7; // swatch + estimated text width (7px per character)
+  }
+  parts.push("</svg>");
+  return parts.join("\n");
+}
+/** Horizontal bar chart for a single-series scorecard with long labels. */
+export function horizontalBarChart({ title, subtitle, rows, unit = "", max }) {
+  const W = 720;
+  const rowH = 38;
+  const padT = 64;
+  const padB = 24;
+  const padL = 230;
+  const padR = 70;
+  const H = padT + rows.length * rowH + padB;
+  const plotW = W - padL - padR;
+  const top = Math.max(max ?? Math.max(...rows.map((r) => r.value)) * 1.15, 1);
+  const parts = [];
+  parts.push(`<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}" viewBox="0 0 ${W} ${H}" font-family="-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif">`);
+  parts.push(`<rect width="${W}" height="${H}" fill="${PALETTE.bg}"/>`);
+  parts.push(`<text x="20" y="28" font-size="17" font-weight="700" fill="${PALETTE.text}">${esc(title)}</text>`);
+  if (subtitle) parts.push(`<text x="20" y="47" font-size="12" fill="${PALETTE.subtext}">${esc(subtitle)}</text>`);
+  rows.forEach((r, i) => {
+    const y = padT + i * rowH;
+    const bw = (Math.min(r.value, top) / top) * plotW;
+    parts.push(`<text x="${padL - 12}" y="${y + rowH / 2 + 4}" font-size="12" text-anchor="end" fill="${PALETTE.text}">${esc(r.label)}</text>`);
+    parts.push(`<rect x="${padL}" y="${y + 6}" width="${plotW}" height="${rowH - 16}" rx="3" fill="${PALETTE.grid}"/>`);
+    parts.push(`<rect x="${padL}" y="${y + 6}" width="${bw.toFixed(1)}" height="${rowH - 16}" rx="3" fill="${r.color || PALETTE.current}"/>`);
+    parts.push(`<text x="${(padL + bw + 8).toFixed(1)}" y="${y + rowH / 2 + 4}" font-size="12" font-weight="600" fill="${PALETTE.text}">${r.display ?? r.value + unit}</text>`);
+  });
+  parts.push("</svg>");
+  return parts.join("\n");
+}