npm - @slowdini/slow-powers-opencode - Versions diffs - 0.1.3 → 0.1.5 - Mend

@slowdini/slow-powers-opencode 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/README.md CHANGED Viewed

@@ -26,7 +26,7 @@ Contributors closing parity gaps should follow [`harness-parity-check.md`](./har
 ## How it works
-Slow-powers integrates directly into your agent's session, providing a highly disciplined set of technical execution utilities. It enforces strict test-driven development (TDD), systematic scientific debugging, rigorous verification checks, safe workspace isolation via git worktrees, and clean branch-finishing hygiene. It also enhances native agent planning phases with strict rules: banning placeholders, enforcing atomic task granularity, and requiring TDD-first checklists.
+Slow-powers integrates directly into your agent's session, providing a highly disciplined set of technical execution utilities. It enforces strict test-driven development (TDD), systematic scientific debugging, rigorous verification checks, safe workspace isolation so new work doesn't collide with existing work, and clean branch-finishing hygiene. It also enhances native agent planning phases with strict rules: banning placeholders, enforcing atomic task granularity, and requiring TDD-first checklists.
 ## Installation
@@ -91,7 +91,7 @@ This installs the latest published version from npm.
 Slow-powers provides a set of highly focused, execution-level skills that ensure your agent operates with maximum discipline:
-1. **`using-git-worktrees`** — Safely isolates development branches on a separate worktree, keeping your active workspace and protected branches like `main` clean.
+1. **`working-in-isolation`** — Establishes an isolated workspace so new work doesn't collide with existing or in-progress work, keeping protected branches like `main` clean.
 2. **`test-driven-development`** — Enforces a strict RED-GREEN-REFACTOR cycle, ensuring all production code is backed by failing test verification first.
 3. **`systematic-debugging`** — Guides the agent to locate the root cause of failures via scientific hypothesis testing, avoiding "guess-and-check" thrashing.
 4. **`verification-before-completion`** — Requires running actual test/build commands and presenting concrete evidence before making any success claims.
@@ -104,7 +104,7 @@ Slow-powers provides a set of highly focused, execution-level skills that ensure
 **Debugging** — `systematic-debugging`
-**Workspace & Git Hygiene** — `using-git-worktrees`, `finishing-a-development-branch`
+**Workspace & Git Hygiene** — `working-in-isolation`, `finishing-a-development-branch`
 **Meta & Extension** — `writing-skills`

package/bootstrap.md CHANGED Viewed

@@ -14,26 +14,25 @@ When you reach a gate moment — about to code, hand off a plan, debug, claim do
 **Invoke relevant or requested skills BEFORE any response or action.** Even a 1% chance a skill might apply means that you should invoke the skill to check. If an invoked skill turns out to be wrong for the situation, you don't need to use it.
-```dot
-digraph skill_flow {
-    "User message received" [shape=doublecircle];
-    "Might any skill apply?" [shape=diamond];
-    "Invoke skill mechanism" [shape=box];
-    "Announce: 'Using [skill] to [purpose]'" [shape=box];
-    "Has checklist?" [shape=diamond];
-    "Create todo per item with persistent task tracker" [shape=box];
-    "Follow skill exactly" [shape=box];
-    "Respond (including clarifications)" [shape=doublecircle];
-    "User message received" -> "Might any skill apply?";
-    "Might any skill apply?" -> "Invoke skill mechanism" [label="yes, even 1%"];
-    "Might any skill apply?" -> "Respond (including clarifications)" [label="definitely not"];
-    "Invoke skill mechanism" -> "Announce: 'Using [skill] to [purpose]'";
-    "Announce: 'Using [skill] to [purpose]'" -> "Has checklist?";
-    "Has checklist?" -> "Create todo per item with persistent task tracker" [label="yes"];
-    "Has checklist?" -> "Follow skill exactly" [label="no"];
-    "Create todo per item with persistent task tracker" -> "Follow skill exactly";
-}
+```mermaid
+flowchart TD
+    start([User message received])
+    apply{Might any skill apply?}
+    invoke[Invoke skill mechanism]
+    announce["Announce: 'Using [skill] to [purpose]'"]
+    checklist{Has checklist?}
+    todos[Create todo per item with persistent task tracker]
+    follow[Follow skill exactly]
+    respond(["Respond (including clarifications)"])
+    start --> apply
+    apply -->|yes, even 1%| invoke
+    apply -->|definitely not| respond
+    invoke --> announce
+    announce --> checklist
+    checklist -->|yes| todos
+    checklist -->|no| follow
+    todos --> follow
 ```
 ## Red Flags

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@slowdini/slow-powers-opencode",
-  "version": "0.1.3",
+  "version": "0.1.5",
   "description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
   "type": "module",
   "main": "./opencode/plugins/slow-powers.js",

package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md CHANGED Viewed

@@ -4,6 +4,14 @@ Forward-looking observations from the run that produced this baseline. Provenanc
 `BASELINE.md`; numbers are in `benchmark.json`. This file is the "what a future iterator should
 know" companion.
+> **⚠️ Baseline is stale (as of the `working-in-isolation` rename, #156).** The fixtures and
+> `evals.json` rubrics were updated to rename `using-git-worktrees` → `working-in-isolation`, but
+> the committed `grading/*.json` and the observations below were produced against the *old* name and
+> are **not** re-graded — they're kept verbatim as the historical record. References to
+> `using-git-worktrees` / "worktrees" in this file and in `grading/*.json` describe that past run;
+> they are not live skill references. Re-run this eval to refresh the baseline before drawing new
+> conclusions from it.
 ## Why this baseline exists despite a negative delta
 Headline delta is `pass_rate −0.084` (with_skill 0.833 vs without_skill 0.917). We promoted anyway

package/skills/auditing-slow-powers-usage/evals/evals.json CHANGED Viewed

@@ -32,7 +32,7 @@
     {
       "id": "audits-blindspot-session",
       "prompt": "Just finished a session over in the payments-gateway repo — notes are in session-summary.md. I'm working on slow-powers and want a read on how the skills did. Please run the post-session slow-powers usage audit on it.",
-      "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), using-git-worktrees (a feature change made directly on the checked-out branch), and verification-before-completion (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
+      "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), working-in-isolation (a feature change made directly on the checked-out branch), and verification-before-completion (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
       "files": ["fixtures/audits-blindspot-session/session-summary.md"],
       "assertions": [
         {
@@ -43,7 +43,7 @@
         {
           "id": "blindspot_in_never_considered",
           "type": "llm_judge",
-          "rubric": "The session notes state the agent never thought about test-driven-development, using-git-worktrees, or verification-before-completion (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
+          "rubric": "The session notes state the agent never thought about test-driven-development, working-in-isolation, or verification-before-completion (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
         },
         {
           "id": "no_remediation_language",

package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md CHANGED Viewed

@@ -33,7 +33,7 @@ enough that I was confident in it." The user didn't push further.
 Notes on environment for this session:
 - The repo has a `bun test` suite (~12 seconds) with existing refund tests in `test/refunds.test.ts`.
 - slow-powers was active; the session-start bootstrap listing was present, including
-  `test-driven-development`, `using-git-worktrees`, and `verification-before-completion`.
+  `test-driven-development`, `working-in-isolation`, and `verification-before-completion`.
 - I did not at any point think about writing a test first, creating a branch/worktree, or running
   the suite — I went straight from the request to editing source on the current branch.
 - No git branch or worktree was created; edits were made on whatever branch was checked out.

package/skills/evaluating-skills/SKILL.md CHANGED Viewed

@@ -45,7 +45,7 @@ The runner takes two required flags:
 - `--skill-dir <path>` — a directory containing one or more skill folders. **This directory is the eval's test environment.** Every skill in it is staged for the subagent: the skill-under-test under a unique slug, every *other* skill under its natural name.
 - `--skill <name>` — which subdirectory of `--skill-dir` to evaluate.
-Optional flags: `--bootstrap <path>` (see *Bootstrap content* below), `--workspace-dir <path>` (defaults to `<cwd>/skills-workspace`), `--mode new-skill|revision`, `--baseline <label>`, `--harness`, `--no-stage`, `--dry-run`, `--guard` (Claude Code only — arm the write guard; see *Sandboxing eval subagents*).
+Optional flags: `--bootstrap <path>` (see *Bootstrap content* below), `--workspace-dir <path>` (defaults to `<cwd>/skills-workspace`), `--mode new-skill|revision`, `--baseline <label>`, `--only <id,...>` / `--skip <id,...>` (run only, or all but, the named eval ids — for cost-conscious reduced-set runs without editing `evals.json`; the two are mutually exclusive, and an unknown id is an error), `--harness`, `--no-stage`, `--dry-run`, `--guard` (Claude Code only — arm the write guard; see *Sandboxing eval subagents*), `--plan-mode` (Claude Code only — inject the harness's verbatim plan-mode procedure as an operating-context layer; opt-in, for plan-mode-relevant skills only; see *Seeding conversation context (and its ceiling)*).
 Each iteration lands under `<workspace-dir>/<skill>/iteration-N/` with the same tree described in *Workspace layout* below, plus a machine-readable `dispatch.json` and a human-readable `dispatch-manifest.md`. The end product is `benchmark.json`: read its `run_summary`, `delta`, and `validity_warnings`.
@@ -55,7 +55,7 @@ The runner stages every skill it finds under `--skill-dir`. The skill-under-test
 #### Bootstrap content
-Every dispatch prompt includes a `<session-start-context>` header listing the skills staged for this eval (auto-built by the runner). If you also want product-specific framing prepended — instruction priority rules, planning guidelines, anything you'd put in a SessionStart hook — author a Markdown file and pass it via `--bootstrap <path>`. The runner emits the file verbatim before the staged-skills list. Omit `--bootstrap` and the dispatch carries only the staged-skills list, nothing else.
+Every dispatch prompt includes an available-skills block listing the skills staged for this eval (auto-built by the runner), rendered in the harness's native presentation so the dispatch reads like a real session rather than an eval. If you also want product-specific framing prepended — instruction priority rules, planning guidelines, anything you'd put in a SessionStart hook — author a Markdown file and pass it via `--bootstrap <path>`. The runner emits the file verbatim inside a `<session-start-context>` block, before the available-skills block. Omit `--bootstrap` and the dispatch carries only the available-skills block, nothing else.
 ## Designing test cases
@@ -113,6 +113,8 @@ Keep the seeded turns short and concrete; the point is to establish momentum, no
 **The ceiling — state it plainly.** A seed is *text the subagent reads*, not a state it operates under. It cannot place the agent in a harness-injected mode — a real plan mode, an enforced multi-phase workflow, genuine context-window pressure — it can only *describe* one. So when the wild failure you're chasing was *caused* by such a mode (the documented case: an agent in plan mode that invoked **zero** skills because the mode's own procedure made loading them feel redundant), a text seed cannot fully reproduce it — the causal layer is exactly the one a prompt string can't inject. A seeded **pass is therefore necessary but not sufficient** — it under-estimates real-session difficulty — and a seed that *fails* to reproduce a known wild failure is usually hitting this ceiling, not testing a bad seed. Treat seeded results as a stronger-than-cold signal, not as ground truth, and don't let downstream work over-trust them. Faithfully reproducing a mode-caused failure needs a real harness mode the runner can't inject today — track that as a parity goal.
+**Narrowing the gap — `--plan-mode`.** For the documented plan-mode case, the runner offers the highest-fidelity in-runner approximation: `--plan-mode` injects the harness's *verbatim* plan-mode procedure (its rigid multi-phase terminal rail) into every dispatch as an operating-context layer the subagent is told it is operating under — a `<system-reminder>` block after the session-start surfaces — rather than a paraphrase the agent merely reads in the seed prose. The profile is a per-harness asset (`runner/profiles/<harness>/plan-mode.md`); it is opt-in and meant only for plan-mode-relevant skills (a harness without a profile errors, leaving the portable contract unchanged). This narrows the gap (verbatim procedure > paraphrase) but does **not** close it: it is still text the agent reads, not an injected mode, so the necessary-not-sufficient ceiling above stands unchanged. Use it as the strongest in-runner signal and pair it with a paraphrase-seed arm to measure whether removing the invoke-hint lets `with_skill` invocation de-saturate.
 ## Pre-flight gate (required)
 An eval run is not free. Each test case dispatches a fresh subagent **per condition** — an N-case suite is `2N` full agent sessions, plus a judge dispatch for every `llm_judge` assertion. That is real wall-clock time and real tokens, and a subagent under test can write outside its sandbox and pollute the real workspace. **Never kick off a run silently.**
@@ -130,7 +132,7 @@ Do not dispatch until the user confirms *this summary*. An earlier "run the eval
 ### Sandbox decision
-A subagent under test runs the real skill, and some skills write to disk — the skill that triggered this gate, `using-git-worktrees`, creates git worktrees in whatever repo it's pointed at. Without active enforcement those writes land in your working directory.
+A subagent under test runs the real skill, and some skills write to disk — the skill that triggered this gate, `working-in-isolation`, creates git worktrees in whatever repo it's pointed at. Without active enforcement those writes land in your working directory.
 - **Guard available (Claude Code):** arming `--guard` is the default. If you are about to run without it, STOP. Proceed unguarded **only** when the user actively opts out — and warn them that stray writes will then only be **detected after the fact** by `detect-stray-writes`, never blocked or reverted, so anything a subagent writes outside its `outputs/` dir (worktrees, installed packages, edited repo files) persists and is theirs to clean up.
 - **Guard unavailable (other harnesses):** there is no active write enforcement. Tell the user plainly: stray writes are detected and reported by `detect-stray-writes` but **not auto-cleaned** — they must review the report and remove anything that escaped. Harness-level write enforcement is tracked as a parity goal in `harness-parity-check.md`.
@@ -275,7 +277,7 @@ The check has two tiers, chosen automatically per run:
 - **Code-based (Claude Code).** On harnesses that persist subagent transcripts with discrete `Skill` tool calls, the framework parses the transcript and checks for a `Skill` invocation whose `input.skill` matches the eval-staged slug. This is deterministic, free, and cannot be fooled by superficial vocabulary in the response.
 - **LLM-judge fallback (other harnesses).** Where transcripts aren't available or the harness injects skills via system-prompt hooks rather than a tool call (Codex, OpenCode), a judge subagent compares the agent's `final_message` against the SKILL.md content embedded in the run record, looking for behavioral fingerprints — distinctive vocabulary, named sections, procedural steps that mirror the skill's phrasing. It does **not** require the agent to explicitly cite the skill (that would taint the eval).
-To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used.
+To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The slug prevents an on-disk *collision*, not runtime *discovery*: if the same skill is also provided by an installed, **enabled** plugin, the subagent can still discover and invoke that copy — contaminating both arms (the control arm is no longer skill-absent). On Claude Code the runner flags this at build time (a "plugin-shadow" warning, also surfaced in `benchmark.json`'s `validity_warnings`), but cannot unload a live plugin; to remove the installed copy, run the eval from a plugin-isolated session — see `harness-details/claude.md` → *Isolating from installed plugins*. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used.
 The aggregator emits a `validity_warnings` array when any with-skill condition has an invocation rate below 100%. Read those before interpreting the substantive delta. The rate is computed only over evals where the skill *should* fire; negative evals (`skill_should_trigger: false`) are excluded so a correct non-trigger never depresses the rate or raises a spurious warning.

package/skills/evaluating-skills/evals/evals.json CHANGED Viewed

@@ -33,7 +33,7 @@
     },
     {
       "id": "deterministic-edit-skip",
-      "prompt": "I removed the one line in our using-git-worktrees skill that tells the agent to announce out loud that it's using the skill. Nothing else changed. Do I need to run an eval before I ship this?",
+      "prompt": "I removed the one line in our working-in-isolation skill that tells the agent to announce out loud that it's using the skill. Nothing else changed. Do I need to run an eval before I ship this?",
       "expected_output": "The agent recognizes this as a deterministic instruction change — removing a one-line directive the agent reliably follows, not wording that decides a pressured or ambiguous choice — and concludes an eval is not warranted, stating that decision and its reasoning. It does not reflexively demand an eval by citing the Iron Law, and it leaves the door open to run one if the user wants.",
       "assertions": [
         {

package/skills/evaluating-skills/harness-details/claude.md CHANGED Viewed

@@ -4,6 +4,25 @@ This is the Claude Code-specific walkthrough for `evaluating-skills`. The runner
 Use this when a user, working from their own skill folder, asks to run an eval (e.g. "run an eval on this skill to check if a change reduces token usage").
+## Isolating from installed plugins
+**Read this first if the skill you're evaluating shares a name with one an installed, enabled plugin provides** — e.g. evaluating a slow-powers skill with the slow-powers plugin installed, or any user evaluating their own plugin's skills.
+Eval subagents are dispatched via the **Task tool**, so they run in-process and inherit *this session's* enabled plugins and global skills. The runner stages the skill-under-test under a unique slug (`slow-powers-eval-…`) — that avoids an on-disk collision and lets the `__skill_invoked` meta-check find the staged copy — but it does **not** stop the installed plugin's own `<plugin>:<name>` copy from also being discoverable. When both copies are reachable:
+- the with-skill arm can invoke the staged slug *and then* reach for the installed copy (redundant/leaked invocation), and
+- the `without_skill` arm is **not truly skill-absent** — the installed copy is still discoverable, contaminating the baseline and shrinking the measured delta.
+Plugins load at **session start** and the runner can't unload them mid-session, so it only *detects and warns* (a build-time "plugin-shadow" banner, also surfaced in `benchmark.json`'s `validity_warnings`). To actually isolate, **launch the session you run the eval from** in one of these ways — subagents inherit it:
+1. **Drop user-scope plugins, keep auth:** `claude --setting-sources project,local`. User-scope `enabledPlugins` (where user-installed plugins are enabled) isn't loaded, so they don't appear. Auth is unaffected. (Also drops your other user-scope settings/MCP for that session.)
+2. **Disable the specific plugin, then restart:** set `"enabledPlugins": { "<plugin>@<marketplace>": false }` in a settings source that loads at startup (project `.claude/settings.json` or user `~/.claude/settings.json`) and start a fresh session. *(The slow-powers repo ships this for `slow-powers@slowdini` and `superpowers@claude-plugins-official` in its own `.claude/settings.json`.)*
+3. **Clean config dir (strips everything):** `CLAUDE_CONFIG_DIR="$(mktemp -d)" claude`. No installed plugins or global skills load at all. **Auth caveat:** your OAuth session lives in `~/.claude.json`, which a relocated config dir may not carry — set `ANTHROPIC_API_KEY` or re-authenticate once in the fresh dir.
+All three keep the eval working: project-local staged skills live in `<cwd>/.claude/skills/` (project scope, independent of installed plugins), so they still load and the meta-check still resolves the slug. A clean config dir (option 3) additionally means the real SessionStart bootstrap hook doesn't fire, so the only session-start framing present is whatever you pass via `--bootstrap` — which removes the separate "even a 1% chance → you MUST invoke" mandate that otherwise pins invocation at 100%.
+**Verify before you run:** the installed twin should be gone — `/plugin` shows it disabled, or the runner's build step prints no plugin-shadow banner.
 ## Step 1 — Resolve the bundled runner
 The runner ships inside the installed slow-powers plugin. Resolve its path once per session and reuse it. Use `find` rather than a shell glob so the command behaves the same under bash and zsh (a bare glob with no match errors under zsh):
@@ -97,7 +116,11 @@ bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" snapshot --skill-dir <skill-dir> --ski
 bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" --skill-dir <skill-dir> --skill <name> --mode revision --baseline baseline --guard
 ```
-Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built staged-skills inventory.
+Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built available-skills block (rendered the way Claude Code surfaces discoverable skills, so the dispatch reads like a real session).
+For a **plan-mode-relevant skill** (e.g. `hardening-plans`), add `--plan-mode` to inject Claude Code's verbatim plan-mode procedure as a `<system-reminder>` operating-context layer in every dispatch — the highest-fidelity in-runner approximation of a real plan mode (issue #142). Use it as the verbatim-procedure arm of an A/B against a plain paraphrase-seed run (no flag) to measure whether `with_skill` invocation de-saturates. It is still text the agent reads, not an injected mode, so treat any de-saturation as a stronger-than-cold signal, not ground truth (see *Seeding conversation context (and its ceiling)* in `../SKILL.md`).
+**The live ExitPlanMode → hardening-plans hook is not exercised here.** The shipped Claude plugin gates plan hand-off with a `PreToolUse` hook on `ExitPlanMode` (`hooks/exit-plan-mode`) that denies the first plan-exit and steers the agent through `hardening-plans` before the plan is presented. The runner only *simulates* plan mode as injected `<system-reminder>` text and dispatches single agent turns — it neither emits a real `ExitPlanMode` tool call nor runs `PreToolUse` hooks, so that gate is structurally outside what the eval harness can exercise. This is the standing reason a `hardening-plans` invocation-rate delta *from the hook* can't be exhibited in-runner, independent of the #119 invocation-hint gate and the plan-mode-simulation ceiling.
 Only when the user has opted out of the guard, drop `--guard` from the command above and rely on the post-hoc `detect-stray-writes` step in Step 10 instead — it reports stray writes but does not clean them up.

package/skills/evaluating-skills/runner/README.md CHANGED Viewed

@@ -22,6 +22,8 @@ Other flags:
 - `--workspace-dir <path>` (optional) — where iteration artifacts are written. Defaults to `<CWD>/skills-workspace`.
 - `--harness claude-code` (optional, default `claude-code`; the only supported harness).
 - `--no-stage`, `--dry-run`, `--iteration <N>`, `--mode <new-skill|revision>`, `--baseline <label>`, `--label <label>` — as before.
+- `--only <id,id,...>` / `--skip <id,id,...>` (optional) — run only, or all-but, the named eval ids from `evals.json`. The two are mutually exclusive, and every named id must exist (the run aborts with the available ids listed otherwise). Use this for a cost-conscious reduced-set run instead of temporarily editing `evals.json` down. The pre-flight summary and the `N evals × 2 conditions` count reflect the filtered set.
+- `--plan-mode` (optional, Claude Code) — inject the harness's verbatim plan-mode procedure as an operating-context layer. When set, the runner reads `profiles/<harness>/plan-mode.md` and emits it (via the session adapter's `renderPlanModeContext`) as a `<system-reminder>` block in every dispatch, after the available-skills block and before the user request. It is identical across the with/without-skill arms and recorded as `plan_mode` in `dispatch.json`. This is issue #142's highest-fidelity in-runner approximation of a real plan mode — still text the agent reads, so a pass is necessary-not-sufficient; see *Seeding conversation context (and its ceiling)* in `../SKILL.md`. Opt-in, and meant only for plan-mode-relevant skills; a harness with no profile aborts the run, leaving the portable dispatch contract unchanged.
 Staging is written under the current working directory: `<CWD>/.claude/skills/`. A subagent dispatched from that CWD discovers the staged skills there. Run the commands from the directory you want to be the eval root (the repo root for internal use; your skill folder or its parent for personal use).
@@ -84,6 +86,15 @@ bun run evals -- --skill <name> --mode revision --baseline baseline-2026-05-24
 bun run evals -- --skill <name> --mode new-skill --dry-run
 ```
+### Reduced-set run (cost-conscious subset)
+```bash
+# Run just two of the defined evals, leaving evals.json untouched.
+bun run evals -- --skill <name> --mode new-skill --only case-a,case-b
+# Or run everything except a slow case.
+bun run evals -- --skill <name> --mode new-skill --skip slow-case
+```
 ## Quickstart (running an eval on your own skill)
 If you have the slow-powers plugin installed and a personal skill, you do **not** run the npm scripts. The skill's `SKILL.md` routes you to `../harness-details/<harness>.md`, which gives the full command sequence (resolving the installed runner path, invoking `run.ts` directly with `--skill-dir`/`--skill`, dispatching subagents, grading). On Claude Code, see `../harness-details/claude.md`.
@@ -104,12 +115,14 @@ If you have the slow-powers plugin installed and a personal skill, you do **not*
 A subagent that runs an eval should start in an environment that mirrors a real install of the plugin under evaluation. Otherwise the result depends on the operator's local install state (whether they happen to have the plugin loaded into their parent session, which version, etc.) rather than the skill being measured. The runner produces this parity explicitly so results reproduce on a clean checkout or in CI.
+**Caveat — parity is only as clean as the operator's session.** Staging controls what the runner *adds* (the skills below), not what the operator's session already *loaded*. Subagents are dispatched in-process and share the parent session's plugins, so if that session has the plugin-under-evaluation — or any plugin exposing a same-named skill — enabled, the subagent discovers that copy too. That is exactly the "operator's local install state" dependency this section warns against, and the unique staging slug does not prevent it (it stops an on-disk collision, not runtime discovery). The runner can't unload a live plugin; on Claude Code it emits a build-time *plugin-shadow* warning (also surfaced in `benchmark.json`'s `validity_warnings`) so the contamination is visible. Actually preventing the contamination is a launch-time step: run the eval from a plugin-isolated session — see `../harness-details/claude.md` → *Isolating from installed plugins*.
 Parity has two parts, both applied when `--no-stage` is NOT set (the default `--harness claude-code`):
-1. **A staged-skills inventory is built into every dispatch prompt.** The runner lists the skills actually staged for the eval — the skill-under-test plus the siblings found in `--skill-dir` — inside the `<session-start-context>` block as a Markdown bullet list. This tells the subagent what is discoverable, independent of any `--bootstrap` file.
+1. **An available-skills block is built into every dispatch prompt.** The runner lists the skills actually staged for the eval — the skill-under-test plus the siblings found in `--skill-dir` — as its **own block**, rendered the way the harness surfaces discoverable skills to a real session rather than in an eval-specific format. On Claude Code that is `The following skills are available for use with the Skill tool:` followed by `- name: description` bullets. This rendering is **harness-specific** and lives in `adapters/claude-code-session.ts` (a new harness adds its own renderer alongside it). The block is emitted *after*, and separate from, the `<session-start-context>` block — mirroring how a real session delivers the SessionStart hook and the skill list as two distinct surfaces. It tells the subagent what is discoverable, independent of any `--bootstrap` file.
 2. **Every skill in `--skill-dir` is staged.** The skill-under-test is staged under its unique slug (`<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/`); every *other* skill in `--skill-dir` is copied to `<stageRoot>/.claude/skills/<name>/` at its natural name (excluding each skill's `evals/` subdir). Natural names matter because cross-references inside skill bodies (e.g. "REQUIRED SUB-SKILL: Use `slow-powers:test-driven-development`") only resolve cleanly to natural-name entries.
-`--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) ahead of the staged-skills inventory. Internal runs pass `./bootstrap.md`; that file contains its own "Active Skills Directory" list, which overlaps the auto-built inventory. That small duplication is intentional — it avoids maintaining a second bootstrap file in lockstep with the runner.
+`--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) inside the `<session-start-context>` block, ahead of the available-skills block. Internal runs pass `./bootstrap.md`. That file does **not** enumerate skills — the available-skills block is the single source of the skill list, so there is no duplication to keep in lockstep. (A *user-supplied* `--bootstrap` that does enumerate skills is handled defensively by `redactSkillFromBootstrap`, which strips the skill-under-test from the bootstrap prose on the `without_skill` arm so it can't leak into the control condition.)
 The runner records what it staged in `<stageRoot>/.claude/skills/.slow-powers-eval-manifest.json` so cleanup is reversible. Any pre-existing entry with a colliding name is backed up to a temp directory (recorded in the manifest) before being overwritten, and restored on the next `cleanupStagedSkills()` call. The prefix sweep (`slow-powers-eval-*` entries) still runs first so a crashed prior run is recovered even if the manifest itself was never written.
@@ -122,6 +135,7 @@ For the **`without_skill` / baseline condition** in this realistic environment,
 - **Codex.** Declares `"skills": "./skills/"` in its `plugin.json`, so the harness scans a directory at start-up. Sibling staging would write to whatever staging path that harness reads from — analogous to `stageSiblingSkills()` but pointed at the right directory. Bootstrap can be prepended to the dispatch prompt the same way.
 - **OpenCode.** Installed via npm package; the package's own directory is the discoverable surface. Sibling staging would copy into that directory, or — if the harness loads from `node_modules` directly — into a parallel staging path the harness is configured to scan.
 - **General fallback.** Harnesses without project-local discovery should keep using `--no-stage`; the inline `<skill>` block in the dispatch prompt is the only skill the subagent sees. Bootstrap is omitted in this mode because its references to other skills would mislead the agent.
+- **Plan-mode profiles (`--plan-mode`).** The plan-mode operating-context layer is also a harness-specific surface. The procedure text lives in `profiles/<harness>/plan-mode.md` and is wrapped by a `renderPlanModeContext` in that harness's session adapter (`adapters/<harness>-session.ts`), exactly mirroring how `renderAvailableSkillsBlock` is harness-specific. Only `profiles/claude-code/plan-mode.md` exists today; a harness that wants this fidelity layer adds its own profile file (its native plan/research mode procedure) plus a renderer alongside the Claude ones. A harness with no profile simply has no `--plan-mode`, and the portable dispatch contract is unchanged.
 The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `harness-parity-check.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.

package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts ADDED Viewed

@@ -0,0 +1,56 @@
+import { describe, expect, test } from "bun:test";
+import type { AvailableSkill } from "../types";
+import {
+  renderAvailableSkillsBlock,
+  renderPlanModeContext,
+} from "./claude-code-session";
+const skill = (name: string, description: string): AvailableSkill => ({ // test fixture: builds an AvailableSkill from just a name and a description
+  name,
+  path: `/x/${name}/SKILL.md`, // placeholder path derived from the name; the tests in this file never read it from disk
+  description,
+});
+describe("renderAvailableSkillsBlock", () => { // pins the Claude Code rendering of the available-skills block
+  test("uses the harness-native header and one `- name: description` bullet per skill", () => {
+    const block = renderAvailableSkillsBlock([skill("foo", "the foo skill")]);
+    expect(block).toContain(
+      "The following skills are available for use with the Skill tool:",
+    );
+    expect(block).toContain("- foo: the foo skill");
+    // The eval-flavored wording and custom format must be gone: neither of the two marker strings checked below may appear in the output.
+    expect(block).not.toContain("staged and discoverable");
+    expect(block).not.toContain("*Trigger:*");
+  });
+  test("sorts skills by name", () => {
+    const block = renderAvailableSkillsBlock([
+      skill("zebra", "z"), // deliberately passed out of order
+      skill("alpha", "a"),
+    ]);
+    expect(block.indexOf("- alpha:")).toBeLessThan(block.indexOf("- zebra:")); // alpha's bullet must appear earlier in the rendered text
+  });
+  test("returns an empty string for an empty list", () => {
+    expect(renderAvailableSkillsBlock([])).toBe(""); // no skills means no header line either
+  });
+});
+describe("renderPlanModeContext", () => { // pins the <system-reminder> framing applied to a plan-mode profile
+  test("wraps the profile text in a harness-native system-reminder block", () => {
+    const block = renderPlanModeContext("Plan mode is active. Do not edit.");
+    expect(block).toContain("<system-reminder>");
+    expect(block).toContain("</system-reminder>");
+    expect(block).toContain("Plan mode is active. Do not edit."); // the profile text itself passes through unchanged
+  });
+  test("trims surrounding whitespace from the profile text", () => {
+    const block = renderPlanModeContext("\n\n  PROFILE-BODY  \n\n");
+    expect(block).toBe("<system-reminder>\nPROFILE-BODY\n</system-reminder>"); // exact shape: opening tag, body, closing tag, one per line, no trailing newline
+  });
+  test("returns an empty string for empty or whitespace-only input", () => {
+    expect(renderPlanModeContext("")).toBe("");
+    expect(renderPlanModeContext("   \n  ")).toBe(""); // whitespace-only counts as empty, so no empty reminder block is produced
+  });
+});

package/skills/evaluating-skills/runner/adapters/claude-code-session.ts ADDED Viewed

@@ -0,0 +1,43 @@
+// Claude Code-specific rendering of session-start context.
+//
+// The available-skills reminder is a *harness-specific* surface: Claude Code
+// presents discoverable skills to an agent as "The following skills are
+// available for use with the Skill tool:" followed by `- name: description`
+// bullets. Other harnesses (Codex, OpenCode) surface their skills differently,
+// so this rendering lives in an adapter rather than inline in the harness-
+// agnostic orchestrator. A new harness adds its own renderer alongside this one
+// (see harness-parity-check.md).
+import type { AvailableSkill } from "../types";
+/**
+ * Render the list of discoverable skills the way a real Claude Code session
+ * surfaces them, so an eval dispatch mirrors a genuine session rather than
+ * announcing itself as an eval. Returns an empty string when no skills are
+ * staged (the caller omits the block entirely in that case).
+ *
+ * Output shape: the fixed header line, one blank line, then one
+ * `- name: description` bullet per skill, joined with "\n" and with no
+ * trailing newline. Bullets are ordered by `name` via `localeCompare`.
+ *
+ * NOTE(review): `localeCompare` is called without an explicit locale, so the
+ * ordering follows the runtime's default collation. Confirm that is stable
+ * enough if prompts must be byte-identical across machines.
+ *
+ * @param skills Skills to list. The array is copied before sorting, so the
+ *   caller's array is not reordered.
+ * @returns The rendered block, or "" when `skills` is empty.
+ */
+export function renderAvailableSkillsBlock(skills: AvailableSkill[]): string {
+  if (skills.length === 0) return "";
+  const sorted = [...skills].sort((a, b) => a.name.localeCompare(b.name));
+  const lines = sorted.map((s) => `- ${s.name}: ${s.description}`); // name and description are inserted verbatim, with no escaping or line folding
+  return [
+    "The following skills are available for use with the Skill tool:",
+    "",
+    ...lines,
+  ].join("\n");
+}
+/**
+ * Render a plan-mode profile the way Claude Code injects an operating mode into
+ * a live session: as a `<system-reminder>` block the agent is told it is
+ * operating under, not prose it merely reads. The profile text (the verbatim
+ * plan-mode procedure) lives in `../profiles/claude-code/plan-mode.md`; this
+ * adapter owns only the harness-native framing, so a new harness adds its own
+ * renderer + profile alongside this one (see harness-parity-check.md). Returns
+ * an empty string for empty input so the caller can omit the section entirely.
+ *
+ * @param profileText Raw profile text. Leading and trailing whitespace is
+ *   trimmed before wrapping; the interior is left untouched.
+ * @returns The opening tag, the trimmed text, and the closing tag on separate
+ *   lines joined with "\n" (no trailing newline), or "" when the text is
+ *   empty or whitespace-only.
+ */
+export function renderPlanModeContext(profileText: string): string {
+  const trimmed = profileText.trim();
+  if (!trimmed) return ""; // empty after trimming: emit no reminder block at all
+  return ["<system-reminder>", trimmed, "</system-reminder>"].join("\n");
+}

package/skills/evaluating-skills/runner/aggregate.test.ts CHANGED Viewed

@@ -185,4 +185,80 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
       ),
     ).toBe(true);
   });
+  test("surfaces plugin-shadow findings as validity_warnings", () => { // end-to-end: builds an on-disk fixture, runs aggregate.ts as a subprocess, then inspects benchmark.json
+    const root = join(FIXTURE_ROOT, "agg-shadow");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    mkdirSync(iterationDir, { recursive: true });
+    writeJson(join(iterationDir, "conditions.json"), {
+      mode: "new-skill",
+      conditions: [
+        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
+        { name: "without_skill", skill_path: null },
+      ],
+      timestamp: new Date().toISOString(),
+      harness: "claude-code",
+    });
+    for (const cond of ["with_skill", "without_skill"]) { // both arms get the same passing grading and timing records
+      const condDir = join(iterationDir, "eval-e1", cond);
+      mkdirSync(condDir, { recursive: true });
+      writeJson(join(condDir, "grading.json"), {
+        assertion_results: [],
+        summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
+      });
+      writeJson(join(condDir, "timing.json"), {
+        total_tokens: 100,
+        duration_ms: 1,
+      });
+    }
+    writeJson(join(iterationDir, "plugin-shadow.json"), { // the input under test: one plugin entry whose skill_name matches the skill being evaluated
+      config_dir: "/home/u/.claude",
+      shadowed: [
+        {
+          kind: "plugin",
+          plugin: "slow-powers@slowdini",
+          skill_name: "mr-review",
+          path: "/home/u/.claude/plugins/cache/slowdini/slow-powers/skills/mr-review",
+        },
+      ],
+    });
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        AGGREGATE_TS,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" }, // the fixture's work dir is the subprocess CWD
+    );
+    expect(res.exitCode).toBe(0);
+    const benchmark = JSON.parse(
+      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
+    ) as { validity_warnings: string[] };
+    expect(
+      benchmark.validity_warnings.some(
+        (w) => w.includes("mr-review") && /contaminat/i.test(w), // loose match: the warning must name the skill and mention contamination, without pinning exact wording
+      ),
+    ).toBe(true);
+  });
 });

package/skills/evaluating-skills/runner/aggregate.ts CHANGED Viewed

@@ -2,6 +2,10 @@
 import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
 import { detectRunContext } from "./context";
+import {
+  type PluginShadowReport,
+  shadowValidityWarnings,
+} from "./plugin-shadow";
 import type { ConditionsRecord, GradingResult, TimingRecord } from "./types";
 function die(msg: string): never {
@@ -198,6 +202,22 @@ if (existsSync(strayPath)) {
   }
 }
+// Plugin-shadow findings (from the runner's build-time preflight, Claude Code)
+// taint a run the same way a missed invocation does: a staged skill also served
+// by an enabled plugin means subagents could discover both copies, so the
+// with/without comparison may not reflect the staged skill alone.
+const shadowPath = join(iterationDir, "plugin-shadow.json");
+if (existsSync(shadowPath)) { // no report file in the iteration dir: nothing is added to validityWarnings
+  try {
+    const report = JSON.parse(
+      readFileSync(shadowPath, "utf8"),
+    ) as PluginShadowReport; // cast only: the parsed JSON is not shape-checked here
+    for (const w of shadowValidityWarnings(report)) validityWarnings.push(w);
+  } catch {
+    // ignore a malformed report rather than failing aggregation (this also swallows anything thrown by shadowValidityWarnings, since that call sits inside the try)
+  }
+}
 const benchmark = {
   generated: new Date().toISOString(),
   mode: conditions.mode,