@slowdini/slow-powers-opencode 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/README.md +3 -3
  2. package/bootstrap.md +19 -20
  3. package/package.json +1 -1
  4. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
  5. package/skills/auditing-slow-powers-usage/evals/evals.json +2 -2
  6. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  7. package/skills/evaluating-skills/SKILL.md +6 -4
  8. package/skills/evaluating-skills/evals/evals.json +1 -1
  9. package/skills/evaluating-skills/harness-details/claude.md +24 -1
  10. package/skills/evaluating-skills/runner/README.md +16 -2
  11. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +56 -0
  12. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +43 -0
  13. package/skills/evaluating-skills/runner/aggregate.test.ts +76 -0
  14. package/skills/evaluating-skills/runner/aggregate.ts +20 -0
  15. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +228 -0
  16. package/skills/evaluating-skills/runner/plugin-shadow.ts +201 -0
  17. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +11 -0
  18. package/skills/evaluating-skills/runner/run.test.ts +488 -24
  19. package/skills/evaluating-skills/runner/run.ts +281 -66
  20. package/skills/evaluating-skills/runner/types.ts +8 -0
  21. package/skills/evaluating-skills/templates/eval-task-prompt.md +3 -7
  22. package/skills/finishing-a-development-branch/SKILL.md +1 -1
  23. package/skills/hardening-plans/evals/baseline/NOTES.md +7 -0
  24. package/skills/hardening-plans/evals/evals.json +0 -19
  25. package/skills/systematic-debugging/condition-based-waiting.md +10 -11
  26. package/skills/systematic-debugging/root-cause-tracing.md +31 -33
  27. package/skills/working-in-isolation/SKILL.md +58 -0
  28. package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
  29. package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
  30. package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
  31. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
  32. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
  33. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
  34. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
  35. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
  36. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
  37. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
  38. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
  39. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
  40. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
  41. package/skills/working-in-isolation/evals/evals.json +87 -0
  42. package/skills/writing-skills/SKILL.md +179 -195
  43. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +0 -24
  44. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +0 -24
  45. package/skills/using-git-worktrees/SKILL.md +0 -70
  46. package/skills/using-git-worktrees/evals/evals.json +0 -40
  47. package/skills/writing-skills/graphviz-conventions.dot +0 -172
  48. package/skills/writing-skills/scripts/render-graphs.js +0 -181
package/README.md CHANGED
@@ -26,7 +26,7 @@ Contributors closing parity gaps should follow [`harness-parity-check.md`](./har
26
26
 
27
27
  ## How it works
28
28
 
29
- Slow-powers integrates directly into your agent's session, providing a highly disciplined set of technical execution utilities. It enforces strict test-driven development (TDD), systematic scientific debugging, rigorous verification checks, safe workspace isolation via git worktrees, and clean branch-finishing hygiene. It also enhances native agent planning phases with strict rules: banning placeholders, enforcing atomic task granularity, and requiring TDD-first checklists.
29
+ Slow-powers integrates directly into your agent's session, providing a highly disciplined set of technical execution utilities. It enforces strict test-driven development (TDD), systematic scientific debugging, rigorous verification checks, safe workspace isolation so new work doesn't collide with existing work, and clean branch-finishing hygiene. It also enhances native agent planning phases with strict rules: banning placeholders, enforcing atomic task granularity, and requiring TDD-first checklists.
30
30
 
31
31
  ## Installation
32
32
 
@@ -91,7 +91,7 @@ This installs the latest published version from npm.
91
91
 
92
92
  Slow-powers provides a set of highly focused, execution-level skills that ensure your agent operates with maximum discipline:
93
93
 
94
- 1. **`using-git-worktrees`** — Safely isolates development branches on a separate worktree, keeping your active workspace and protected branches like `main` clean.
94
+ 1. **`working-in-isolation`** — Establishes an isolated workspace so new work doesn't collide with existing or in-progress work, keeping protected branches like `main` clean.
95
95
  2. **`test-driven-development`** — Enforces a strict RED-GREEN-REFACTOR cycle, ensuring all production code is backed by failing test verification first.
96
96
  3. **`systematic-debugging`** — Guides the agent to locate the root cause of failures via scientific hypothesis testing, avoiding "guess-and-check" thrashing.
97
97
  4. **`verification-before-completion`** — Requires running actual test/build commands and presenting concrete evidence before making any success claims.
@@ -104,7 +104,7 @@ Slow-powers provides a set of highly focused, execution-level skills that ensure
104
104
 
105
105
  **Debugging** — `systematic-debugging`
106
106
 
107
- **Workspace & Git Hygiene** — `using-git-worktrees`, `finishing-a-development-branch`
107
+ **Workspace & Git Hygiene** — `working-in-isolation`, `finishing-a-development-branch`
108
108
 
109
109
  **Meta & Extension** — `writing-skills`
110
110
 
package/bootstrap.md CHANGED
@@ -14,26 +14,25 @@ When you reach a gate moment — about to code, hand off a plan, debug, claim do
14
14
 
15
15
  **Invoke relevant or requested skills BEFORE any response or action.** Even a 1% chance a skill might apply means that you should invoke the skill to check. If an invoked skill turns out to be wrong for the situation, you don't need to use it.
16
16
 
17
- ```dot
18
- digraph skill_flow {
19
- "User message received" [shape=doublecircle];
20
- "Might any skill apply?" [shape=diamond];
21
- "Invoke skill mechanism" [shape=box];
22
- "Announce: 'Using [skill] to [purpose]'" [shape=box];
23
- "Has checklist?" [shape=diamond];
24
- "Create todo per item with persistent task tracker" [shape=box];
25
- "Follow skill exactly" [shape=box];
26
- "Respond (including clarifications)" [shape=doublecircle];
27
-
28
- "User message received" -> "Might any skill apply?";
29
- "Might any skill apply?" -> "Invoke skill mechanism" [label="yes, even 1%"];
30
- "Might any skill apply?" -> "Respond (including clarifications)" [label="definitely not"];
31
- "Invoke skill mechanism" -> "Announce: 'Using [skill] to [purpose]'";
32
- "Announce: 'Using [skill] to [purpose]'" -> "Has checklist?";
33
- "Has checklist?" -> "Create todo per item with persistent task tracker" [label="yes"];
34
- "Has checklist?" -> "Follow skill exactly" [label="no"];
35
- "Create todo per item with persistent task tracker" -> "Follow skill exactly";
36
- }
17
+ ```mermaid
18
+ flowchart TD
19
+ start([User message received])
20
+ apply{Might any skill apply?}
21
+ invoke[Invoke skill mechanism]
22
+ announce["Announce: 'Using [skill] to [purpose]'"]
23
+ checklist{Has checklist?}
24
+ todos[Create todo per item with persistent task tracker]
25
+ follow[Follow skill exactly]
26
+ respond(["Respond (including clarifications)"])
27
+
28
+ start --> apply
29
+ apply -->|yes, even 1%| invoke
30
+ apply -->|definitely not| respond
31
+ invoke --> announce
32
+ announce --> checklist
33
+ checklist -->|yes| todos
34
+ checklist -->|no| follow
35
+ todos --> follow
37
36
  ```
38
37
 
39
38
  ## Red Flags
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@slowdini/slow-powers-opencode",
3
- "version": "0.1.3",
3
+ "version": "0.1.5",
4
4
  "description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
5
5
  "type": "module",
6
6
  "main": "./opencode/plugins/slow-powers.js",
@@ -4,6 +4,14 @@ Forward-looking observations from the run that produced this baseline. Provenanc
4
4
  `BASELINE.md`; numbers are in `benchmark.json`. This file is the "what a future iterator should
5
5
  know" companion.
6
6
 
7
+ > **⚠️ Baseline is stale (as of the `working-in-isolation` rename, #156).** The fixtures and
8
+ > `evals.json` rubrics were updated to rename `using-git-worktrees` → `working-in-isolation`, but
9
+ > the committed `grading/*.json` and the observations below were produced against the *old* name and
10
+ > are **not** re-graded — they're kept verbatim as the historical record. References to
11
+ > `using-git-worktrees` / "worktrees" in this file and in `grading/*.json` describe that past run;
12
+ > they are not live skill references. Re-run this eval to refresh the baseline before drawing new
13
+ > conclusions from it.
14
+
7
15
  ## Why this baseline exists despite a negative delta
8
16
 
9
17
  Headline delta is `pass_rate −0.084` (with_skill 0.833 vs without_skill 0.917). We promoted anyway
@@ -32,7 +32,7 @@
32
32
  {
33
33
  "id": "audits-blindspot-session",
34
34
  "prompt": "Just finished a session over in the payments-gateway repo — notes are in session-summary.md. I'm working on slow-powers and want a read on how the skills did. Please run the post-session slow-powers usage audit on it.",
35
- "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), using-git-worktrees (a feature change made directly on the checked-out branch), and verification-before-completion (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
35
+ "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), working-in-isolation (a feature change made directly on the checked-out branch), and verification-before-completion (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
36
36
  "files": ["fixtures/audits-blindspot-session/session-summary.md"],
37
37
  "assertions": [
38
38
  {
@@ -43,7 +43,7 @@
43
43
  {
44
44
  "id": "blindspot_in_never_considered",
45
45
  "type": "llm_judge",
46
- "rubric": "The session notes state the agent never thought about test-driven-development, using-git-worktrees, or verification-before-completion (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
46
+ "rubric": "The session notes state the agent never thought about test-driven-development, working-in-isolation, or verification-before-completion (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
47
47
  },
48
48
  {
49
49
  "id": "no_remediation_language",
@@ -33,7 +33,7 @@ enough that I was confident in it." The user didn't push further.
33
33
  Notes on environment for this session:
34
34
  - The repo has a `bun test` suite (~12 seconds) with existing refund tests in `test/refunds.test.ts`.
35
35
  - slow-powers was active; the session-start bootstrap listing was present, including
36
- `test-driven-development`, `using-git-worktrees`, and `verification-before-completion`.
36
+ `test-driven-development`, `working-in-isolation`, and `verification-before-completion`.
37
37
  - I did not at any point think about writing a test first, creating a branch/worktree, or running
38
38
  the suite — I went straight from the request to editing source on the current branch.
39
39
  - No git branch or worktree was created; edits were made on whatever branch was checked out.
@@ -45,7 +45,7 @@ The runner takes two required flags:
45
45
  - `--skill-dir <path>` — a directory containing one or more skill folders. **This directory is the eval's test environment.** Every skill in it is staged for the subagent: the skill-under-test under a unique slug, every *other* skill under its natural name.
46
46
  - `--skill <name>` — which subdirectory of `--skill-dir` to evaluate.
47
47
 
48
- Optional flags: `--bootstrap <path>` (see *Bootstrap content* below), `--workspace-dir <path>` (defaults to `<cwd>/skills-workspace`), `--mode new-skill|revision`, `--baseline <label>`, `--harness`, `--no-stage`, `--dry-run`, `--guard` (Claude Code only — arm the write guard; see *Sandboxing eval subagents*).
48
+ Optional flags: `--bootstrap <path>` (see *Bootstrap content* below), `--workspace-dir <path>` (defaults to `<cwd>/skills-workspace`), `--mode new-skill|revision`, `--baseline <label>`, `--only <id,...>` / `--skip <id,...>` (run only, or all but, the named eval ids — for cost-conscious reduced-set runs without editing `evals.json`; the two are mutually exclusive, and an unknown id aborts the run), `--harness`, `--no-stage`, `--dry-run`, `--guard` (Claude Code only — arm the write guard; see *Sandboxing eval subagents*), `--plan-mode` (Claude Code only — inject the harness's verbatim plan-mode procedure as an operating-context layer; opt-in, for plan-mode-relevant skills only; see *Seeding conversation context (and its ceiling)*).
49
49
 
50
50
  Each iteration lands under `<workspace-dir>/<skill>/iteration-N/` with the same tree described in *Workspace layout* below, plus a machine-readable `dispatch.json` and a human-readable `dispatch-manifest.md`. The end product is `benchmark.json`: read its `run_summary`, `delta`, and `validity_warnings`.
51
51
 
@@ -55,7 +55,7 @@ The runner stages every skill it finds under `--skill-dir`. The skill-under-test
55
55
 
56
56
  #### Bootstrap content
57
57
 
58
- Every dispatch prompt includes a `<session-start-context>` header listing the skills staged for this eval (auto-built by the runner). If you also want product-specific framing prepended — instruction priority rules, planning guidelines, anything you'd put in a SessionStart hook — author a Markdown file and pass it via `--bootstrap <path>`. The runner emits the file verbatim before the staged-skills list. Omit `--bootstrap` and the dispatch carries only the staged-skills list, nothing else.
58
+ Every dispatch prompt includes an available-skills block listing the skills staged for this eval (auto-built by the runner), rendered in the harness's native presentation so the dispatch reads like a real session rather than an eval. If you also want product-specific framing prepended — instruction priority rules, planning guidelines, anything you'd put in a SessionStart hook — author a Markdown file and pass it via `--bootstrap <path>`. The runner emits the file verbatim inside a `<session-start-context>` block, before the available-skills block. Omit `--bootstrap` and the dispatch carries only the available-skills block, nothing else.
59
59
 
60
60
  ## Designing test cases
61
61
 
@@ -113,6 +113,8 @@ Keep the seeded turns short and concrete; the point is to establish momentum, no
113
113
 
114
114
  **The ceiling — state it plainly.** A seed is *text the subagent reads*, not a state it operates under. It cannot place the agent in a harness-injected mode — a real plan mode, an enforced multi-phase workflow, genuine context-window pressure — it can only *describe* one. So when the wild failure you're chasing was *caused* by such a mode (the documented case: an agent in plan mode that invoked **zero** skills because the mode's own procedure made loading them feel redundant), a text seed cannot fully reproduce it — the causal layer is exactly the one a prompt string can't inject. A seeded **pass is therefore necessary but not sufficient** — it under-estimates real-session difficulty — and a seed that *fails* to reproduce a known wild failure is usually hitting this ceiling, not testing a bad seed. Treat seeded results as a stronger-than-cold signal, not as ground truth, and don't let downstream work over-trust them. Faithfully reproducing a mode-caused failure needs a real harness mode the runner can't inject today — track that as a parity goal.
115
115
 
116
+ **Narrowing the gap — `--plan-mode`.** For the documented plan-mode case, the runner offers the highest-fidelity in-runner approximation: `--plan-mode` injects the harness's *verbatim* plan-mode procedure (its rigid multi-phase terminal rail) into every dispatch as an operating-context layer the subagent is told it is operating under — a `<system-reminder>` block after the session-start surfaces — rather than a paraphrase the agent merely reads in the seed prose. The profile is a per-harness asset (`runner/profiles/<harness>/plan-mode.md`); it is opt-in and meant only for plan-mode-relevant skills (a harness without a profile aborts the run, leaving the portable contract unchanged). This narrows the gap (verbatim procedure > paraphrase) but does **not** close it: it is still text the agent reads, not an injected mode, so the necessary-not-sufficient ceiling above stands unchanged. Use it as the strongest in-runner signal and pair it with a paraphrase-seed arm to measure whether removing the invoke-hint lets `with_skill` invocation de-saturate.
117
+
116
118
  ## Pre-flight gate (required)
117
119
 
118
120
  An eval run is not free. Each test case dispatches a fresh subagent **per condition** — an N-case suite is `2N` full agent sessions, plus a judge dispatch for every `llm_judge` assertion. That is real wall-clock time and real tokens, and a subagent under test can write outside its sandbox and pollute the real workspace. **Never kick off a run silently.**
@@ -130,7 +132,7 @@ Do not dispatch until the user confirms *this summary*. An earlier "run the eval
130
132
 
131
133
  ### Sandbox decision
132
134
 
133
- A subagent under test runs the real skill, and some skills write to disk — the skill that triggered this gate, `using-git-worktrees`, creates git worktrees in whatever repo it's pointed at. Without active enforcement those writes land in your working directory.
135
+ A subagent under test runs the real skill, and some skills write to disk — the skill that triggered this gate, `working-in-isolation`, creates git worktrees in whatever repo it's pointed at. Without active enforcement those writes land in your working directory.
134
136
 
135
137
  - **Guard available (Claude Code):** arming `--guard` is the default. If you are about to run without it, STOP. Proceed unguarded **only** when the user actively opts out — and warn them that stray writes will then only be **detected after the fact** by `detect-stray-writes`, never blocked or reverted, so anything a subagent writes outside its `outputs/` dir (worktrees, installed packages, edited repo files) persists and is theirs to clean up.
136
138
  - **Guard unavailable (other harnesses):** there is no active write enforcement. Tell the user plainly: stray writes are detected and reported by `detect-stray-writes` but **not auto-cleaned** — they must review the report and remove anything that escaped. Harness-level write enforcement is tracked as a parity goal in `harness-parity-check.md`.
@@ -275,7 +277,7 @@ The check has two tiers, chosen automatically per run:
275
277
  - **Code-based (Claude Code).** On harnesses that persist subagent transcripts with discrete `Skill` tool calls, the framework parses the transcript and checks for a `Skill` invocation whose `input.skill` matches the eval-staged slug. This is deterministic, free, and cannot be fooled by superficial vocabulary in the response.
276
278
  - **LLM-judge fallback (other harnesses).** Where transcripts aren't available or the harness injects skills via system-prompt hooks rather than a tool call (Codex, OpenCode), a judge subagent compares the agent's `final_message` against the SKILL.md content embedded in the run record, looking for behavioral fingerprints — distinctive vocabulary, named sections, procedural steps that mirror the skill's phrasing. It does **not** require the agent to explicitly cite the skill (that would taint the eval).
277
279
 
278
- To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used.
280
+ To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The slug prevents an on-disk *collision*, not runtime *discovery*: if the same skill is also provided by an installed, **enabled** plugin, the subagent can still discover and invoke that copy — contaminating both arms (the control arm is no longer skill-absent). On Claude Code the runner flags this at build time (a "plugin-shadow" warning, also surfaced in `benchmark.json`'s `validity_warnings`), but cannot unload a live plugin; to remove the installed copy, run the eval from a plugin-isolated session — see `harness-details/claude.md` → *Isolating from installed plugins*. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used.
279
281
 
280
282
  The aggregator emits a `validity_warnings` array when any with-skill condition has an invocation rate below 100%. Read those before interpreting the substantive delta. The rate is computed only over evals where the skill *should* fire; negative evals (`skill_should_trigger: false`) are excluded so a correct non-trigger never depresses the rate or raises a spurious warning.
281
283
 
@@ -33,7 +33,7 @@
33
33
  },
34
34
  {
35
35
  "id": "deterministic-edit-skip",
36
- "prompt": "I removed the one line in our using-git-worktrees skill that tells the agent to announce out loud that it's using the skill. Nothing else changed. Do I need to run an eval before I ship this?",
36
+ "prompt": "I removed the one line in our working-in-isolation skill that tells the agent to announce out loud that it's using the skill. Nothing else changed. Do I need to run an eval before I ship this?",
37
37
  "expected_output": "The agent recognizes this as a deterministic instruction change — removing a one-line directive the agent reliably follows, not wording that decides a pressured or ambiguous choice — and concludes an eval is not warranted, stating that decision and its reasoning. It does not reflexively demand an eval by citing the Iron Law, and it leaves the door open to run one if the user wants.",
38
38
  "assertions": [
39
39
  {
@@ -4,6 +4,25 @@ This is the Claude Code-specific walkthrough for `evaluating-skills`. The runner
4
4
 
5
5
  Use this when a user, working from their own skill folder, asks to run an eval (e.g. "run an eval on this skill to check if a change reduces token usage").
6
6
 
7
+ ## Isolating from installed plugins
8
+
9
+ **Read this first if the skill you're evaluating shares a name with one an installed, enabled plugin provides** — e.g. evaluating a slow-powers skill with the slow-powers plugin installed, or any user evaluating their own plugin's skills.
10
+
11
+ Eval subagents are dispatched via the **Task tool**, so they run in-process and inherit *this session's* enabled plugins and global skills. The runner stages the skill-under-test under a unique slug (`slow-powers-eval-…`) — that avoids an on-disk collision and lets the `__skill_invoked` meta-check find the staged copy — but it does **not** stop the installed plugin's own `<plugin>:<name>` copy from also being discoverable. When both copies are reachable:
12
+
13
+ - the with-skill arm can invoke the staged slug *and then* reach for the installed copy (redundant/leaked invocation), and
14
+ - the `without_skill` arm is **not truly skill-absent** — the installed copy is still discoverable, contaminating the baseline and shrinking the measured delta.
15
+
16
+ Plugins load at **session start** and the runner can't unload them mid-session, so it only *detects and warns* (a build-time "plugin-shadow" banner, also surfaced in `benchmark.json`'s `validity_warnings`). To actually isolate, **launch the session you run the eval from** in one of these ways — subagents inherit it:
17
+
18
+ 1. **Drop user-scope plugins, keep auth:** `claude --setting-sources project,local`. User-scope `enabledPlugins` (where user-installed plugins are enabled) isn't loaded, so they don't appear. Auth is unaffected. (Also drops your other user-scope settings/MCP for that session.)
19
+ 2. **Disable the specific plugin, then restart:** set `"enabledPlugins": { "<plugin>@<marketplace>": false }` in a settings source that loads at startup (project `.claude/settings.json` or user `~/.claude/settings.json`) and start a fresh session. *(The slow-powers repo ships this for `slow-powers@slowdini` and `superpowers@claude-plugins-official` in its own `.claude/settings.json`.)*
20
+ 3. **Clean config dir (strips everything):** `CLAUDE_CONFIG_DIR="$(mktemp -d)" claude`. No installed plugins or global skills load at all. **Auth caveat:** your OAuth session lives in `~/.claude.json`, which a relocated config dir may not carry — set `ANTHROPIC_API_KEY` or re-authenticate once in the fresh dir.
21
+
22
+ All three keep the eval working: project-local staged skills live in `<cwd>/.claude/skills/` (project scope, independent of installed plugins), so they still load and the meta-check still resolves the slug. A clean config dir (option 3) additionally means the real SessionStart bootstrap hook doesn't fire, so the only session-start framing present is whatever you pass via `--bootstrap` — which removes the separate "even a 1% chance → you MUST invoke" mandate that otherwise pins invocation at 100%.
23
+
24
+ **Verify before you run:** the installed twin should be gone — `/plugin` shows it disabled, or the runner's build step prints no plugin-shadow banner.
25
+
7
26
  ## Step 1 — Resolve the bundled runner
8
27
 
9
28
  The runner ships inside the installed slow-powers plugin. Resolve its path once per session and reuse it. Use `find` rather than a shell glob so the command behaves the same under bash and zsh (a bare glob with no match errors under zsh):
@@ -97,7 +116,11 @@ bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" snapshot --skill-dir <skill-dir> --ski
97
116
  bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" --skill-dir <skill-dir> --skill <name> --mode revision --baseline baseline --guard
98
117
  ```
99
118
 
100
- Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built staged-skills inventory.
119
+ Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built available-skills block (rendered the way Claude Code surfaces discoverable skills, so the dispatch reads like a real session).
120
+
121
+ For a **plan-mode-relevant skill** (e.g. `hardening-plans`), add `--plan-mode` to inject Claude Code's verbatim plan-mode procedure as a `<system-reminder>` operating-context layer in every dispatch — the highest-fidelity in-runner approximation of a real plan mode (issue #142). Use it as the verbatim-procedure arm of an A/B against a plain paraphrase-seed run (no flag) to measure whether `with_skill` invocation de-saturates. It is still text the agent reads, not an injected mode, so treat any de-saturation as a stronger-than-cold signal, not ground truth (see *Seeding conversation context (and its ceiling)* in `../SKILL.md`).
122
+
123
+ **The live ExitPlanMode → hardening-plans hook is not exercised here.** The shipped Claude plugin gates plan hand-off with a `PreToolUse` hook on `ExitPlanMode` (`hooks/exit-plan-mode`) that denies the first plan-exit and steers the agent through `hardening-plans` before the plan is presented. The runner only *simulates* plan mode as injected `<system-reminder>` text and dispatches single agent turns — it neither emits a real `ExitPlanMode` tool call nor runs `PreToolUse` hooks, so that gate is structurally outside what the eval harness can exercise. This is the standing reason a `hardening-plans` invocation-rate delta *from the hook* can't be exhibited in-runner, independent of the #119 invocation-hint gate and the plan-mode-simulation ceiling.
101
124
 
102
125
  Only when the user has opted out of the guard, drop `--guard` from the command above and rely on the post-hoc `detect-stray-writes` step in Step 10 instead — it reports stray writes but does not clean them up.
103
126
 
@@ -22,6 +22,8 @@ Other flags:
22
22
  - `--workspace-dir <path>` (optional) — where iteration artifacts are written. Defaults to `<CWD>/skills-workspace`.
23
23
  - `--harness claude-code` (optional, default `claude-code`; the only supported harness).
24
24
  - `--no-stage`, `--dry-run`, `--iteration <N>`, `--mode <new-skill|revision>`, `--baseline <label>`, `--label <label>` — as before.
25
+ - `--only <id,id,...>` / `--skip <id,id,...>` (optional) — run only, or all-but, the named eval ids from `evals.json`. The two are mutually exclusive, and every named id must exist (the run aborts with the available ids listed otherwise). Use this for a cost-conscious reduced-set run instead of temporarily editing `evals.json` down. The pre-flight summary and the `N evals × 2 conditions` count reflect the filtered set.
26
+ - `--plan-mode` (optional, Claude Code) — inject the harness's verbatim plan-mode procedure as an operating-context layer. When set, the runner reads `profiles/<harness>/plan-mode.md` and emits it (via the session adapter's `renderPlanModeContext`) as a `<system-reminder>` block in every dispatch, after the available-skills block and before the user request. It is identical across the with/without-skill arms and recorded as `plan_mode` in `dispatch.json`. This is issue #142's highest-fidelity in-runner approximation of a real plan mode — still text the agent reads, so a pass is necessary-not-sufficient; see *Seeding conversation context (and its ceiling)* in `../SKILL.md`. Opt-in, and meant only for plan-mode-relevant skills; a harness with no profile aborts the run, leaving the portable dispatch contract unchanged.
25
27
 
26
28
  Staging is written under the current working directory: `<CWD>/.claude/skills/`. A subagent dispatched from that CWD discovers the staged skills there. Run the commands from the directory you want to be the eval root (the repo root for internal use; your skill folder or its parent for personal use).
27
29
 
@@ -84,6 +86,15 @@ bun run evals -- --skill <name> --mode revision --baseline baseline-2026-05-24
84
86
  bun run evals -- --skill <name> --mode new-skill --dry-run
85
87
  ```
86
88
 
89
+ ### Reduced-set run (cost-conscious subset)
90
+
91
+ ```bash
92
+ # Run just two of the defined evals, leaving evals.json untouched.
93
+ bun run evals -- --skill <name> --mode new-skill --only case-a,case-b
94
+ # Or run everything except a slow case.
95
+ bun run evals -- --skill <name> --mode new-skill --skip slow-case
96
+ ```
97
+
87
98
  ## Quickstart (running an eval on your own skill)
88
99
 
89
100
  If you have the slow-powers plugin installed and a personal skill, you do **not** run the npm scripts. The skill's `SKILL.md` routes you to `../harness-details/<harness>.md`, which gives the full command sequence (resolving the installed runner path, invoking `run.ts` directly with `--skill-dir`/`--skill`, dispatching subagents, grading). On Claude Code, see `../harness-details/claude.md`.
@@ -104,12 +115,14 @@ If you have the slow-powers plugin installed and a personal skill, you do **not*
104
115
 
105
116
  A subagent that runs an eval should start in an environment that mirrors a real install of the plugin under evaluation. Otherwise the result depends on the operator's local install state (whether they happen to have the plugin loaded into their parent session, which version, etc.) rather than the skill being measured. The runner produces this parity explicitly so results reproduce on a clean checkout or in CI.
106
117
 
118
+ **Caveat — parity is only as clean as the operator's session.** Staging controls what the runner *adds* (the skills below), not what the operator's session already *loaded*. Subagents are dispatched in-process and share the parent session's plugins, so if that session has the plugin-under-evaluation — or any plugin exposing a same-named skill — enabled, the subagent discovers that copy too. That is exactly the "operator's local install state" dependency this section warns against, and the unique staging slug does not prevent it (it stops an on-disk collision, not runtime discovery). The runner can't unload a live plugin; on Claude Code it emits a build-time *plugin-shadow* warning (also surfaced in `benchmark.json`'s `validity_warnings`) so the contamination is visible. Closing it is a launch-time step: run the eval from a plugin-isolated session — see `../harness-details/claude.md` → *Isolating from installed plugins*.
119
+
107
120
  Parity has two parts, both applied when `--no-stage` is NOT set (the default `--harness claude-code`):
108
121
 
109
- 1. **A staged-skills inventory is built into every dispatch prompt.** The runner lists the skills actually staged for the eval — the skill-under-test plus the siblings found in `--skill-dir` — inside the `<session-start-context>` block as a Markdown bullet list. This tells the subagent what is discoverable, independent of any `--bootstrap` file.
122
+ 1. **An available-skills block is built into every dispatch prompt.** The runner lists the skills actually staged for the eval — the skill-under-test plus the siblings found in `--skill-dir` — as its **own block**, rendered the way the harness surfaces discoverable skills to a real session rather than in an eval-specific format. On Claude Code that is `The following skills are available for use with the Skill tool:` followed by `- name: description` bullets. This rendering is **harness-specific** and lives in `adapters/claude-code-session.ts` (a new harness adds its own renderer alongside it). The block is emitted *after*, and separate from, the `<session-start-context>` block — mirroring how a real session delivers the SessionStart hook and the skill list as two distinct surfaces. It tells the subagent what is discoverable, independent of any `--bootstrap` file.
110
123
  2. **Every skill in `--skill-dir` is staged.** The skill-under-test is staged under its unique slug (`<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/`); every *other* skill in `--skill-dir` is copied to `<stageRoot>/.claude/skills/<name>/` at its natural name (excluding each skill's `evals/` subdir). Natural names matter because cross-references inside skill bodies (e.g. "REQUIRED SUB-SKILL: Use `slow-powers:test-driven-development`") only resolve cleanly to natural-name entries.
111
124
 
112
- `--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) ahead of the staged-skills inventory. Internal runs pass `./bootstrap.md`; that file contains its own "Active Skills Directory" list, which overlaps the auto-built inventory. That small duplication is intentional — it avoids maintaining a second bootstrap file in lockstep with the runner.
125
+ `--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) inside the `<session-start-context>` block, ahead of the available-skills block. Internal runs pass `./bootstrap.md`. That file does **not** enumerate skills — the available-skills block is the single source of the skill list, so there is no duplication to keep in lockstep. (A *user-supplied* `--bootstrap` that does enumerate skills is handled defensively by `redactSkillFromBootstrap`, which strips the skill-under-test from the bootstrap prose on the `without_skill` arm so it can't leak into the control condition.)
113
126
 
114
127
  The runner records what it staged in `<stageRoot>/.claude/skills/.slow-powers-eval-manifest.json` so cleanup is reversible. Any pre-existing entry with a colliding name is backed up to a temp directory (recorded in the manifest) before being overwritten, and restored on the next `cleanupStagedSkills()` call. The prefix sweep (`slow-powers-eval-*` entries) still runs first so a crashed prior run is recovered even if the manifest itself was never written.
115
128
 
@@ -122,6 +135,7 @@ For the **`without_skill` / baseline condition** in this realistic environment,
122
135
  - **Codex.** Declares `"skills": "./skills/"` in its `plugin.json`, so the harness scans a directory at start-up. Sibling staging would write to whatever staging path that harness reads from — analogous to `stageSiblingSkills()` but pointed at the right directory. Bootstrap can be prepended to the dispatch prompt the same way.
123
136
  - **OpenCode.** Installed via npm package; the package's own directory is the discoverable surface. Sibling staging would copy into that directory, or — if the harness loads from `node_modules` directly — into a parallel staging path the harness is configured to scan.
124
137
  - **General fallback.** Harnesses without project-local discovery should keep using `--no-stage`; the inline `<skill>` block in the dispatch prompt is the only skill the subagent sees. Bootstrap is omitted in this mode because its references to other skills would mislead the agent.
138
+ - **Plan-mode profiles (`--plan-mode`).** The plan-mode operating-context layer is also a harness-specific surface. The procedure text lives in `profiles/<harness>/plan-mode.md` and is wrapped by a `renderPlanModeContext` in that harness's session adapter (`adapters/<harness>-session.ts`), exactly mirroring how `renderAvailableSkillsBlock` is harness-specific. Only `profiles/claude-code/plan-mode.md` exists today; a harness that wants this fidelity layer adds its own profile file (its native plan/research mode procedure) plus a renderer alongside the Claude ones. A harness with no profile simply has no `--plan-mode`, and the portable dispatch contract is unchanged.
125
139
 
126
140
  The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `harness-parity-check.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.
127
141
 
@@ -0,0 +1,56 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import type { AvailableSkill } from "../types";
3
+ import {
4
+ renderAvailableSkillsBlock,
5
+ renderPlanModeContext,
6
+ } from "./claude-code-session";
7
+
8
/**
 * Builds a minimal `AvailableSkill` fixture for the renderer tests. The path
 * is derived from the name so each fixture points at its own `SKILL.md`.
 */
const skill = (name: string, description: string): AvailableSkill => {
  const path = `/x/${name}/SKILL.md`;
  return { name, path, description };
};
13
+
14
// Contract checks for the Claude Code available-skills renderer: native
// header and bullet format, name ordering, and the empty-list case.
describe("renderAvailableSkillsBlock", () => {
  test("uses the harness-native header and one `- name: description` bullet per skill", () => {
    const rendered = renderAvailableSkillsBlock([skill("foo", "the foo skill")]);

    // The harness-native header plus exactly the bullet for the one skill.
    const expectedFragments = [
      "The following skills are available for use with the Skill tool:",
      "- foo: the foo skill",
    ];
    for (const fragment of expectedFragments) {
      expect(rendered).toContain(fragment);
    }

    // The eval-flavored wording and custom format must be gone.
    const retiredFragments = ["staged and discoverable", "*Trigger:*"];
    for (const fragment of retiredFragments) {
      expect(rendered).not.toContain(fragment);
    }
  });

  test("sorts skills by name", () => {
    // Deliberately passed in reverse alphabetical order.
    const unsorted = [skill("zebra", "z"), skill("alpha", "a")];
    const rendered = renderAvailableSkillsBlock(unsorted);

    const alphaAt = rendered.indexOf("- alpha:");
    const zebraAt = rendered.indexOf("- zebra:");
    expect(alphaAt).toBeLessThan(zebraAt);
  });

  test("returns an empty string for an empty list", () => {
    const rendered = renderAvailableSkillsBlock([]);
    expect(rendered).toBe("");
  });
});
38
+
39
// Contract checks for the plan-mode renderer: system-reminder framing,
// whitespace trimming, and the empty-input case.
describe("renderPlanModeContext", () => {
  test("wraps the profile text in a harness-native system-reminder block", () => {
    const profile = "Plan mode is active. Do not edit.";
    const rendered = renderPlanModeContext(profile);

    // Both tags and the untouched profile text must be present.
    for (const fragment of ["<system-reminder>", "</system-reminder>", profile]) {
      expect(rendered).toContain(fragment);
    }
  });

  test("trims surrounding whitespace from the profile text", () => {
    const padded = "\n\n PROFILE-BODY \n\n";
    const expected = "<system-reminder>\nPROFILE-BODY\n</system-reminder>";
    expect(renderPlanModeContext(padded)).toBe(expected);
  });

  test("returns an empty string for empty or whitespace-only input", () => {
    for (const blank of ["", " \n "]) {
      expect(renderPlanModeContext(blank)).toBe("");
    }
  });
});
@@ -0,0 +1,43 @@
1
+ // Claude Code-specific rendering of session-start context.
2
+ //
3
+ // The available-skills reminder is a *harness-specific* surface: Claude Code
4
+ // presents discoverable skills to an agent as "The following skills are
5
+ // available for use with the Skill tool:" followed by `- name: description`
6
+ // bullets. Other harnesses (Codex, OpenCode) surface their skills differently,
7
+ // so this rendering lives in an adapter rather than inline in the harness-
8
+ // agnostic orchestrator. A new harness adds its own renderer alongside this one
9
+ // (see harness-parity-check.md).
10
+
11
+ import type { AvailableSkill } from "../types";
12
+
13
/**
 * Produce the available-skills reminder in the format a real Claude Code
 * session uses, so an eval dispatch does not announce itself as an eval.
 *
 * @param skills Skills staged for the dispatch. The array is not mutated.
 * @returns The header line, a blank line, then one `- name: description`
 *   bullet per skill ordered by name; or an empty string when `skills` is
 *   empty, so the caller can omit the block entirely.
 */
export function renderAvailableSkillsBlock(skills: AvailableSkill[]): string {
  if (skills.length === 0) {
    return "";
  }

  const header =
    "The following skills are available for use with the Skill tool:";

  // Sort a copy so the caller's array keeps its original order.
  const byName = skills
    .slice()
    .sort((left, right) => left.name.localeCompare(right.name));

  // The header is followed by one blank line, then the bullets.
  let block = `${header}\n`;
  for (const entry of byName) {
    block += `\n- ${entry.name}: ${entry.description}`;
  }
  return block;
}
29
+
30
/**
 * Frame a plan-mode profile as a `<system-reminder>` block. This adapter owns
 * only the harness-native framing; the procedure text itself is supplied by
 * the caller (the Claude Code profile lives in
 * `../profiles/claude-code/plan-mode.md`).
 *
 * @param profileText Raw profile text; surrounding whitespace is discarded.
 * @returns The trimmed text on its own line(s) between the opening and
 *   closing tags, or an empty string when the input is empty or
 *   whitespace-only so the caller can omit the section entirely.
 */
export function renderPlanModeContext(profileText: string): string {
  const body = profileText.trim();
  if (body.length === 0) {
    return "";
  }
  return `<system-reminder>\n${body}\n</system-reminder>`;
}
@@ -185,4 +185,80 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
185
185
  ),
186
186
  ).toBe(true);
187
187
  });
188
+
189
// A plugin-shadow.json left in the iteration directory must show up in
// benchmark.json as a validity warning that names the shadowed skill.
test("surfaces plugin-shadow findings as validity_warnings", () => {
  const skillName = "mr-review";
  const fixtureRoot = join(FIXTURE_ROOT, "agg-shadow");

  // Skill directory containing the single skill under evaluation.
  const skillDir = join(fixtureRoot, "skill-dir");
  const skillHome = join(skillDir, skillName);
  const skillFile = join(skillHome, "SKILL.md");
  mkdirSync(skillHome, { recursive: true });
  writeFileSync(
    skillFile,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );

  // Isolated working directory holding the iteration artifacts.
  const workDir = join(fixtureRoot, "work");
  const iterationDir = join(
    workDir,
    "skills-workspace",
    skillName,
    "iteration-1",
  );
  mkdirSync(iterationDir, { recursive: true });

  writeJson(join(iterationDir, "conditions.json"), {
    mode: "new-skill",
    conditions: [
      { name: "with_skill", skill_path: skillFile },
      { name: "without_skill", skill_path: null },
    ],
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });

  // Both arms get identical grading and timing records for eval "e1".
  const conditionNames = ["with_skill", "without_skill"];
  for (const conditionName of conditionNames) {
    const conditionDir = join(iterationDir, "eval-e1", conditionName);
    mkdirSync(conditionDir, { recursive: true });
    writeJson(join(conditionDir, "grading.json"), {
      assertion_results: [],
      summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
    });
    writeJson(join(conditionDir, "timing.json"), {
      total_tokens: 100,
      duration_ms: 1,
    });
  }

  // The shadow report under test: one plugin entry for the same skill name.
  writeJson(join(iterationDir, "plugin-shadow.json"), {
    config_dir: "/home/u/.claude",
    shadowed: [
      {
        kind: "plugin",
        plugin: "slow-powers@slowdini",
        skill_name: skillName,
        path: "/home/u/.claude/plugins/cache/slowdini/slow-powers/skills/mr-review",
      },
    ],
  });

  const command = [
    "bun",
    "run",
    AGGREGATE_TS,
    "--skill-dir",
    skillDir,
    "--skill",
    skillName,
    "--iteration",
    "1",
  ];
  const outcome = Bun.spawnSync(command, {
    cwd: workDir,
    stdout: "pipe",
    stderr: "pipe",
  });
  expect(outcome.exitCode).toBe(0);

  const benchmarkText = readFileSync(
    join(iterationDir, "benchmark.json"),
    "utf8",
  );
  const benchmark = JSON.parse(benchmarkText) as {
    validity_warnings: string[];
  };
  const mentionsContamination = (warning: string): boolean =>
    warning.includes(skillName) && /contaminat/i.test(warning);
  expect(benchmark.validity_warnings.some(mentionsContamination)).toBe(true);
});
188
264
  });
@@ -2,6 +2,10 @@
2
2
  import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
3
3
  import { join } from "node:path";
4
4
  import { detectRunContext } from "./context";
5
+ import {
6
+ type PluginShadowReport,
7
+ shadowValidityWarnings,
8
+ } from "./plugin-shadow";
5
9
  import type { ConditionsRecord, GradingResult, TimingRecord } from "./types";
6
10
 
7
11
  function die(msg: string): never {
@@ -198,6 +202,22 @@ if (existsSync(strayPath)) {
198
202
  }
199
203
  }
200
204
 
205
// Plugin-shadow findings (from the runner's build-time preflight, Claude Code)
// taint a run the same way a missed invocation does: a staged skill also served
// by an enabled plugin means subagents could discover both copies, so the
// with/without comparison may not reflect the staged skill alone.
//
// The report is optional: when the file is absent nothing is added. When it is
// present but unreadable or malformed, aggregation still proceeds (best-effort),
// but the failure is recorded as a validity warning instead of being dropped
// silently, because an unreadable report means the contamination status of the
// run is unknown rather than clean.
const shadowPath = join(iterationDir, "plugin-shadow.json");
if (existsSync(shadowPath)) {
  try {
    const report = JSON.parse(
      readFileSync(shadowPath, "utf8"),
    ) as PluginShadowReport;
    // Collect first so a failure part-way through never leaves a partial set
    // of shadow warnings alongside the "could not be read" warning.
    const shadowWarnings = [...shadowValidityWarnings(report)];
    for (const w of shadowWarnings) validityWarnings.push(w);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    validityWarnings.push(
      `plugin-shadow.json could not be read (${reason}); plugin contamination status for this run is unknown.`,
    );
  }
}
220
+
201
221
  const benchmark = {
202
222
  generated: new Date().toISOString(),
203
223
  mode: conditions.mode,