@slowdini/slow-powers-opencode 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -8
- package/bootstrap.md +50 -4
- package/package.json +2 -3
- package/skills/evaluating-skills/SKILL.md +5 -3
- package/skills/evaluating-skills/harness-details/claude.md +24 -1
- package/skills/evaluating-skills/runner/README.md +16 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +56 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +43 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +76 -0
- package/skills/evaluating-skills/runner/aggregate.ts +20 -0
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +228 -0
- package/skills/evaluating-skills/runner/plugin-shadow.ts +201 -0
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +11 -0
- package/skills/evaluating-skills/runner/run.test.ts +488 -24
- package/skills/evaluating-skills/runner/run.ts +281 -66
- package/skills/evaluating-skills/runner/types.ts +8 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +3 -7
- package/skills/hardening-plans/evals/baseline/NOTES.md +7 -0
- package/skills/hardening-plans/evals/evals.json +0 -19
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +0 -24
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +0 -24
package/README.md
CHANGED
|
@@ -33,6 +33,26 @@ Slow-powers integrates directly into your agent's session, providing a highly di
|
|
|
33
33
|
Installation differs by harness. If you use more than one, install
|
|
34
34
|
Slow-powers separately for each.
|
|
35
35
|
|
|
36
|
+
### Install with your agent
|
|
37
|
+
|
|
38
|
+
Don't want to look up the steps? Open the harness you want Slow-powers on and
|
|
39
|
+
paste this prompt to its agent — it'll read the guide, work out which harness
|
|
40
|
+
it's in, and do the install for you:
|
|
41
|
+
|
|
42
|
+
```text
|
|
43
|
+
Install the "slow-powers" plugin for the coding-agent harness you are currently
|
|
44
|
+
running in. Read the installation guide at
|
|
45
|
+
https://github.com/slowdini/slow-powers#installation, determine which harness
|
|
46
|
+
this is (Claude Code, Codex CLI, or OpenCode), and follow the matching steps —
|
|
47
|
+
run the documented marketplace/install commands for Claude Code or Codex, or add
|
|
48
|
+
the package to the `plugin` array in opencode.json for OpenCode. Then tell me
|
|
49
|
+
exactly what you changed and what I need to do to finish (e.g. restart the
|
|
50
|
+
session so the skills load).
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
The per-harness instructions below are the source of truth the agent follows —
|
|
54
|
+
and the reference for installing by hand.
|
|
55
|
+
|
|
36
56
|
### Claude Code
|
|
37
57
|
|
|
38
58
|
```
|
|
@@ -65,13 +85,7 @@ Add Slow-powers to the `plugin` array in your `opencode.json` (global or project
|
|
|
65
85
|
}
|
|
66
86
|
```
|
|
67
87
|
|
|
68
|
-
This installs the latest version from npm.
|
|
69
|
-
|
|
70
|
-
```json
|
|
71
|
-
{
|
|
72
|
-
"plugin": ["github:slowdini/slow-powers#main"]
|
|
73
|
-
}
|
|
74
|
-
```
|
|
88
|
+
This installs the latest published version from npm.
|
|
75
89
|
|
|
76
90
|
## The Core Execution Utilities
|
|
77
91
|
|
|
@@ -110,7 +124,7 @@ Flat layout — skills and assets live at root, harness-specific integration liv
|
|
|
110
124
|
- `tests/` — Cross-cutting and harness-specific tests
|
|
111
125
|
- `.claude-plugin/` — Claude Code plugin manifest and hooks
|
|
112
126
|
- `.codex-plugin/` — OpenAI Codex plugin manifest
|
|
113
|
-
- `opencode/` — OpenCode plugin
|
|
127
|
+
- `opencode/` — OpenCode plugin
|
|
114
128
|
- `.claude-plugin/marketplace.json` — Claude Code marketplace registry
|
|
115
129
|
- `package.json` — OpenCode plugin manifest + dev tooling
|
|
116
130
|
- `harness-parity-check.md` — Instructions for an agent in any harness to audit feature gaps and prep to close one
|
package/bootstrap.md
CHANGED
|
@@ -1,12 +1,58 @@
|
|
|
1
1
|
# Instructions for using Slow-powers Skills
|
|
2
2
|
|
|
3
|
+
<EXTREMELY-IMPORTANT>
|
|
4
|
+
If you think there is even a 1% chance a skill might apply to what you are doing, you ABSOLUTELY MUST invoke the skill.
|
|
5
|
+
IF A SKILL APPLIES TO YOUR TASK, YOU DO NOT HAVE A CHOICE. YOU MUST USE IT.
|
|
6
|
+
This is not negotiable. This is not optional. You cannot rationalize your way out of this.
|
|
7
|
+
</EXTREMELY-IMPORTANT>
|
|
8
|
+
|
|
3
9
|
These skills are quality gates on procedures you already run. They don't grant abilities — they enhance how you execute work you already know how to do.
|
|
4
10
|
|
|
5
|
-
When you reach a gate moment — about to code, debug, claim done, finish a branch — the matching skill's description surfaces it. Load it then, even if your procedure already feels complete. That "feels complete" is the gate's target.
|
|
11
|
+
When you reach a gate moment — about to code, hand off a plan, debug, claim done, finish a branch — the matching skill's description surfaces it. Load it then, even if your procedure already feels complete. That "feels complete" is the gate's target.
|
|
6
12
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
13
|
+
## The Rule
|
|
14
|
+
|
|
15
|
+
**Invoke relevant or requested skills BEFORE any response or action.** Even a 1% chance a skill might apply means that you should invoke the skill to check. If an invoked skill turns out to be wrong for the situation, you don't need to use it.
|
|
16
|
+
|
|
17
|
+
```dot
|
|
18
|
+
digraph skill_flow {
|
|
19
|
+
"User message received" [shape=doublecircle];
|
|
20
|
+
"Might any skill apply?" [shape=diamond];
|
|
21
|
+
"Invoke skill mechanism" [shape=box];
|
|
22
|
+
"Announce: 'Using [skill] to [purpose]'" [shape=box];
|
|
23
|
+
"Has checklist?" [shape=diamond];
|
|
24
|
+
"Create todo per item with persistent task tracker" [shape=box];
|
|
25
|
+
"Follow skill exactly" [shape=box];
|
|
26
|
+
"Respond (including clarifications)" [shape=doublecircle];
|
|
27
|
+
|
|
28
|
+
"User message received" -> "Might any skill apply?";
|
|
29
|
+
"Might any skill apply?" -> "Invoke skill mechanism" [label="yes, even 1%"];
|
|
30
|
+
"Might any skill apply?" -> "Respond (including clarifications)" [label="definitely not"];
|
|
31
|
+
"Invoke skill mechanism" -> "Announce: 'Using [skill] to [purpose]'";
|
|
32
|
+
"Announce: 'Using [skill] to [purpose]'" -> "Has checklist?";
|
|
33
|
+
"Has checklist?" -> "Create todo per item with persistent task tracker" [label="yes"];
|
|
34
|
+
"Has checklist?" -> "Follow skill exactly" [label="no"];
|
|
35
|
+
"Create todo per item with persistent task tracker" -> "Follow skill exactly";
|
|
36
|
+
}
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Red Flags
|
|
40
|
+
|
|
41
|
+
These thoughts mean STOP — you're rationalizing:
|
|
42
|
+
|
|
43
|
+
| Thought | Reality |
|
|
44
|
+
|---------|---------|
|
|
45
|
+
| "This is just a simple question" | Questions are tasks. Check for skills. |
|
|
46
|
+
| "I need more context first" | Skill check comes BEFORE clarifying questions. |
|
|
47
|
+
| "I can check git/files quickly" | Files lack conversation context. Check for skills. |
|
|
48
|
+
| "Let me gather information first" | Skills tell you HOW to gather information. |
|
|
49
|
+
| "This doesn't need a formal skill" | If a skill exists, use it. |
|
|
50
|
+
| "I remember this skill" | Skills evolve. Read current version. |
|
|
51
|
+
| "This doesn't count as a task" | Action = task. Check for skills. |
|
|
52
|
+
| "The skill is overkill" | Simple things become complex. Use it. |
|
|
53
|
+
| "I'll just do this one thing first" | Check BEFORE doing anything. |
|
|
54
|
+
| "This feels productive" | Undisciplined action wastes time. Skills prevent this. |
|
|
55
|
+
| "I know what that means" | Knowing the concept ≠ using the skill. Invoke it. |
|
|
10
56
|
|
|
11
57
|
## Instruction Priority
|
|
12
58
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@slowdini/slow-powers-opencode",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.1.4",
|
|
4
4
|
"description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./opencode/plugins/slow-powers.js",
|
|
@@ -45,11 +45,10 @@
|
|
|
45
45
|
"evals:grade": "bun run skills/evaluating-skills/runner/grade.ts --skill-dir ./skills",
|
|
46
46
|
"evals:aggregate": "bun run skills/evaluating-skills/runner/aggregate.ts --skill-dir ./skills",
|
|
47
47
|
"evals:promote-baseline": "bun run skills/evaluating-skills/runner/promote-baseline.ts --skill-dir ./skills",
|
|
48
|
-
"version": "bun scripts/bump-version.ts",
|
|
49
48
|
"check": "biome check --write .",
|
|
50
49
|
"check:ci": "biome check --error-on-warnings .",
|
|
51
50
|
"typecheck": "tsc --noEmit",
|
|
52
|
-
"
|
|
51
|
+
"prepare": "node .husky/install.mjs",
|
|
53
52
|
"prepublishOnly": "node -e \"if (process.env.CI !== 'true') { console.error('Publishing should be done via CI'); process.exit(1); }\""
|
|
54
53
|
},
|
|
55
54
|
"devDependencies": {
|
|
@@ -45,7 +45,7 @@ The runner takes two required flags:
|
|
|
45
45
|
- `--skill-dir <path>` — a directory containing one or more skill folders. **This directory is the eval's test environment.** Every skill in it is staged for the subagent: the skill-under-test under a unique slug, every *other* skill under its natural name.
|
|
46
46
|
- `--skill <name>` — which subdirectory of `--skill-dir` to evaluate.
|
|
47
47
|
|
|
48
|
-
Optional flags: `--bootstrap <path>` (see *Bootstrap content* below), `--workspace-dir <path>` (defaults to `<cwd>/skills-workspace`), `--mode new-skill|revision`, `--baseline <label>`, `--harness`, `--no-stage`, `--dry-run`, `--guard` (Claude Code only — arm the write guard; see *Sandboxing eval subagents*).
|
|
48
|
+
Optional flags: `--bootstrap <path>` (see *Bootstrap content* below), `--workspace-dir <path>` (defaults to `<cwd>/skills-workspace`), `--mode new-skill|revision`, `--baseline <label>`, `--only <id,...>` / `--skip <id,...>` (run only / all-but the named eval ids — for cost-conscious reduced-set runs without editing `evals.json`; mutually exclusive, errors on an unknown id), `--harness`, `--no-stage`, `--dry-run`, `--guard` (Claude Code only — arm the write guard; see *Sandboxing eval subagents*), `--plan-mode` (Claude Code only — inject the harness's verbatim plan-mode procedure as an operating-context layer; opt-in, for plan-mode-relevant skills only; see *Seeding conversation context (and its ceiling)*).
|
|
49
49
|
|
|
50
50
|
Each iteration lands under `<workspace-dir>/<skill>/iteration-N/` with the same tree described in *Workspace layout* below, plus a machine-readable `dispatch.json` and a human-readable `dispatch-manifest.md`. The end product is `benchmark.json`: read its `run_summary`, `delta`, and `validity_warnings`.
|
|
51
51
|
|
|
@@ -55,7 +55,7 @@ The runner stages every skill it finds under `--skill-dir`. The skill-under-test
|
|
|
55
55
|
|
|
56
56
|
#### Bootstrap content
|
|
57
57
|
|
|
58
|
-
Every dispatch prompt includes
|
|
58
|
+
Every dispatch prompt includes an available-skills block listing the skills staged for this eval (auto-built by the runner), rendered in the harness's native presentation so the dispatch reads like a real session rather than an eval. If you also want product-specific framing prepended — instruction priority rules, planning guidelines, anything you'd put in a SessionStart hook — author a Markdown file and pass it via `--bootstrap <path>`. The runner emits the file verbatim inside a `<session-start-context>` block, before the available-skills block. Omit `--bootstrap` and the dispatch carries only the available-skills block, nothing else.
|
|
59
59
|
|
|
60
60
|
## Designing test cases
|
|
61
61
|
|
|
@@ -113,6 +113,8 @@ Keep the seeded turns short and concrete; the point is to establish momentum, no
|
|
|
113
113
|
|
|
114
114
|
**The ceiling — state it plainly.** A seed is *text the subagent reads*, not a state it operates under. It cannot place the agent in a harness-injected mode — a real plan mode, an enforced multi-phase workflow, genuine context-window pressure — it can only *describe* one. So when the wild failure you're chasing was *caused* by such a mode (the documented case: an agent in plan mode that invoked **zero** skills because the mode's own procedure made loading them feel redundant), a text seed cannot fully reproduce it — the causal layer is exactly the one a prompt string can't inject. A seeded **pass is therefore necessary but not sufficient** — it under-estimates real-session difficulty — and a seed that *fails* to reproduce a known wild failure is usually hitting this ceiling, not testing a bad seed. Treat seeded results as a stronger-than-cold signal, not as ground truth, and don't let downstream work over-trust them. Faithfully reproducing a mode-caused failure needs a real harness mode the runner can't inject today — track that as a parity goal.
|
|
115
115
|
|
|
116
|
+
**Narrowing the gap — `--plan-mode`.** For the documented plan-mode case, the runner offers the highest-fidelity in-runner approximation: `--plan-mode` injects the harness's *verbatim* plan-mode procedure (its rigid multi-phase terminal rail) into every dispatch as an operating-context layer the subagent is told it is operating under — a `<system-reminder>` block after the session-start surfaces — rather than a paraphrase the agent merely reads in the seed prose. The profile is a per-harness asset (`runner/profiles/<harness>/plan-mode.md`); it is opt-in and meant only for plan-mode-relevant skills (a harness without a profile errors, leaving the portable contract unchanged). This narrows the gap (verbatim procedure > paraphrase) but does **not** close it: it is still text the agent reads, not an injected mode, so the necessary-not-sufficient ceiling above stands unchanged. Use it as the strongest in-runner signal and pair it with a paraphrase-seed arm to measure whether removing the invoke-hint lets `with_skill` invocation de-saturate.
|
|
117
|
+
|
|
116
118
|
## Pre-flight gate (required)
|
|
117
119
|
|
|
118
120
|
An eval run is not free. Each test case dispatches a fresh subagent **per condition** — an N-case suite is `2N` full agent sessions, plus a judge dispatch for every `llm_judge` assertion. That is real wall-clock time and real tokens, and a subagent under test can write outside its sandbox and pollute the real workspace. **Never kick off a run silently.**
|
|
@@ -275,7 +277,7 @@ The check has two tiers, chosen automatically per run:
|
|
|
275
277
|
- **Code-based (Claude Code).** On harnesses that persist subagent transcripts with discrete `Skill` tool calls, the framework parses the transcript and checks for a `Skill` invocation whose `input.skill` matches the eval-staged slug. This is deterministic, free, and cannot be fooled by superficial vocabulary in the response.
|
|
276
278
|
- **LLM-judge fallback (other harnesses).** Where transcripts aren't available or the harness injects skills via system-prompt hooks rather than a tool call (Codex, OpenCode), a judge subagent compares the agent's `final_message` against the SKILL.md content embedded in the run record, looking for behavioral fingerprints — distinctive vocabulary, named sections, procedural steps that mirror the skill's phrasing. It does **not** require the agent to explicitly cite the skill (that would taint the eval).
|
|
277
279
|
|
|
278
|
-
To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used.
|
|
280
|
+
To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The slug prevents an on-disk *collision*, not runtime *discovery*: if the same skill is also provided by an installed, **enabled** plugin, the subagent can still discover and invoke that copy — contaminating both arms (the control arm is no longer skill-absent). On Claude Code the runner flags this at build time (a "plugin-shadow" warning, also surfaced in `benchmark.json`'s `validity_warnings`), but cannot unload a live plugin; to remove the installed copy, run the eval from a plugin-isolated session — see `harness-details/claude.md` → *Isolating from installed plugins*. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used.
|
|
279
281
|
|
|
280
282
|
The aggregator emits a `validity_warnings` array when any with-skill condition has an invocation rate below 100%. Read those before interpreting the substantive delta. The rate is computed only over evals where the skill *should* fire; negative evals (`skill_should_trigger: false`) are excluded so a correct non-trigger never depresses the rate or raises a spurious warning.
|
|
281
283
|
|
|
@@ -4,6 +4,25 @@ This is the Claude Code-specific walkthrough for `evaluating-skills`. The runner
|
|
|
4
4
|
|
|
5
5
|
Use this when a user, working from their own skill folder, asks to run an eval (e.g. "run an eval on this skill to check if a change reduces token usage").
|
|
6
6
|
|
|
7
|
+
## Isolating from installed plugins
|
|
8
|
+
|
|
9
|
+
**Read this first if the skill you're evaluating shares a name with one an installed, enabled plugin provides** — e.g. evaluating a slow-powers skill with the slow-powers plugin installed, or any user evaluating their own plugin's skills.
|
|
10
|
+
|
|
11
|
+
Eval subagents are dispatched via the **Task tool**, so they run in-process and inherit *this session's* enabled plugins and global skills. The runner stages the skill-under-test under a unique slug (`slow-powers-eval-…`) — that avoids an on-disk collision and lets the `__skill_invoked` meta-check find the staged copy — but it does **not** stop the installed plugin's own `<plugin>:<name>` copy from also being discoverable. When both copies are reachable:
|
|
12
|
+
|
|
13
|
+
- the with-skill arm can invoke the staged slug *and then* reach for the installed copy (redundant/leaked invocation), and
|
|
14
|
+
- the `without_skill` arm is **not truly skill-absent** — the installed copy is still discoverable, contaminating the baseline and shrinking the measured delta.
|
|
15
|
+
|
|
16
|
+
Plugins load at **session start** and the runner can't unload them mid-session, so it only *detects and warns* (a build-time "plugin-shadow" banner, also surfaced in `benchmark.json`'s `validity_warnings`). To actually isolate, **launch the session you run the eval from** one of these ways — subagents inherit it:
|
|
17
|
+
|
|
18
|
+
1. **Drop user-scope plugins, keep auth:** `claude --setting-sources project,local`. User-scope `enabledPlugins` (where user-installed plugins are enabled) isn't loaded, so those plugins don't appear in the session. Auth is unaffected. (Also drops your other user-scope settings/MCP for that session.)
|
|
19
|
+
2. **Disable the specific plugin, then restart:** set `"enabledPlugins": { "<plugin>@<marketplace>": false }` in a settings source that loads at startup (project `.claude/settings.json` or user `~/.claude/settings.json`) and start a fresh session. *(The slow-powers repo ships this for `slow-powers@slowdini` and `superpowers@claude-plugins-official` in its own `.claude/settings.json`.)*
|
|
20
|
+
3. **Clean config dir (strips everything):** `CLAUDE_CONFIG_DIR="$(mktemp -d)" claude`. No installed plugins or global skills load at all. **Auth caveat:** your OAuth session lives in `~/.claude.json`, which a relocated config dir may not carry — set `ANTHROPIC_API_KEY` or re-authenticate once in the fresh dir.
|
|
21
|
+
|
|
22
|
+
All three keep the eval working: project-local staged skills live in `<cwd>/.claude/skills/` (project scope, independent of installed plugins), so they still load and the meta-check still resolves the slug. A clean config dir (option 3) additionally means the real SessionStart bootstrap hook doesn't fire, so the only session-start framing present is whatever you pass via `--bootstrap` — which removes the separate "even a 1% chance → you MUST invoke" mandate that otherwise pins invocation at 100%.
|
|
23
|
+
|
|
24
|
+
**Verify before you run:** the installed twin should be gone — `/plugin` shows it disabled, or the runner's build step prints no plugin-shadow banner.
|
|
25
|
+
|
|
7
26
|
## Step 1 — Resolve the bundled runner
|
|
8
27
|
|
|
9
28
|
The runner ships inside the installed slow-powers plugin. Resolve its path once per session and reuse it. Use `find` rather than a shell glob so the command behaves the same under bash and zsh (a bare glob with no match errors under zsh):
|
|
@@ -97,7 +116,11 @@ bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" snapshot --skill-dir <skill-dir> --ski
|
|
|
97
116
|
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" --skill-dir <skill-dir> --skill <name> --mode revision --baseline baseline --guard
|
|
98
117
|
```
|
|
99
118
|
|
|
100
|
-
Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built
|
|
119
|
+
Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built available-skills block (rendered the way Claude Code surfaces discoverable skills, so the dispatch reads like a real session).
|
|
120
|
+
|
|
121
|
+
For a **plan-mode-relevant skill** (e.g. `hardening-plans`), add `--plan-mode` to inject Claude Code's verbatim plan-mode procedure as a `<system-reminder>` operating-context layer in every dispatch — the highest-fidelity in-runner approximation of a real plan mode (issue #142). Use it as the verbatim-procedure arm of an A/B against a plain paraphrase-seed run (no flag) to measure whether `with_skill` invocation de-saturates. It is still text the agent reads, not an injected mode, so treat any de-saturation as a stronger-than-cold signal, not ground truth (see *Seeding conversation context (and its ceiling)* in `../SKILL.md`).
|
|
122
|
+
|
|
123
|
+
**The live ExitPlanMode → hardening-plans hook is not exercised here.** The shipped Claude plugin gates plan hand-off with a `PreToolUse` hook on `ExitPlanMode` (`hooks/exit-plan-mode`) that denies the first plan-exit and steers the agent through `hardening-plans` before the plan is presented. The runner only *simulates* plan mode as injected `<system-reminder>` text and dispatches single agent turns — it neither emits a real `ExitPlanMode` tool call nor runs `PreToolUse` hooks, so that gate is structurally outside what the eval harness can exercise. This is the standing reason a `hardening-plans` invocation-rate delta *from the hook* can't be exhibited in-runner, independent of the #119 invocation-hint gate and the plan-mode-simulation ceiling.
|
|
101
124
|
|
|
102
125
|
Only when the user has opted out of the guard, drop `--guard` from the command above and rely on the post-hoc `detect-stray-writes` step in Step 10 instead — it reports stray writes but does not clean them up.
|
|
103
126
|
|
|
@@ -22,6 +22,8 @@ Other flags:
|
|
|
22
22
|
- `--workspace-dir <path>` (optional) — where iteration artifacts are written. Defaults to `<CWD>/skills-workspace`.
|
|
23
23
|
- `--harness claude-code` (optional, default `claude-code`; the only supported harness).
|
|
24
24
|
- `--no-stage`, `--dry-run`, `--iteration <N>`, `--mode <new-skill|revision>`, `--baseline <label>`, `--label <label>` — as before.
|
|
25
|
+
- `--only <id,id,...>` / `--skip <id,id,...>` (optional) — run only, or all-but, the named eval ids from `evals.json`. The two are mutually exclusive, and every named id must exist (the run aborts with the available ids listed otherwise). Use this for a cost-conscious reduced-set run instead of temporarily editing `evals.json` down. The pre-flight summary and the `N evals × 2 conditions` count reflect the filtered set.
|
|
26
|
+
- `--plan-mode` (optional, Claude Code) — inject the harness's verbatim plan-mode procedure as an operating-context layer. When set, the runner reads `profiles/<harness>/plan-mode.md` and emits it (via the session adapter's `renderPlanModeContext`) as a `<system-reminder>` block in every dispatch, after the available-skills block and before the user request. It is identical across the with/without-skill arms and recorded as `plan_mode` in `dispatch.json`. This is issue #142's highest-fidelity in-runner approximation of a real plan mode — still text the agent reads, so a pass is necessary-not-sufficient; see *Seeding conversation context (and its ceiling)* in `../SKILL.md`. Opt-in, and meant only for plan-mode-relevant skills; a harness with no profile aborts the run, leaving the portable dispatch contract unchanged.
|
|
25
27
|
|
|
26
28
|
Staging is written under the current working directory: `<CWD>/.claude/skills/`. A subagent dispatched from that CWD discovers the staged skills there. Run the commands from the directory you want to be the eval root (the repo root for internal use; your skill folder or its parent for personal use).
|
|
27
29
|
|
|
@@ -84,6 +86,15 @@ bun run evals -- --skill <name> --mode revision --baseline baseline-2026-05-24
|
|
|
84
86
|
bun run evals -- --skill <name> --mode new-skill --dry-run
|
|
85
87
|
```
|
|
86
88
|
|
|
89
|
+
### Reduced-set run (cost-conscious subset)
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Run just two of the defined evals, leaving evals.json untouched.
|
|
93
|
+
bun run evals -- --skill <name> --mode new-skill --only case-a,case-b
|
|
94
|
+
# Or run everything except a slow case.
|
|
95
|
+
bun run evals -- --skill <name> --mode new-skill --skip slow-case
|
|
96
|
+
```
|
|
97
|
+
|
|
87
98
|
## Quickstart (running an eval on your own skill)
|
|
88
99
|
|
|
89
100
|
If you have the slow-powers plugin installed and a personal skill, you do **not** run the npm scripts. The skill's `SKILL.md` routes you to `../harness-details/<harness>.md`, which gives the full command sequence (resolving the installed runner path, invoking `run.ts` directly with `--skill-dir`/`--skill`, dispatching subagents, grading). On Claude Code, see `../harness-details/claude.md`.
|
|
@@ -104,12 +115,14 @@ If you have the slow-powers plugin installed and a personal skill, you do **not*
|
|
|
104
115
|
|
|
105
116
|
A subagent that runs an eval should start in an environment that mirrors a real install of the plugin under evaluation. Otherwise the result depends on the operator's local install state (whether they happen to have the plugin loaded into their parent session, which version, etc.) rather than the skill being measured. The runner produces this parity explicitly so results reproduce on a clean checkout or in CI.
|
|
106
117
|
|
|
118
|
+
**Caveat — parity is only as clean as the operator's session.** Staging controls what the runner *adds* (the skills below), not what the operator's session already *loaded*. Subagents are dispatched in-process and share the parent session's plugins, so if that session has the plugin-under-evaluation — or any plugin exposing a same-named skill — enabled, the subagent discovers that copy too. That is exactly the "operator's local install state" dependency this section warns against, and the unique staging slug does not prevent it (it stops an on-disk collision, not runtime discovery). The runner can't unload a live plugin; on Claude Code it emits a build-time *plugin-shadow* warning (also surfaced in `benchmark.json`'s `validity_warnings`) so the contamination is visible. Closing it is a launch-time step: run the eval from a plugin-isolated session — see `../harness-details/claude.md` → *Isolating from installed plugins*.
|
|
119
|
+
|
|
107
120
|
Parity has two parts, both applied when `--no-stage` is NOT set (the default `--harness claude-code`):
|
|
108
121
|
|
|
109
|
-
1. **
|
|
122
|
+
1. **An available-skills block is built into every dispatch prompt.** The runner lists the skills actually staged for the eval — the skill-under-test plus the siblings found in `--skill-dir` — as its **own block**, rendered the way the harness surfaces discoverable skills to a real session rather than in an eval-specific format. On Claude Code that is `The following skills are available for use with the Skill tool:` followed by `- name: description` bullets. This rendering is **harness-specific** and lives in `adapters/claude-code-session.ts` (a new harness adds its own renderer alongside it). The block is emitted *after*, and separate from, the `<session-start-context>` block — mirroring how a real session delivers the SessionStart hook and the skill list as two distinct surfaces. It tells the subagent what is discoverable, independent of any `--bootstrap` file.
|
|
110
123
|
2. **Every skill in `--skill-dir` is staged.** The skill-under-test is staged under its unique slug (`<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/`); every *other* skill in `--skill-dir` is copied to `<stageRoot>/.claude/skills/<name>/` at its natural name (excluding each skill's `evals/` subdir). Natural names matter because cross-references inside skill bodies (e.g. "REQUIRED SUB-SKILL: Use `slow-powers:test-driven-development`") only resolve cleanly to natural-name entries.
|
|
111
124
|
|
|
112
|
-
`--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) ahead of the
|
|
125
|
+
`--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) inside the `<session-start-context>` block, ahead of the available-skills block. Internal runs pass `./bootstrap.md`. That file does **not** enumerate skills — the available-skills block is the single source of the skill list, so there is no duplication to keep in lockstep. (A *user-supplied* `--bootstrap` that does enumerate skills is handled defensively by `redactSkillFromBootstrap`, which strips the skill-under-test from the bootstrap prose on the `without_skill` arm so it can't leak into the control condition.)
|
|
113
126
|
|
|
114
127
|
The runner records what it staged in `<stageRoot>/.claude/skills/.slow-powers-eval-manifest.json` so cleanup is reversible. Any pre-existing entry with a colliding name is backed up to a temp directory (recorded in the manifest) before being overwritten, and restored on the next `cleanupStagedSkills()` call. The prefix sweep (`slow-powers-eval-*` entries) still runs first so a crashed prior run is recovered even if the manifest itself was never written.
|
|
115
128
|
|
|
@@ -122,6 +135,7 @@ For the **`without_skill` / baseline condition** in this realistic environment,
|
|
|
122
135
|
- **Codex.** Declares `"skills": "./skills/"` in its `plugin.json`, so the harness scans a directory at start-up. Sibling staging would write to whatever staging path that harness reads from — analogous to `stageSiblingSkills()` but pointed at the right directory. Bootstrap can be prepended to the dispatch prompt the same way.
|
|
123
136
|
- **OpenCode.** Installed via npm package; the package's own directory is the discoverable surface. Sibling staging would copy into that directory, or — if the harness loads from `node_modules` directly — into a parallel staging path the harness is configured to scan.
|
|
124
137
|
- **General fallback.** Harnesses without project-local discovery should keep using `--no-stage`; the inline `<skill>` block in the dispatch prompt is the only skill the subagent sees. Bootstrap is omitted in this mode because its references to other skills would mislead the agent.
|
|
138
|
+
- **Plan-mode profiles (`--plan-mode`).** The plan-mode operating-context layer is also a harness-specific surface. The procedure text lives in `profiles/<harness>/plan-mode.md` and is wrapped by a `renderPlanModeContext` in that harness's session adapter (`adapters/<harness>-session.ts`), exactly mirroring how `renderAvailableSkillsBlock` is harness-specific. Only `profiles/claude-code/plan-mode.md` exists today; a harness that wants this fidelity layer adds its own profile file (its native plan/research mode procedure) plus a renderer alongside the Claude ones. A harness with no profile simply has no `--plan-mode`, and the portable dispatch contract is unchanged.
|
|
125
139
|
|
|
126
140
|
The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `harness-parity-check.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.
|
|
127
141
|
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import type { AvailableSkill } from "../types";
|
|
3
|
+
import {
|
|
4
|
+
renderAvailableSkillsBlock,
|
|
5
|
+
renderPlanModeContext,
|
|
6
|
+
} from "./claude-code-session";
|
|
7
|
+
|
|
8
|
+
/**
 * Build a minimal `AvailableSkill` fixture for the renderer tests.
 * The path is synthetic (`/x/<name>/SKILL.md`); only `name` and
 * `description` are asserted on by the tests in this file.
 */
const skill = (name: string, description: string): AvailableSkill => {
  const path = `/x/${name}/SKILL.md`;
  return { name, path, description };
};
|
|
13
|
+
|
|
14
|
+
// Covers the Claude Code rendering of the available-skills block:
// header wording, bullet format, name ordering, and the empty-list case.
describe("renderAvailableSkillsBlock", () => {
  test("uses the harness-native header and one `- name: description` bullet per skill", () => {
    const rendered = renderAvailableSkillsBlock([skill("foo", "the foo skill")]);

    const expectedFragments = [
      "The following skills are available for use with the Skill tool:",
      "- foo: the foo skill",
    ];
    for (const fragment of expectedFragments) {
      expect(rendered).toContain(fragment);
    }

    // The eval-flavored wording and custom format must be gone.
    const forbiddenFragments = ["staged and discoverable", "*Trigger:*"];
    for (const fragment of forbiddenFragments) {
      expect(rendered).not.toContain(fragment);
    }
  });

  test("sorts skills by name", () => {
    // Input is deliberately out of order; the renderer is expected to sort it.
    const unsorted = [skill("zebra", "z"), skill("alpha", "a")];
    const rendered = renderAvailableSkillsBlock(unsorted);
    const alphaAt = rendered.indexOf("- alpha:");
    const zebraAt = rendered.indexOf("- zebra:");
    expect(alphaAt).toBeLessThan(zebraAt);
  });

  test("returns an empty string for an empty list", () => {
    const rendered = renderAvailableSkillsBlock([]);
    expect(rendered).toBe("");
  });
});
|
|
38
|
+
|
|
39
|
+
// Covers the plan-mode wrapper: <system-reminder> framing, whitespace
// trimming of the profile text, and the empty-input case.
describe("renderPlanModeContext", () => {
  test("wraps the profile text in a harness-native system-reminder block", () => {
    const profile = "Plan mode is active. Do not edit.";
    const rendered = renderPlanModeContext(profile);
    for (const fragment of ["<system-reminder>", "</system-reminder>", profile]) {
      expect(rendered).toContain(fragment);
    }
  });

  test("trims surrounding whitespace from the profile text", () => {
    const rendered = renderPlanModeContext("\n\n PROFILE-BODY \n\n");
    const expected = "<system-reminder>\nPROFILE-BODY\n</system-reminder>";
    expect(rendered).toBe(expected);
  });

  test("returns an empty string for empty or whitespace-only input", () => {
    for (const blank of ["", " \n "]) {
      expect(renderPlanModeContext(blank)).toBe("");
    }
  });
});
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// Claude Code-specific rendering of session-start context.
|
|
2
|
+
//
|
|
3
|
+
// The available-skills reminder is a *harness-specific* surface: Claude Code
|
|
4
|
+
// presents discoverable skills to an agent as "The following skills are
|
|
5
|
+
// available for use with the Skill tool:" followed by `- name: description`
|
|
6
|
+
// bullets. Other harnesses (Codex, OpenCode) surface their skills differently,
|
|
7
|
+
// so this rendering lives in an adapter rather than inline in the harness-
|
|
8
|
+
// agnostic orchestrator. A new harness adds its own renderer alongside this one
|
|
9
|
+
// (see harness-parity-check.md).
|
|
10
|
+
|
|
11
|
+
import type { AvailableSkill } from "../types";
|
|
12
|
+
|
|
13
|
+
/**
 * Produce the available-skills reminder in the shape a real Claude Code
 * session uses: a fixed header line, a blank line, then one
 * `- name: description` bullet per skill, ordered by name.
 *
 * The input array is not mutated; sorting happens on a copy.
 *
 * @param skills - The skills to list.
 * @returns The rendered block, or an empty string when `skills` is empty so
 *   the caller can leave the block out entirely.
 */
export function renderAvailableSkillsBlock(skills: AvailableSkill[]): string {
  if (skills.length === 0) {
    return "";
  }

  const header =
    "The following skills are available for use with the Skill tool:";
  const bullets = skills
    .slice()
    .sort((left, right) => left.name.localeCompare(right.name))
    .map(({ name, description }) => `- ${name}: ${description}`);

  // Header, one blank separator line, then the bullets.
  return `${header}\n\n${bullets.join("\n")}`;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Wrap a plan-mode profile in Claude Code's `<system-reminder>` framing, so
 * the procedure is presented as an operating mode rather than plain prose.
 * This adapter owns only the framing; the procedure text itself is supplied
 * by the caller (it lives in `../profiles/claude-code/plan-mode.md`).
 *
 * @param profileText - The plan-mode procedure text; surrounding whitespace
 *   is trimmed before wrapping.
 * @returns The wrapped block, or an empty string when the trimmed text is
 *   empty so the caller can omit the section.
 */
export function renderPlanModeContext(profileText: string): string {
  const body = profileText.trim();
  return body.length === 0
    ? ""
    : `<system-reminder>\n${body}\n</system-reminder>`;
}
|
|
@@ -185,4 +185,80 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
|
|
|
185
185
|
),
|
|
186
186
|
).toBe(true);
|
|
187
187
|
});
|
|
188
|
+
|
|
189
|
+
test("surfaces plugin-shadow findings as validity_warnings", () => {
  // Fixture: a skill dir holding a single `mr-review` skill.
  const root = join(FIXTURE_ROOT, "agg-shadow");
  const skillDir = join(root, "skill-dir");
  const skillSub = join(skillDir, "mr-review");
  const skillFile = join(skillSub, "SKILL.md");
  mkdirSync(skillSub, { recursive: true });
  writeFileSync(
    skillFile,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );

  // Isolated working directory holding one iteration's artifacts.
  const cwd = join(root, "work");
  const iterationDir = join(cwd, "skills-workspace", "mr-review", "iteration-1");
  mkdirSync(iterationDir, { recursive: true });

  const conditionsRecord = {
    mode: "new-skill",
    conditions: [
      { name: "with_skill", skill_path: skillFile },
      { name: "without_skill", skill_path: null },
    ],
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  };
  writeJson(join(iterationDir, "conditions.json"), conditionsRecord);

  // Both arms get identical grading and timing records.
  for (const cond of ["with_skill", "without_skill"]) {
    const condDir = join(iterationDir, "eval-e1", cond);
    mkdirSync(condDir, { recursive: true });
    writeJson(join(condDir, "grading.json"), {
      assertion_results: [],
      summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
    });
    writeJson(join(condDir, "timing.json"), {
      total_tokens: 100,
      duration_ms: 1,
    });
  }

  // The report under test: an enabled plugin also serves `mr-review`.
  const shadowReport = {
    config_dir: "/home/u/.claude",
    shadowed: [
      {
        kind: "plugin",
        plugin: "slow-powers@slowdini",
        skill_name: "mr-review",
        path: "/home/u/.claude/plugins/cache/slowdini/slow-powers/skills/mr-review",
      },
    ],
  };
  writeJson(join(iterationDir, "plugin-shadow.json"), shadowReport);

  const argv = [
    "bun",
    "run",
    AGGREGATE_TS,
    "--skill-dir",
    skillDir,
    "--skill",
    "mr-review",
    "--iteration",
    "1",
  ];
  const res = Bun.spawnSync(argv, { cwd, stdout: "pipe", stderr: "pipe" });
  expect(res.exitCode).toBe(0);

  const benchmarkRaw = readFileSync(join(iterationDir, "benchmark.json"), "utf8");
  const benchmark = JSON.parse(benchmarkRaw) as { validity_warnings: string[] };
  const mentionsContamination = benchmark.validity_warnings.some(
    (w) => w.includes("mr-review") && /contaminat/i.test(w),
  );
  expect(mentionsContamination).toBe(true);
});
|
|
188
264
|
});
|
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
3
|
import { join } from "node:path";
|
|
4
4
|
import { detectRunContext } from "./context";
|
|
5
|
+
import {
|
|
6
|
+
type PluginShadowReport,
|
|
7
|
+
shadowValidityWarnings,
|
|
8
|
+
} from "./plugin-shadow";
|
|
5
9
|
import type { ConditionsRecord, GradingResult, TimingRecord } from "./types";
|
|
6
10
|
|
|
7
11
|
function die(msg: string): never {
|
|
@@ -198,6 +202,22 @@ if (existsSync(strayPath)) {
|
|
|
198
202
|
}
|
|
199
203
|
}
|
|
200
204
|
|
|
205
|
+
// Plugin-shadow findings (from the runner's build-time preflight, Claude Code)
// taint a run the same way a missed invocation does: a staged skill also served
// by an enabled plugin means subagents could discover both copies, so the
// with/without comparison may not reflect the staged skill alone.
const shadowPath = join(iterationDir, "plugin-shadow.json");
if (existsSync(shadowPath)) {
  try {
    const report = JSON.parse(
      readFileSync(shadowPath, "utf8"),
    ) as PluginShadowReport;
    validityWarnings.push(...shadowValidityWarnings(report));
  } catch (err: unknown) {
    // A malformed or unreadable report must not fail aggregation, but dropping
    // it silently would let a possibly contaminated run look clean. Record
    // that the shadow status could not be determined instead.
    const reason = err instanceof Error ? err.message : String(err);
    validityWarnings.push(
      `plugin-shadow.json could not be read (${reason}); plugin contamination status is unknown for this run.`,
    );
  }
}
|
|
220
|
+
|
|
201
221
|
const benchmark = {
|
|
202
222
|
generated: new Date().toISOString(),
|
|
203
223
|
mode: conditions.mode,
|