@slowdini/slow-powers-opencode 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -8
- package/package.json +5 -1
- package/skills/evaluating-skills/SKILL.md +19 -17
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
package/README.md
CHANGED
|
@@ -12,17 +12,27 @@ and clarifying skill content.
|
|
|
12
12
|
|
|
13
13
|
## Quickstart
|
|
14
14
|
|
|
15
|
-
Give your agent superpowers with slow-powers: [Claude Code](#claude-code) · [Codex CLI](#codex-cli) · [OpenCode](#opencode). Support varies per harness — see the [feature support](#feature-support) table.
|
|
15
|
+
Give your agent superpowers with slow-powers: [Claude Code](#claude-code) · [Codex CLI](#codex-cli) · [OpenCode](#opencode). Support varies per harness — see the [feature support](#feature-support) tables.
|
|
16
16
|
|
|
17
17
|
## Feature support
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|-----------------|----------|----------------------------------------------------------------|
|
|
21
|
-
| Claude Code | Full | Reference implementation |
|
|
22
|
-
| Codex CLI | Partial | Plugin manifest + shared hooks; no eval transcript adapter |
|
|
23
|
-
| OpenCode | Partial | JS plugin with bootstrap injection; no eval transcript adapter |
|
|
19
|
+
Parity is tracked on two independent surfaces. **Plugin distribution** is how Slow-powers reaches a user's session — manifests, bootstrap injection, skill discovery, hooks:
|
|
24
20
|
|
|
25
|
-
|
|
21
|
+
| Harness | Status | Notes |
|
|
22
|
+
|-----------------|----------|-----------------------------------------------------------------------------------|
|
|
23
|
+
| Claude Code | Full | Reference implementation |
|
|
24
|
+
| Codex CLI | Full | Plugin manifest + shared `hooks/hooks.json`; the plan hand-off hook is Claude-native (N/A here, see #141) |
|
|
25
|
+
| OpenCode | Full | JS plugin (npm package) injects bootstrap and registers skills via the native plugin API |
|
|
26
|
+
|
|
27
|
+
The **skill-eval runner** (developer tooling under `skills/evaluating-skills/`, slated to move into its own project) is tracked separately:
|
|
28
|
+
|
|
29
|
+
| Harness | Status | Notes |
|
|
30
|
+
|-----------------|----------|-----------------------------------------------------------------------------------|
|
|
31
|
+
| Claude Code | Full | Reference implementation: transcript adapter, auto-record, `--guard`, `--plan-mode` |
|
|
32
|
+
| Codex CLI | Manual | No transcript adapter — hand-authored run records; `llm_judge` assertions carry the measurement |
|
|
33
|
+
| OpenCode | Manual | No transcript adapter — hand-authored run records; `llm_judge` assertions carry the measurement |
|
|
34
|
+
|
|
35
|
+
Contributors closing parity gaps should follow [`harness-parity-check.md`](./harness-parity-check.md) for distribution gaps, or [`skills/evaluating-skills/harness-parity.md`](./skills/evaluating-skills/harness-parity.md) for eval-runner gaps: each audits which features are wired up for a given harness and preps an agent to close one gap.
|
|
26
36
|
|
|
27
37
|
## How it works
|
|
28
38
|
|
|
@@ -136,7 +146,7 @@ Flat layout — skills and assets live at root, harness-specific integration liv
|
|
|
136
146
|
- `opencode/` — OpenCode plugin
|
|
137
147
|
- `.claude-plugin/marketplace.json` — Claude Code marketplace registry
|
|
138
148
|
- `package.json` — OpenCode plugin manifest + dev tooling
|
|
139
|
-
- `harness-parity-check.md` — Instructions for an agent in any harness to audit
|
|
149
|
+
- `harness-parity-check.md` — Instructions for an agent in any harness to audit plugin-distribution gaps and prep to close one (the eval runner's counterpart lives at `skills/evaluating-skills/harness-parity.md`)
|
|
140
150
|
|
|
141
151
|
## Releasing
|
|
142
152
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@slowdini/slow-powers-opencode",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./opencode/plugins/slow-powers.js",
|
|
@@ -39,9 +39,13 @@
|
|
|
39
39
|
"evals": "bun run skills/evaluating-skills/runner/run.ts --skill-dir ./skills --bootstrap ./bootstrap.md",
|
|
40
40
|
"evals:snapshot": "bun run skills/evaluating-skills/runner/run.ts snapshot --skill-dir ./skills",
|
|
41
41
|
"evals:validate": "bun run skills/evaluating-skills/runner/validate-all.ts --skill-dir ./skills",
|
|
42
|
+
"evals:ingest": "bun run skills/evaluating-skills/runner/run.ts ingest --skill-dir ./skills",
|
|
43
|
+
"evals:finalize": "bun run skills/evaluating-skills/runner/run.ts finalize --skill-dir ./skills",
|
|
44
|
+
"evals:record-runs": "bun run skills/evaluating-skills/runner/record-runs.ts --skill-dir ./skills",
|
|
42
45
|
"evals:fill-transcripts": "bun run skills/evaluating-skills/runner/fill-transcripts.ts --skill-dir ./skills",
|
|
43
46
|
"evals:detect-stray-writes": "bun run skills/evaluating-skills/runner/detect-stray-writes.ts --skill-dir ./skills",
|
|
44
47
|
"evals:teardown-guard": "bun run skills/evaluating-skills/runner/run.ts teardown-guard --skill-dir ./skills",
|
|
48
|
+
"evals:teardown": "bun run skills/evaluating-skills/runner/run.ts teardown --skill-dir ./skills",
|
|
45
49
|
"evals:grade": "bun run skills/evaluating-skills/runner/grade.ts --skill-dir ./skills",
|
|
46
50
|
"evals:aggregate": "bun run skills/evaluating-skills/runner/aggregate.ts --skill-dir ./skills",
|
|
47
51
|
"evals:promote-baseline": "bun run skills/evaluating-skills/runner/promote-baseline.ts --skill-dir ./skills",
|
|
package/skills/evaluating-skills/SKILL.md
CHANGED
|
@@ -21,14 +21,16 @@ Compares `with_skill/` vs `without_skill/`. Use when validating a brand-new skil
|
|
|
21
21
|
|
|
22
22
|
### Mode B — Revision comparison
|
|
23
23
|
|
|
24
|
-
Compares `old_skill/` vs `new_skill/`. **This is the common case.** Use when testing a language change to an existing skill — snapshot the
|
|
24
|
+
Compares `old_skill/` vs `new_skill/`. **This is the common case.** Use when testing a language change to an existing skill — snapshot the old SKILL.md as a baseline, then run both variants against the same prompts.
|
|
25
25
|
|
|
26
|
-
Mode B workflow:
|
|
27
|
-
1.
|
|
28
|
-
2.
|
|
26
|
+
Mode B workflow (edit-first — the usual order):
|
|
27
|
+
1. Edit the skill (the new version is now in the working tree)
|
|
28
|
+
2. Snapshot the old version straight from git: `snapshot --label <tag> --ref HEAD` (any commit/tag/branch works; `--ref` reads git without touching the working tree)
|
|
29
29
|
3. Run the eval with `--mode revision --baseline <snapshot-label>`
|
|
30
30
|
4. Grade and aggregate; review the delta
|
|
31
31
|
|
|
32
|
+
If you snapshot *before* editing, omit `--ref` in step 2 (it reads the working tree) and do it ahead of step 1.
|
|
33
|
+
|
|
32
34
|
A negative or zero delta is a signal to revert the change — the new language did not improve behavior.
|
|
33
35
|
|
|
34
36
|
## Running an eval on a skill
|
|
@@ -51,7 +53,7 @@ Each iteration lands under `<workspace-dir>/<skill>/iteration-N/` with the same
|
|
|
51
53
|
|
|
52
54
|
#### What gets staged
|
|
53
55
|
|
|
54
|
-
The runner stages every skill it finds under `--skill-dir`. The skill-under-test goes under a unique slug for the `__skill_invoked` meta-check; sibling skills stage under their natural names so cross-references resolve. **If your `--skill-dir` contains only your one skill, the eval runs in isolation** — references like "REQUIRED SUB-SKILL: `slow-powers:test-driven-development`" won't resolve, and your assertions must not depend on a sibling skill firing. To include other skills as siblings, copy or symlink them into `--skill-dir` before running.
|
|
56
|
+
The runner stages every skill it finds under `--skill-dir`. The skill-under-test goes under a unique slug for the `__skill_invoked` meta-check — with its sibling asset files (any non-`SKILL.md`, non-`evals/` content) copied alongside, so a multi-file skill whose `SKILL.md` links a companion doc (e.g. `[code-review.md](code-review.md)`) still resolves once staged; sibling skills stage under their natural names so cross-references resolve. **If your `--skill-dir` contains only your one skill, the eval runs in isolation** — references like "REQUIRED SUB-SKILL: `slow-powers:test-driven-development`" won't resolve, and your assertions must not depend on a sibling skill firing. To include other skills as siblings, copy or symlink them into `--skill-dir` before running.
|
|
55
57
|
|
|
56
58
|
#### Bootstrap content
|
|
57
59
|
|
|
@@ -135,7 +137,7 @@ Do not dispatch until the user confirms *this summary*. An earlier "run the eval
|
|
|
135
137
|
A subagent under test runs the real skill, and some skills write to disk — the skill that triggered this gate, `working-in-isolation`, creates git worktrees in whatever repo it's pointed at. Without active enforcement those writes land in your working directory.
|
|
136
138
|
|
|
137
139
|
- **Guard available (Claude Code):** arming `--guard` is the default. If you are about to run without it, STOP. Proceed unguarded **only** when the user actively opts out — and warn them that stray writes will then only be **detected after the fact** by `detect-stray-writes`, never blocked or reverted, so anything a subagent writes outside its `outputs/` dir (worktrees, installed packages, edited repo files) persists and is theirs to clean up.
|
|
138
|
-
- **Guard unavailable (other harnesses):** there is no active write enforcement. Tell the user plainly: stray writes are detected and reported by `detect-stray-writes` but **not auto-cleaned** — they must review the report and remove anything that escaped. Harness-level write enforcement is tracked as a parity goal in `harness-parity-check.md`.
|
|
140
|
+
- **Guard unavailable (other harnesses):** there is no active write enforcement. Tell the user plainly: stray writes are detected and reported by `detect-stray-writes` but **not auto-cleaned** — they must review the report and remove anything that escaped. Harness-level write enforcement is tracked as a parity goal in `harness-parity.md`.
|
|
139
141
|
|
|
140
142
|
## Red Flags — STOP before dispatching
|
|
141
143
|
|
|
@@ -157,13 +159,13 @@ For each test case, dispatch fresh general-purpose subagents — one per conditi
|
|
|
157
159
|
|
|
158
160
|
Subagents MUST start with clean context. State leaking from previous runs invalidates the comparison.
|
|
159
161
|
|
|
160
|
-
|
|
162
|
+
Each run needs a portable **run record** (`run.json`, matching `schema/run-record.schema.json`) and a timing record (`timing.json`) holding:
|
|
161
163
|
|
|
162
|
-
- `total_tokens` and `duration_ms`
|
|
164
|
+
- `total_tokens` and `duration_ms`
|
|
163
165
|
- The final user-facing message
|
|
164
166
|
- The tool invocations (best effort — see "Transcript access" below)
|
|
165
167
|
|
|
166
|
-
|
|
168
|
+
On a harness with persisted transcripts (Claude Code), `record-runs` assembles both records from disk after the dispatches — nothing is captured by hand. On a transcript-less harness, capture them manually when each subagent completes: tokens/duration come from the harness's task completion event (**these may not be persisted anywhere else; save them immediately**), and the record is written via that harness's adapter or by hand.
|
|
167
169
|
|
|
168
170
|
### Driving the eval loop
|
|
169
171
|
|
|
@@ -171,12 +173,10 @@ The agent itself drives the entire loop from inside a normal agent session:
|
|
|
171
173
|
|
|
172
174
|
1. The agent invokes the runner via Bash to build the workspace (same command as above).
|
|
173
175
|
2. The agent reads the generated `dispatch.json` (machine-readable sibling of the manifest). Each task object points at a `dispatch_prompt_path` (a file holding the full prompt), an `agent_description` to pass through as the dispatch description, and exact `run_record_path` and `timing_path` to write to. The prompt lives in a file rather than inline in `dispatch.json` so the agent never has to reproduce kilobytes of prompt text per dispatch. The `agent_description` is namespaced with the iteration and a per-run nonce (`<eval_id>:<condition>:i<N>-<nonce>`) so transcripts from different iterations sharing one session's subagents dir can't collide — **pass it verbatim; do not reconstruct it from the eval id and condition.**
|
|
174
|
-
3. For each task, the agent dispatches a fresh subagent using its host's primitive, instructing it to read the file at `dispatch_prompt_path` and follow it exactly, and passing `agent_description` verbatim as the dispatch `description`. Passing the description through unchanged is what lets the transcript adapter correlate transcripts to runs in step
|
|
175
|
-
4.
|
|
176
|
-
5. (Claude Code)
|
|
177
|
-
6. (
|
|
178
|
-
7. The agent runs the grader (Bash) and then dispatches judge subagents for any `llm_judge` assertions — same pattern: read a tasks file, dispatch, write results back to a path.
|
|
179
|
-
8. The agent runs the aggregator.
|
|
176
|
+
3. For each task, the agent dispatches a fresh subagent using its host's primitive, instructing it to read the file at `dispatch_prompt_path` and follow it exactly, and passing `agent_description` verbatim as the dispatch `description`. Passing the description through unchanged is what lets the transcript adapter correlate transcripts to runs in step 4.
|
|
177
|
+
4. (Claude Code) After all dispatches return, the agent runs `bun run evals:ingest` once — a fixed-order chain of record-runs (assembles every task's `run.json` from `dispatch.json` + the subagent's own `outputs/final-message.md` + the persisted transcript, and backfills `timing.json` with transcript-derived tokens/duration, `"source": "transcript"`; never clobbers a record that already exists), fill-transcripts, detect-stray-writes (see *Sandboxing eval subagents* below), and the grader. It stops where only the agent can act: dispatching a judge subagent for each `llm_judge` assertion — same pattern as step 3: read a tasks file, dispatch, write results back to a path.
|
|
178
|
+
5. (Claude Code) After the judges return, the agent runs `bun run evals:finalize` — grade `--finalize` then the aggregator — and reads the benchmark.
|
|
179
|
+
6. (Other harnesses) The portable path is the same loop run by hand: when each subagent returns, the agent writes the run record to `run_record_path` and the timing record to `timing_path` itself (without a transcript adapter, `tool_invocations` stays `[]` and `transcript_check` assertions grade as unverifiable), then runs the grader, dispatches judges, finalizes, and aggregates as individual commands.
|
|
180
180
|
|
|
181
181
|
Agent-driven mode is the common case because the framework is most useful from inside the harness where the skill is being iterated. Use it when you want a single in-session "run the eval and report the delta" flow.
|
|
182
182
|
|
|
@@ -195,7 +195,9 @@ Design your assertions accordingly. For maximally portable evals, lean on `llm_j
|
|
|
195
195
|
The dispatch prompt tells each subagent to write only inside its `outputs/` dir, but nothing in the portable contract *enforces* that — a misbehaving subagent can edit the real repo or run `npm install` against the repo root, silently corrupting the very runner it's being measured by. Two layers guard against this:
|
|
196
196
|
|
|
197
197
|
- **Detection (all harnesses).** After `fill-transcripts` populates `tool_invocations`, run `bun run evals:detect-stray-writes --skill <name> --iteration <N>`. It reads each task's `outputs_dir` from `dispatch.json` and scans the invocations for **violations** (`Write`/`Edit`/`MultiEdit`/`NotebookEdit` whose path resolves outside the run's `outputs/`) and **warnings** (Bash commands matching install/`git`/`sed -i`/redirection patterns that don't reference `outputs/`). Findings land in `stray-writes.json`; the aggregator turns each run with violations into a `validity_warnings` entry, so a tainted data point is flagged the same way a missed skill invocation is. This is portable because it works off the same transcripts the adapters already parse — but it only *reports*, after the fact; it never reverts what a subagent wrote.
|
|
198
|
-
- **Hard guard (Claude Code, default posture).** `--guard` stages a `PreToolUse` hook that actively *blocks* out-of-bounds writes and installs while the subagents run
|
|
198
|
+
- **Hard guard (Claude Code, default posture).** `--guard` stages a `PreToolUse` hook that actively *blocks* out-of-bounds writes and installs while the subagents run — including `git worktree add` and Bash that creates files under `.claude` or a bare `skills/`. On Claude Code it is the default — the *Pre-flight gate* requires you to arm it unless the user explicitly opts out. It's Claude-Code-specific; see `harness-details/claude.md`. Harness-level write enforcement is tracked as a parity goal in `harness-parity.md`.
|
|
199
|
+
|
|
200
|
+
A run ends with teardown (`bun run evals:teardown --skill <name>`, or the `teardown` runner command): it disarms the guard, removes the staged skill set the runner created under `<cwd>/.claude/skills/`, **and** reclaims the skill's `skills-workspace/` artifacts, so a completed run leaves nothing behind that wasn't meant to be committed. Pre-existing project skills and `.claude/settings.json` are left intact. Teardown only deletes what's safe: iterations whose results are committed (it keys off the `.promoted.json` marker `promote-baseline` drops) and snapshots reproducible from a git ref. Iterations with results you haven't promoted, and working-tree snapshots, are **preserved** with a warning telling you to promote or discard them. Pass the same `--workspace-dir` you ran with if you used a custom one.
|
|
199
201
|
|
|
200
202
|
## Workspace layout
|
|
201
203
|
|
|
@@ -277,7 +279,7 @@ The check has two tiers, chosen automatically per run:
|
|
|
277
279
|
- **Code-based (Claude Code).** On harnesses that persist subagent transcripts with discrete `Skill` tool calls, the framework parses the transcript and checks for a `Skill` invocation whose `input.skill` matches the eval-staged slug. This is deterministic, free, and cannot be fooled by superficial vocabulary in the response.
|
|
278
280
|
- **LLM-judge fallback (other harnesses).** Where transcripts aren't available or the harness injects skills via system-prompt hooks rather than a tool call (Codex, OpenCode), a judge subagent compares the agent's `final_message` against the SKILL.md content embedded in the run record, looking for behavioral fingerprints — distinctive vocabulary, named sections, procedural steps that mirror the skill's phrasing. It does **not** require the agent to explicitly cite the skill (that would taint the eval).
|
|
279
281
|
|
|
280
|
-
To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The slug prevents an on-disk *collision*, not runtime *discovery*: if the same skill is also provided by an installed, **enabled** plugin, the subagent can still discover and invoke that copy — contaminating both arms (the control arm is no longer skill-absent). On Claude Code the runner flags this at build time (a "plugin-shadow" warning, also surfaced in `benchmark.json`'s `validity_warnings`), but cannot unload a live plugin; to remove the installed copy, run the eval from a plugin-isolated session — see `harness-details/claude.md` → *Isolating from installed plugins*. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used.
|
|
282
|
+
To enable the code-based check on Claude Code, the runner stages each condition's SKILL.md snapshot (plus the skill's sibling asset files) at `<repoRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md`. The unique slug prevents collisions with already-installed production skills (relevant when evaluating skills in a repo where the same skills are also installed) and is what the code-based check looks for in the transcript. The slug prevents an on-disk *collision*, not runtime *discovery*: if the same skill is also provided by an installed, **enabled** plugin, the subagent can still discover and invoke that copy — contaminating both arms (the control arm is no longer skill-absent). On Claude Code the runner flags this at build time (a "plugin-shadow" warning, also surfaced in `benchmark.json`'s `validity_warnings`), but cannot unload a live plugin; to remove the installed copy, run the eval from a plugin-isolated session — see `harness-details/claude.md` → *Isolating from installed plugins*. The dispatch prompt deliberately omits any inline `<skill>...</skill>` block so the subagent must discover and invoke the staged skill naturally — this measures whether the skill's `description:` actually triggers it. Stale staged skills are swept at the start of each fresh run. Pass `--no-stage` to opt out (e.g., when running the same eval against a harness that doesn't support project-local skill discovery); the runner will fall back to inlining the SKILL.md text in the dispatch prompt, and the LLM-judge meta-check will be used. The inline fallback carries only the SKILL.md text — sibling asset files aren't inlined — so a multi-file skill whose behavior depends on a linked companion doc needs the staged path, not `--no-stage`.
|
|
281
283
|
|
|
282
284
|
The aggregator emits a `validity_warnings` array when any with-skill condition has an invocation rate below 100%. Read those before interpreting the substantive delta. The rate is computed only over evals where the skill *should* fire; negative evals (`skill_should_trigger: false`) are excluded so a correct non-trigger never depresses the rate or raises a spurious warning.
|
|
283
285
|
|
|
package/skills/evaluating-skills/harness-details/claude.md
CHANGED
|
@@ -100,7 +100,9 @@ This is a required gate (see *Pre-flight gate* in `../SKILL.md`). Do not run the
|
|
|
100
100
|
|
|
101
101
|
Run from the skill folder (so `CWD` is the eval root and staging lands at `<CWD>/.claude/skills/`).
|
|
102
102
|
|
|
103
|
-
`--guard` is on in the commands below because it's the default posture (Step 7). It stages a `PreToolUse` hook into `.claude/settings.local.json` that *blocks* subagent writes/installs outside the eval sandbox (the workspace, the staged-skills dir, and `$TMPDIR`) while dispatches run. The hook is gated by a marker that auto-expires after 6h and is torn down at the start of the next run; to remove
|
|
103
|
+
`--guard` is on in the commands below because it's the default posture (Step 7). It stages a `PreToolUse` hook into `.claude/settings.local.json` that *blocks* subagent writes/installs outside the eval sandbox (the workspace, the staged-skills dir, and `$TMPDIR`) while dispatches run. It denies out-of-bounds Write/Edit tool calls, and Bash that installs packages, mutates git (including **`git worktree add`**), redirects to a file, or creates paths under `.claude` / a bare `skills/`. The hook is gated by a marker that auto-expires after 6h and is torn down at the start of the next run; to remove just the guard immediately (e.g. mid-run), run `bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" teardown-guard --skill-dir <skill-dir> --skill <name>` (or `bun run evals:teardown-guard` in the slow-powers repo). The full end-of-run teardown — guard **and** staged skill set — is Step 12.
|
|
104
|
+
|
|
105
|
+
While armed, the hook fires on **your** tool calls too, not just subagents' — so hand-authoring files under the skill's own folder (e.g. `skills/<name>/evals/NOTES.md`) with Write/Edit is denied until you disarm it. Run `teardown-guard` (or the full Step 12 teardown) before any post-run hand-authoring; Bash-driven runner commands like `promote-baseline` are unaffected.
|
|
104
106
|
|
|
105
107
|
New-skill mode (with vs without):
|
|
106
108
|
|
|
@@ -108,14 +110,22 @@ New-skill mode (with vs without):
|
|
|
108
110
|
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" --skill-dir <skill-dir> --skill <name> --mode new-skill --guard
|
|
109
111
|
```
|
|
110
112
|
|
|
111
|
-
Revision mode (test a change to an existing skill)
|
|
113
|
+
Revision mode (test a change to an existing skill). The usual order is edit-first — the
|
|
114
|
+
skill is already changed when the user asks to eval — so snapshot the *old* version
|
|
115
|
+
straight from git with `--ref`, which reads the object database without touching the
|
|
116
|
+
working tree:
|
|
112
117
|
|
|
113
118
|
```bash
|
|
114
|
-
|
|
115
|
-
|
|
119
|
+
# ...the edited SKILL.md is already in the working tree...
|
|
120
|
+
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" snapshot --skill-dir <skill-dir> --skill <name> --label baseline --ref HEAD
|
|
116
121
|
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" --skill-dir <skill-dir> --skill <name> --mode revision --baseline baseline --guard
|
|
117
122
|
```
|
|
118
123
|
|
|
124
|
+
`--ref` takes any commit/tag/branch. If instead you snapshot *before* editing, drop
|
|
125
|
+
`--ref HEAD` (the snapshot then reads the working tree) and run it ahead of the edit.
|
|
126
|
+
|
|
127
|
+
Add `--stage-name <name>` to stage the skill-under-test under a verbatim name instead of the conspicuous `slow-powers-eval-…` slug (built for the issue #144 name-confound experiments: A/B a natural name against the eval slug). It applies only when exactly one condition stages the skill (e.g. `--mode new-skill`) — the runner rejects it in revision mode, where both conditions stage — and refuses to clobber an existing dir of that name. The custom dir is registered for cleanup at the next run.
|
|
128
|
+
|
|
119
129
|
Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built available-skills block (rendered the way Claude Code surfaces discoverable skills, so the dispatch reads like a real session).
|
|
120
130
|
|
|
121
131
|
For a **plan-mode-relevant skill** (e.g. `hardening-plans`), add `--plan-mode` to inject Claude Code's verbatim plan-mode procedure as a `<system-reminder>` operating-context layer in every dispatch — the highest-fidelity in-runner approximation of a real plan mode (issue #142). Use it as the verbatim-procedure arm of an A/B against a plain paraphrase-seed run (no flag) to measure whether `with_skill` invocation de-saturates. It is still text the agent reads, not an injected mode, so treat any de-saturation as a stronger-than-cold signal, not ground truth (see *Seeding conversation context (and its ceiling)* in `../SKILL.md`).
|
|
@@ -129,25 +139,32 @@ Only when the user has opted out of the guard, drop `--guard` from the command a
|
|
|
129
139
|
Read `<CWD>/skills-workspace/<name>/iteration-<N>/dispatch.json`. For each task object:
|
|
130
140
|
|
|
131
141
|
1. Dispatch a fresh subagent via the **Task tool** with the prompt `Read the file at <dispatch_prompt_path> and follow its instructions exactly.` (substituting the task's `dispatch_prompt_path`), and pass `agent_description` verbatim as the description. The full prompt lives in that file rather than inline in `dispatch.json`, so you never reproduce ~KB of text per dispatch. The description is namespaced with the iteration and a per-run nonce (`<eval_id>:<condition>:i<N>-<nonce>`) — pass it through unchanged; do not reconstruct it. Passing it verbatim is what lets transcript correlation work in Step 10 without cross-matching an agent from another iteration.
|
|
132
|
-
2.
|
|
142
|
+
2. That's it — you do **not** write `run.json` or `timing.json` yourself. The subagent writes its own `outputs/final-message.md` (the dispatch prompt instructs it to), and `record-runs` in Step 10 assembles both records from disk. Optional, higher-fidelity timing: if you want billing-grade numbers, write `{ "total_tokens": <n>, "duration_ms": <n>, "source": "completion-event" }` from the Task tool's completion event to `timing_path` right after each dispatch — `record-runs` never overwrites an existing `timing.json`, so completion-event numbers always win over its transcript-derived backfill (which includes cache accounting — a different metric).
|
|
133
143
|
|
|
134
|
-
## Step 10 —
|
|
144
|
+
## Step 10 — Ingest, judge, finalize
|
|
135
145
|
|
|
136
|
-
Claude Code persists subagent transcripts under `~/.claude/projects/<project-slug>/<parent-session-id>/subagents/`. Find that directory for the current session, then:
|
|
146
|
+
Claude Code persists subagent transcripts under `~/.claude/projects/<project-slug>/<parent-session-id>/subagents/`. Find that directory for the current session, then run the post-dispatch chain as one command:
|
|
137
147
|
|
|
138
148
|
```bash
|
|
139
|
-
|
|
149
|
+
# record-runs → fill-transcripts → detect-stray-writes → grade, in fixed order.
|
|
150
|
+
# Assembles run.json + timing.json for every task from dispatch.json,
|
|
151
|
+
# outputs/final-message.md, and the persisted transcripts; existing records are
|
|
152
|
+
# never clobbered. Stops on the first failure (re-running after a fix is safe —
|
|
153
|
+
# every sub-step skips work that's already done).
|
|
154
|
+
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" ingest --skill-dir <skill-dir> --skill <name> --iteration <N> \
|
|
140
155
|
--subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/
|
|
141
156
|
|
|
142
|
-
#
|
|
143
|
-
|
|
157
|
+
# Dispatch a fresh judge subagent for each judge task that `ingest` listed — prompt it
|
|
158
|
+
# with `Read the file at <dispatch_prompt_path> and follow its instructions
|
|
159
|
+
# exactly.` (the prompt tells the judge where to write its response). Then:
|
|
160
|
+
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" finalize --skill-dir <skill-dir> --skill <name> --iteration <N>
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
`finalize` runs `grade --finalize` then `aggregate` and prints the benchmark. With Step 9's dispatches, the whole loop is three runner calls around the two dispatch batches: build (Step 8) → dispatch agents → `ingest` → dispatch judges → `finalize`.
|
|
144
164
|
|
|
145
|
-
|
|
146
|
-
# Dispatch a fresh judge subagent for each emitted judge task — prompt it with `Read the file at <dispatch_prompt_path> and follow its instructions exactly.` (the prompt tells the judge where to write its response). Then:
|
|
147
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/grade.ts" --skill-dir <skill-dir> --skill <name> --iteration <N> --finalize
|
|
165
|
+
Besides out-of-bounds writes, `detect-stray-writes` also flags **live-source reads**: any arm whose subagent read the live `skills/<name>/` source instead of its staged copy. That usually means the Skill tool couldn't resolve the staged slug yet (skills staged mid-session race against the registry, which is built at session start) and the agent improvised — fatal in revision mode, where the old_skill arm then sees new-skill content. The findings land in `stray-writes.json` and surface as `validity_warnings` in `benchmark.json`; treat a flagged cell's arm as contaminated.
|
|
148
166
|
|
|
149
|
-
|
|
150
|
-
```
|
|
167
|
+
The chained steps remain independently callable for inspection or recovery — `record-runs.ts`, `fill-transcripts.ts`, `detect-stray-writes.ts`, `grade.ts` (`--finalize`), `aggregate.ts`, each taking the same `--skill-dir`/`--skill`/`--iteration` flags (plus `--subagents-dir` for the two transcript readers). `record-runs` subsumes `fill-transcripts` for runner-built iterations — it writes `tool_invocations` as part of assembling each record; `fill-transcripts` remains the tool for a pre-existing `run.json` that `record-runs` won't touch (hand-authored, or written by the agent at dispatch time) whose `tool_invocations` you want populated after the fact.
|
|
151
168
|
|
|
152
169
|
## Step 11 — Present results
|
|
153
170
|
|
|
@@ -156,3 +173,22 @@ Read `<CWD>/skills-workspace/<name>/iteration-<N>/benchmark.json`. Surface to th
|
|
|
156
173
|
- `run_summary` per condition (pass rate, tokens, duration)
|
|
157
174
|
- `delta` (what the skill/change costs and what it buys — for a token-reduction eval, focus on `delta.total_tokens` alongside `delta.pass_rate`)
|
|
158
175
|
- `validity_warnings` (read these before trusting the delta — a low skill-invocation rate means the result may not reflect the skill at all)
|
|
176
|
+
|
|
177
|
+
## Step 12 — Tear down
|
|
178
|
+
|
|
179
|
+
A run stages the full skill set into `<CWD>/.claude/skills/` (project-scope, required for discovery) and — under `--guard` — installs a `PreToolUse` hook in `.claude/settings.local.json`. Both persist after dispatch, so the run isn't complete until you remove them. This is the normal end of every run, not an optional cleanup:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" teardown --skill-dir <skill-dir> --skill <name>
|
|
183
|
+
# or, in the slow-powers repo:
|
|
184
|
+
bun run evals:teardown --skill <name>
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
`teardown` disarms the guard, removes the staged skill set, **and** reclaims the skill's `skills-workspace/` artifacts. When the runner created `<CWD>/.claude/skills/` for this run, it removes the whole tree (and prunes a `.claude` directory it emptied); a `.claude/skills` that pre-existed (your own project skills) keeps its contents, and `.claude/settings.json` is never touched.
|
|
188
|
+
|
|
189
|
+
Workspace reclamation is conservative — after a completed run, teardown leaves behind nothing that wasn't meant to be committed, but it never destroys results you haven't moved into version control:
|
|
190
|
+
|
|
191
|
+
- **Iterations** whose results are committed are removed. Teardown keys off the `.promoted.json` marker `promote-baseline` writes into the iteration. An iteration that still holds uncommitted results (a `benchmark.json`, run record, or grading with no marker — e.g. a graded run you never promoted) is **kept**, and teardown warns you, naming it and the `evals:promote-baseline` command to commit it (or delete `skills-workspace/<name>/` manually to discard). Iterations holding only reproducible scaffolding (a `--dry-run`, or a run staged but never dispatched) are removed.
|
|
192
|
+
- **Snapshots** materialized from a git ref (`snapshot --ref`) are removed — they regenerate on demand. Working-tree snapshots (no `--ref`), which can't be regenerated, are kept.
|
|
193
|
+
|
|
194
|
+
If you ran with a custom `--workspace-dir`, pass the same value to `teardown` so it reclaims the right tree.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Eval-Runner Harness Parity Check
|
|
2
|
+
|
|
3
|
+
You are an agent running inside one of Slow-powers's supported harnesses. This file walks you through auditing **which eval-runner features are wired up for your harness** and prepping to close one gap. Claude Code is the reference implementation; other harnesses adapt its patterns using their own native conventions.
|
|
4
|
+
|
|
5
|
+
This file covers the **skill-eval runner only** — the infrastructure under `skills/evaluating-skills/` that dispatches, records, and grades skill evals. Plugin-distribution parity (manifests, hooks, bootstrap injection, skill discovery) is audited separately by the root-level `harness-parity-check.md`. The eval runner is slated to move into its own project; this doc lives alongside it so it travels with the extraction.
|
|
6
|
+
|
|
7
|
+
Read the file end-to-end before acting. The categories in Step 4 are the source of truth for what "eval-runner parity" means today — when a new feature is added to the runner, that table is updated and this file stays evergreen.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Step 1 — Identify your harness
|
|
12
|
+
|
|
13
|
+
Name the harness you are running in. You almost certainly already know — confirm by checking:
|
|
14
|
+
|
|
15
|
+
- Your invocation context and working directory
|
|
16
|
+
- The tool names available to you in this session
|
|
17
|
+
- Any session-start context block injected at the top of the conversation
|
|
18
|
+
- Top-level files or directories matching your harness (e.g. `.<harness>-plugin/`, `<harness>-instructions.md`)
|
|
19
|
+
|
|
20
|
+
The intended supported harnesses are: **Claude Code, Codex CLI, OpenCode**.
|
|
21
|
+
|
|
22
|
+
If the harness you are running in is not in that list, stop and ask the user before continuing.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Step 2 — Read the reference materials
|
|
27
|
+
|
|
28
|
+
Read these files in order. Each one teaches you something specific you will need in Steps 3 and 4. Paths are relative to the repository root.
|
|
29
|
+
|
|
30
|
+
| File | What to look for |
|
|
31
|
+
|------|------------------|
|
|
32
|
+
| `AGENTS.md` (or `CLAUDE.md`, which symlinks to it) | The Cross-Harness Compatibility rule, the canonical list of supported harnesses, the PR-scoping rule |
|
|
33
|
+
| `skills/evaluating-skills/runner/README.md` | Contains explicit **Cross-harness breadcrumbs** — sketches of how Codex and OpenCode would implement environment parity. Treat these as starting points, not specifications |
|
|
34
|
+
| `skills/evaluating-skills/runner/adapters/claude-code-transcript.ts` | The reference transcript adapter. A second harness would add its own adapter alongside this, translating that harness's transcript shape into the same `ToolInvocation[]` format |
|
|
35
|
+
| `skills/evaluating-skills/harness-details/claude.md` | The reference per-harness operator walkthrough. Other harnesses would each get their own file alongside this |
|
|
36
|
+
|
|
37
|
+
Do not skim. The parity report you produce in Step 4 is only as good as the reference you internalized here.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Step 3 — Discover your harness's existing surface area
|
|
42
|
+
|
|
43
|
+
Enumerate, using ordinary file search, what already exists in the eval runner for your harness. Do not rely on memory or assumptions — search the working tree. Useful heuristics:
|
|
44
|
+
|
|
45
|
+
- The harness name anywhere inside `skills/evaluating-skills/runner/` (especially `context.ts`, `adapters/`, `profiles/`)
|
|
46
|
+
- A per-harness operator guide in `skills/evaluating-skills/harness-details/`
|
|
47
|
+
- Tests under `tests/` exercising the runner for the harness
|
|
48
|
+
|
|
49
|
+
Record every path you find. You will reference them in Step 4.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Step 4 — Produce a parity report
|
|
54
|
+
|
|
55
|
+
For each category below, compare what Claude Code has against what your harness has. Categories are described as "What Claude Code does (reference)" so they survive renames — when something changes, the affected row of the table is updated and the rest of the file still applies.
|
|
56
|
+
|
|
57
|
+
| Category | What Claude Code does (reference) |
|
|
58
|
+
|----------|-----------------------------------|
|
|
59
|
+
| Skill-eval transcript adapter | `skills/evaluating-skills/runner/adapters/claude-code-transcript.ts` |
|
|
60
|
+
| Skill-eval auto-record (run/timing assembly) | `runner/record-runs.ts` assembles each task's `run.json` + `timing.json` from disk after dispatches: carry-over fields from `dispatch.json`, `final_message` from `outputs/final-message.md`, `tool_invocations`/tokens/duration from the persisted transcript (`parseTranscriptFull` — usage deduped by message id). Leans on transcript access, so it's a Claude-Code-tier acceleration like `fill-transcripts`; the portable contract (hand-authored records, `run-record.schema.json`) is unchanged. A harness closes this gap by extending its transcript adapter to supply the same three sources (final message, tool invocations, usage/timing) the recorder consumes |
|
|
61
|
+
| Realistic eval environment (skill staging) | `runner/run.ts` stages skills under `<stageRoot>/.claude/skills/`, wraps any `--bootstrap` content in a `<session-start-context>` block, and emits a separate available-skills block. That block is rendered in the harness's **native** skill-list presentation — Claude Code's lives in `runner/adapters/claude-code-session.ts` (`The following skills are available for use with the Skill tool:` / `- name: description`). Another harness adds its own renderer there so its dispatches read like a real session in that harness, not an eval |
|
|
62
|
+
| Eval subagent write enforcement | Opt-in `--guard` stages a `PreToolUse` hook (`runner/guard/`) that *denies* subagent writes/installs outside the eval sandbox while dispatches run. Portable fallback for every harness: the `evals:detect-stray-writes` post-pass (`runner/detect-stray-writes.ts`) flags out-of-bounds writes from the parsed transcript after the fact |
|
|
63
|
+
| Eval plan-mode operating context | Opt-in `--plan-mode` injects a harness-specific plan-mode procedure profile (`runner/profiles/<harness>/plan-mode.md`) as a `<system-reminder>` operating-context layer in every dispatch, rendered by `renderPlanModeContext` in the harness session adapter (`runner/adapters/claude-code-session.ts`). Only `profiles/claude-code/plan-mode.md` exists today; a harness adds its own profile (its native plan/research-mode procedure) + renderer alongside the Claude ones. A harness with no profile has no `--plan-mode` and an unchanged dispatch contract |
|
|
64
|
+
| Harness-details operator guide | `skills/evaluating-skills/harness-details/claude.md` |
|
|
65
|
+
|
|
66
|
+
**Note on the transcript adapter (raised bar).** Slow-powers's baseline eval suite
|
|
67
|
+
now uses `transcript_check` assertions — deterministic regex checks against a
|
|
68
|
+
run's tool invocations (e.g. "a test command ran", "the sibling skill was
|
|
69
|
+
loaded"). These only grade when a transcript adapter exists for your harness.
|
|
70
|
+
A harness without one still functions: those assertions grade as *unverifiable*
|
|
71
|
+
and the `llm_judge` assertions carry the substantive measurement, the same way
|
|
72
|
+
Codex/OpenCode work today. But adapter richness is now an explicit parity
|
|
73
|
+
target, not optional polish — a harness that adds or extends an adapter under
|
|
74
|
+
`skills/evaluating-skills/runner/adapters/` lets more of the baseline suite grade
|
|
75
|
+
mechanically. Treat the transcript-adapter row above as a goal to aim at, not a
|
|
76
|
+
box already checked.
|
|
77
|
+
|
|
78
|
+
**Note on write enforcement (parity goal).** Eval subagents are instructed to
|
|
79
|
+
write only inside their `outputs/` dir, but nothing in the portable contract
|
|
80
|
+
*enforces* it — a misbehaving subagent can edit the real repo or install
|
|
81
|
+
packages, silently tainting the run. Two layers address this: the portable
|
|
82
|
+
`detect-stray-writes` post-pass (available to every harness, since it works off
|
|
83
|
+
the same parsed transcript the adapters already produce) and, on Claude Code, an
|
|
84
|
+
opt-in `--guard` that stages a native `PreToolUse` hook to *block* the write
|
|
85
|
+
before it happens. **Harness-level tool enforcement — denying out-of-bounds
|
|
86
|
+
subagent writes using the harness's own permission/hook primitive — is an
|
|
87
|
+
explicit parity goal, not optional polish.** A harness that can express a
|
|
88
|
+
pre-tool guard (a hook, a permission rule, a sandboxed cwd) should wire one up so
|
|
89
|
+
its eval runs are as self-contained as Claude Code's; until then, the
|
|
90
|
+
`detect-stray-writes` report is the honest fallback. Treat the write-enforcement
|
|
91
|
+
row above as a goal to aim at, with detection as the baseline every harness meets.
|
|
92
|
+
|
|
93
|
+
**Note on plan-mode fidelity (residual parity goal).** `--plan-mode` injects a
|
|
94
|
+
harness's *verbatim* plan-mode procedure as operating context, which is the
|
|
95
|
+
closest a harness's eval runner can get to reproducing the wild failure where a
|
|
96
|
+
real plan mode makes loading a skill feel redundant. It is **not** the real mode:
|
|
97
|
+
it is still text the dispatched subagent reads, not a state the harness places it
|
|
98
|
+
under, so a pass remains necessary-not-sufficient (see *Seeding conversation
|
|
99
|
+
context (and its ceiling)* in `skills/evaluating-skills/SKILL.md`). A harness that
|
|
100
|
+
can actually dispatch an eval subagent *into* its own plan/research mode — not
|
|
101
|
+
merely describe it — would close this gap; that real-mode injection is the
|
|
102
|
+
residual parity goal, with `--plan-mode` (a profile + renderer) as the approximation
|
|
103
|
+
every harness can reach in the meantime.
|
|
104
|
+
|
|
105
|
+
Surface your findings inline using this template:
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
## Eval-Runner Parity Report: <harness>
|
|
109
|
+
Reference: Claude Code
|
|
110
|
+
|
|
111
|
+
- **Skill-eval transcript adapter** — ✅ Implemented / ⚠️ Partial / ❌ Missing / N/A
|
|
112
|
+
- Where: <path or "would live at <path>">
|
|
113
|
+
- Gap: <one sentence, only if Partial/Missing>
|
|
114
|
+
|
|
115
|
+
(... one block per category ...)
|
|
116
|
+
|
|
117
|
+
## Summary
|
|
118
|
+
- Strongest area: <category>
|
|
119
|
+
- Highest-leverage gap: <category> — <why>
|
|
120
|
+
- Suggested next gap to close this session: <category>
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Status meanings:
|
|
124
|
+
|
|
125
|
+
- **✅ Implemented** — fully wired up; feature works the same way Claude's does (using whatever native primitive the harness provides)
|
|
126
|
+
- **⚠️ Partial** — some scaffolding exists but the feature isn't end-to-end functional
|
|
127
|
+
- **❌ Missing** — no implementation; users of this harness do not get this feature
|
|
128
|
+
- **N/A** — the category doesn't translate. State why
|
|
129
|
+
|
|
130
|
+
Report inline by default. If the user asks for a persistent artifact, write the report to `docs/parity-reports/<harness>-evals.md` (create the directory if missing).
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Step 5 — Pick a gap and prep to close it
|
|
135
|
+
|
|
136
|
+
Surface the report to the user and propose **one or two** gaps worth closing this session. Bias toward the smallest gap with the highest user impact — typically a transcript adapter or a harness-details guide, not a wholesale runner rework.
|
|
137
|
+
|
|
138
|
+
Once the user picks a gap:
|
|
139
|
+
|
|
140
|
+
1. Re-read Claude's reference implementation for that specific feature in detail. Note the *shape* of what it does — inputs, outputs, side effects — separately from the *Claude-specific mechanism* it uses.
|
|
141
|
+
2. **Consult your harness's own documentation, MCP servers, or built-in references** before proposing harness-specific changes. Do not guess at hook schemas, transcript formats, or native tool names. If a `context7` or equivalent docs-fetch server is available, prefer it over your training data — assume your knowledge of the harness may be stale.
|
|
142
|
+
3. Propose an adaptation that copies Claude's shape while using your harness's native conventions. State explicitly what you are copying and what you are adapting.
|
|
143
|
+
4. Confirm with the user before writing code.
|
|
144
|
+
5. If your gap involves creating or modifying a skill, load `slow-powers:writing-skills` first.
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Guardrails
|
|
149
|
+
|
|
150
|
+
- **Cross-Harness Compatibility is enforced.** A change for your harness MUST NOT break or degrade any other harness. Re-read the Cross-Harness Compatibility section of `AGENTS.md`.
|
|
151
|
+
- **One problem per PR.** Per `AGENTS.md`, do not bundle unrelated changes. A parity-closing PR should add one feature for one harness.
|
|
152
|
+
- **Do not edit `bootstrap.md` or shared skills as part of parity work.** Those are cross-cutting; changes need their own PRs with their own evidence.
|
|
153
|
+
- **Do not fabricate features that don't exist in any harness yet.** Parity means "catch up to Claude," not "invent something new."
|
|
154
|
+
- **Do not guess at harness-specific details.** If your harness's docs don't confirm something, ask the user before proceeding.
|
|
155
|
+
- **Keep this file evergreen.** If you add a new feature category to the eval runner, add a row to the Step 4 table here and to the eval-runner tier table in `README.md` in the same PR. Distribution-side categories (manifests, hooks, bootstrap, docs) belong in the root `harness-parity-check.md` instead.
|
|
@@ -22,6 +22,7 @@ Other flags:
|
|
|
22
22
|
- `--workspace-dir <path>` (optional) — where iteration artifacts are written. Defaults to `<CWD>/skills-workspace`.
|
|
23
23
|
- `--harness claude-code` (optional, default `claude-code`; the only supported harness).
|
|
24
24
|
- `--no-stage`, `--dry-run`, `--iteration <N>`, `--mode <new-skill|revision>`, `--baseline <label>`, `--label <label>` — as before.
|
|
25
|
+
- `--ref <git-ref>` (optional, `snapshot` only) — snapshot the skill (SKILL.md + sibling assets, excluding `evals/`) as it existed at a git ref, read straight from git without touching the working tree. Use it for the common edit-first Mode B order: edit the skill, *then* snapshot the old version with `--ref HEAD` (or any commit/tag/branch) as the baseline. Without `--ref`, `snapshot` reads the working tree as before.
|
|
25
26
|
- `--only <id,id,...>` / `--skip <id,id,...>` (optional) — run only, or all-but, the named eval ids from `evals.json`. The two are mutually exclusive, and every named id must exist (the run aborts with the available ids listed otherwise). Use this for a cost-conscious reduced-set run instead of temporarily editing `evals.json` down. The pre-flight summary and the `N evals × 2 conditions` count reflect the filtered set.
|
|
26
27
|
- `--plan-mode` (optional, Claude Code) — inject the harness's verbatim plan-mode procedure as an operating-context layer. When set, the runner reads `profiles/<harness>/plan-mode.md` and emits it (via the session adapter's `renderPlanModeContext`) as a `<system-reminder>` block in every dispatch, after the available-skills block and before the user request. It is identical across the with/without-skill arms and recorded as `plan_mode` in `dispatch.json`. This is issue #142's highest-fidelity in-runner approximation of a real plan mode — still text the agent reads, so a pass is necessary-not-sufficient; see *Seeding conversation context (and its ceiling)* in `../SKILL.md`. Opt-in, and meant only for plan-mode-relevant skills; a harness with no profile aborts the run, leaving the portable dispatch contract unchanged.
|
|
27
28
|
|
|
@@ -29,7 +30,7 @@ Staging is written under the current working directory: `<CWD>/.claude/skills/`.
|
|
|
29
30
|
|
|
30
31
|
## Driving the loop
|
|
31
32
|
|
|
32
|
-
Every run produces both a `dispatch-manifest.md` (human-readable) and a `dispatch.json` (machine-readable). An agent in a session reads `dispatch.json
|
|
33
|
+
Every run produces both a `dispatch-manifest.md` (human-readable) and a `dispatch.json` (machine-readable). An agent in a session reads `dispatch.json` and dispatches each task itself. On Claude Code the rest is two fixed-order commands around the judge dispatches — `ingest` (record-runs → fill-transcripts → detect-stray-writes → grade) and `finalize` (grade --finalize → aggregate) — so the whole loop is three runner calls and two dispatch batches. On harnesses without persisted transcripts, the agent writes the records to the paths in each task by hand and runs the chained steps individually (the portable path).
|
|
33
34
|
|
|
34
35
|
## Quickstart (internal / repo use)
|
|
35
36
|
|
|
@@ -44,20 +45,20 @@ Maintainers run from the repo root; the npm scripts supply `--skill-dir ./skills
|
|
|
44
45
|
bun run evals -- --skill <name> --mode new-skill
|
|
45
46
|
|
|
46
47
|
# 3. Read skills-workspace/<name>/iteration-1/dispatch.json and dispatch each
|
|
47
|
-
# task as a fresh general-purpose subagent
|
|
48
|
-
#
|
|
48
|
+
# task as a fresh general-purpose subagent (each writes its own
|
|
49
|
+
# outputs/final-message.md).
|
|
49
50
|
|
|
50
|
-
# 4.
|
|
51
|
-
|
|
51
|
+
# 4. Ingest — record-runs → fill-transcripts → detect-stray-writes → grade,
|
|
52
|
+
# in fixed order (assembles run.json + timing.json from dispatch.json,
|
|
53
|
+
# final-message.md, and the persisted transcripts, then emits judge tasks):
|
|
54
|
+
bun run evals:ingest -- --skill <name> --iteration 1 \
|
|
52
55
|
--subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/
|
|
53
56
|
|
|
54
|
-
# 5.
|
|
55
|
-
|
|
56
|
-
# (After judge subagents complete and their responses are written, finalize:)
|
|
57
|
-
bun run evals:grade -- --skill <name> --iteration 1 --finalize
|
|
57
|
+
# 5. Dispatch each judge task that ingest listed, writing responses to their
|
|
58
|
+
# response_path.
|
|
58
59
|
|
|
59
|
-
# 6.
|
|
60
|
-
bun run evals:
|
|
60
|
+
# 6. Finalize — grade --finalize → aggregate:
|
|
61
|
+
bun run evals:finalize -- --skill <name> --iteration 1
|
|
61
62
|
|
|
62
63
|
# 7. Read skills-workspace/<name>/iteration-1/benchmark.json.
|
|
63
64
|
|
|
@@ -68,18 +69,24 @@ bun run evals:promote-baseline -- --skill <name> --iteration 1
|
|
|
68
69
|
|
|
69
70
|
### Mode B — Evaluate a language change to an existing skill
|
|
70
71
|
|
|
72
|
+
The common case is edit-first: you've already changed the skill, then decide to eval.
|
|
73
|
+
Snapshot the *old* version from git — no working-tree dance:
|
|
74
|
+
|
|
71
75
|
```bash
|
|
72
|
-
# 1.
|
|
73
|
-
bun run evals:snapshot -- --skill <name> --label baseline-2026-05-24
|
|
76
|
+
# 1. Edit skills/<name>/SKILL.md (the "new" version is now in the working tree).
|
|
74
77
|
|
|
75
|
-
# 2.
|
|
78
|
+
# 2. Snapshot the old version straight from git as the baseline.
|
|
79
|
+
bun run evals:snapshot -- --skill <name> --label baseline-2026-05-24 --ref HEAD
|
|
76
80
|
|
|
77
|
-
# 3. Build the iteration-N workspace, comparing
|
|
81
|
+
# 3. Build the iteration-N workspace, comparing baseline (old) vs current (new).
|
|
78
82
|
bun run evals -- --skill <name> --mode revision --baseline baseline-2026-05-24
|
|
79
83
|
|
|
80
84
|
# 4-7. Same as Mode A.
|
|
81
85
|
```
|
|
82
86
|
|
|
87
|
+
If you snapshot *before* editing instead, drop `--ref HEAD` from step 2 (the snapshot then reads the
|
|
88
|
+
working tree) and run it before step 1.
|
|
89
|
+
|
|
83
90
|
### Dry run (workspace prep only)
|
|
84
91
|
|
|
85
92
|
```bash
|
|
@@ -102,12 +109,14 @@ If you have the slow-powers plugin installed and a personal skill, you do **not*
|
|
|
102
109
|
## Layout
|
|
103
110
|
|
|
104
111
|
- `context.ts` — `detectRunContext(argv)` builds the `RunContext` every command shares: resolves `--skill-dir`/`--skill`, enumerates sibling skills, resolves `--bootstrap`/`--workspace-dir`, and derives `stageRoot` (CWD) and `workspaceRoot`.
|
|
105
|
-
- `run.ts` — orchestrator; builds workspace tree, snapshots SKILL.md, emits dispatch manifest. On Claude Code (default), also stages each condition's snapshot at `<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md` so the subagent can discover and invoke it via the Skill tool, stages every *other* skill found in `--skill-dir` at its natural name so cross-references resolve, and builds the `<session-start-context>` block (see *Environment parity* below). Pass `--no-stage` to opt out and fall back to inlining the SKILL.md into the dispatch prompt. Also handles the `snapshot`
|
|
112
|
+
- `run.ts` — orchestrator; builds workspace tree, snapshots SKILL.md, emits dispatch manifest. On Claude Code (default), also stages each condition's snapshot at `<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md` so the subagent can discover and invoke it via the Skill tool, stages every *other* skill found in `--skill-dir` at its natural name so cross-references resolve, and builds the `<session-start-context>` block (see *Environment parity* below). Pass `--no-stage` to opt out and fall back to inlining the SKILL.md into the dispatch prompt. Pass `--stage-name <name>` to stage under a verbatim name instead of the eval slug (issue #144 name-confound experiments; single-staging-condition modes only, refuses to clobber an existing dir, registered for next-run cleanup). Also handles the `snapshot`, `ingest`/`finalize` (fixed-order post-dispatch chains over the sibling commands), and `teardown` subcommands.
|
|
106
113
|
- `grade.ts` — evaluates `transcript_check` assertions directly (regex against `tool_invocations`), emits judge-task files for `llm_judge` assertions, then finalizes by merging judge responses into per-run `grading.json`. The `__skill_invoked` meta-check is code-based on Claude Code when the staged-skill slug is known and `tool_invocations` is populated (deterministic scan for a `Skill` tool call with matching slug); it falls back to an LLM judge looking for behavioral fingerprints when either signal is missing.
|
|
107
114
|
- `aggregate.ts` — reads grading.json + timing.json from an iteration, writes `benchmark.json` with pass-rate / duration / token stats keyed by condition name.
|
|
108
115
|
- `promote-baseline.ts` — copies the durable subset of an iteration (`benchmark.json` + each run's `grading.json` + a `BASELINE.md` provenance file) into the skill's version-controlled `evals/baseline/`. Flags: `--skill-dir`/`--skill` (as everywhere), `--iteration <N>` (required), `--label <tag>` (optional, recorded in provenance). Everything else in the workspace stays gitignored.
|
|
109
|
-
- `
|
|
110
|
-
- `
|
|
116
|
+
- `record-runs.ts` — assembles a schema-valid `run.json` and backfills `timing.json` for every task in a runner-built iteration, from `dispatch.json` (carry-over fields) + `outputs/final-message.md` (`final_message`, transcript fallback) + the persisted transcript (`tool_invocations`, tokens, duration). Never clobbers existing records without `--overwrite`; transcript-derived timing carries `"source": "transcript"`. Claude-Code-tier, like `fill-transcripts` — transcript-less harnesses keep authoring records manually (the portable path).
|
|
117
|
+
- `fill-transcripts.ts` — walks the iteration tree, matches each `(eval, condition)` to a subagent transcript by description, parses the transcript with the appropriate adapter, populates `tool_invocations` in `run.json`. Subsumed by `record-runs` for runner-built iterations; still the tool for filling a pre-existing (hand- or agent-written) `run.json`.
|
|
118
|
+
- `detect-stray-writes.ts` — scans each run's `tool_invocations` for sandbox breaches and writes `stray-writes.json`: write tools targeting paths outside the run's outputs dir (violations), mutating Bash heuristics (warnings), and **live-source reads** — a read tool or Bash command accessing the live skill-under-test directory instead of its staged copy, the signature of the staged-slug resolution race (skills staged mid-session aren't guaranteed resolvable by the Skill tool, whose registry is built at session start; an agent that hits "Unknown skill" improvises and reads the live source, contaminating its arm). `aggregate` lifts all three into `benchmark.json`'s `validity_warnings`.
|
|
119
|
+
- `adapters/claude-code-transcript.ts` — reads a Claude Code subagent JSONL and returns `ToolInvocation[]` (`parseTranscript`), or the full summary with usage tokens deduped by message id, wall-clock duration, and the last assistant text (`parseTranscriptFull`). Also exposes `listSubagents` / `findByDescription` for transcript correlation.
|
|
111
120
|
- `types.ts` — shared TypeScript types matching `../schema/*.json`.
|
|
112
121
|
- `validate.ts` / `validate-all.ts` — validator for `evals.json` against the JSON Schema rules. `validate-all.ts` takes `--skill-dir` and validates every skill's `evals.json` in it.
|
|
113
122
|
|
|
@@ -137,7 +146,7 @@ For the **`without_skill` / baseline condition** in this realistic environment,
|
|
|
137
146
|
- **General fallback.** Harnesses without project-local discovery should keep using `--no-stage`; the inline `<skill>` block in the dispatch prompt is the only skill the subagent sees. Bootstrap is omitted in this mode because its references to other skills would mislead the agent.
|
|
138
147
|
- **Plan-mode profiles (`--plan-mode`).** The plan-mode operating-context layer is also a harness-specific surface. The procedure text lives in `profiles/<harness>/plan-mode.md` and is wrapped by a `renderPlanModeContext` function in that harness's session adapter (`adapters/<harness>-session.ts`), exactly mirroring how `renderAvailableSkillsBlock` is harness-specific. Only `profiles/claude-code/plan-mode.md` exists today; a harness that wants this fidelity layer adds its own profile file (its native plan/research mode procedure) plus a renderer alongside the Claude ones. A harness with no profile simply has no `--plan-mode`, and the portable dispatch contract is unchanged.
|
|
139
148
|
|
|
140
|
-
The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See
|
|
149
|
+
The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `../harness-parity.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.
|
|
141
150
|
|
|
142
151
|
**Operational notes.** Do not run two `run.ts` invocations concurrently against the same CWD — they race on `<stageRoot>/.claude/skills/` and the manifest.
|
|
143
152
|
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
// bullets. Other harnesses (Codex, OpenCode) surface their skills differently,
|
|
7
7
|
// so this rendering lives in an adapter rather than inline in the harness-
|
|
8
8
|
// agnostic orchestrator. A new harness adds its own renderer alongside this one
|
|
9
|
-
// (see harness-parity
|
|
9
|
+
// (see ../../harness-parity.md).
|
|
10
10
|
|
|
11
11
|
import type { AvailableSkill } from "../types";
|
|
12
12
|
|
|
@@ -33,7 +33,7 @@ export function renderAvailableSkillsBlock(skills: AvailableSkill[]): string {
|
|
|
33
33
|
* operating under, not prose it merely reads. The profile text (the verbatim
|
|
34
34
|
* plan-mode procedure) lives in `../profiles/claude-code/plan-mode.md`; this
|
|
35
35
|
* adapter owns only the harness-native framing, so a new harness adds its own
|
|
36
|
-
* renderer + profile alongside this one (see harness-parity
|
|
36
|
+
* renderer + profile alongside this one (see ../../harness-parity.md). Returns
|
|
37
37
|
* an empty string for empty input so the caller can omit the section entirely.
|
|
38
38
|
*/
|
|
39
39
|
export function renderPlanModeContext(profileText: string): string {
|