@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,155 +0,0 @@
1
- # Eval-Runner Harness Parity Check
2
-
3
- You are an agent running inside one of Slow-powers's supported harnesses. This file walks you through auditing **which eval-runner features are wired up for your harness** and prepping to close one gap. Claude Code is the reference implementation; other harnesses adapt its patterns using their own native conventions.
4
-
5
- This file covers the **skill-eval runner only** — the infrastructure under `skills/evaluating-skills/` that dispatches, records, and grades skill evals. Plugin-distribution parity (manifests, hooks, bootstrap injection, skill discovery) is audited separately by the root-level `harness-parity-check.md`. The eval runner is slated to move into its own project; this doc lives alongside it so it travels with the extraction.
6
-
7
- Read the file end-to-end before acting. The categories in Step 4 are the source of truth for what "eval-runner parity" means today — when a new feature is added to the runner, that table is updated and this file stays evergreen.
8
-
9
- ---
10
-
11
- ## Step 1 — Identify your harness
12
-
13
- Name the harness you are running in. You almost certainly already know — confirm by checking:
14
-
15
- - Your invocation context and working directory
16
- - The tool names available to you in this session
17
- - Any session-start context block injected at the top of the conversation
18
- - Top-level files or directories matching your harness (e.g. `.<harness>-plugin/`, `<harness>-instructions.md`)
19
-
20
- The intended supported harnesses are: **Claude Code, Codex CLI, OpenCode**.
21
-
22
- If the harness you are running in is not in that list, stop and ask the user before continuing.
23
-
24
- ---
25
-
26
- ## Step 2 — Read the reference materials
27
-
28
- Read these files in order. Each one teaches you something specific you will need in Step 3. Paths are relative to the repository root.
29
-
30
- | File | What to look for |
31
- |------|------------------|
32
- | `AGENTS.md` (or `CLAUDE.md`, which symlinks to it) | The Cross-Harness Compatibility rule, the canonical list of supported harnesses, the PR-scoping rule |
33
- | `skills/evaluating-skills/runner/README.md` | Contains explicit **Cross-harness breadcrumbs** — sketches of how Codex and OpenCode would implement environment parity. Treat these as starting points, not specifications |
34
- | `skills/evaluating-skills/runner/adapters/claude-code-transcript.ts` | The reference transcript adapter. A second harness would add its own adapter alongside this, translating that harness's transcript shape into the same `ToolInvocation[]` format |
35
- | `skills/evaluating-skills/harness-details/claude.md` | The reference per-harness operator walkthrough. Other harnesses would each get their own file alongside this |
36
-
37
- Do not skim. The parity report you produce in Step 4 is only as good as the reference you internalized here.
38
-
39
- ---
40
-
41
- ## Step 3 — Discover your harness's existing surface area
42
-
43
- Enumerate, using ordinary file search, what already exists in the eval runner for your harness. Do not rely on memory or assumptions — search the working tree. Useful heuristics:
44
-
45
- - The harness name anywhere inside `skills/evaluating-skills/runner/` (especially `context.ts`, `adapters/`, `profiles/`)
46
- - A per-harness operator guide in `skills/evaluating-skills/harness-details/`
47
- - Tests under `tests/` exercising the runner for the harness
48
-
49
- Record every path you find. You will reference them in Step 4.
50
-
51
- ---
52
-
53
- ## Step 4 — Produce a parity report
54
-
55
- For each category below, compare what Claude Code has against what your harness has. Categories are described as "What Claude Code does (reference)" so they survive renames — when something changes, the affected row of the table is updated and the rest of the file still applies.
56
-
57
- | Category | What Claude Code does (reference) |
58
- |----------|-----------------------------------|
59
- | Skill-eval transcript adapter | `skills/evaluating-skills/runner/adapters/claude-code-transcript.ts` |
60
- | Skill-eval auto-record (run/timing assembly) | `runner/record-runs.ts` assembles each task's `run.json` + `timing.json` from disk after dispatches: carry-over fields from `dispatch.json`, `final_message` from `outputs/final-message.md`, `tool_invocations`/tokens/duration from the persisted transcript (`parseTranscriptFull` — usage deduped by message id). Leans on transcript access, so it's a Claude-Code-tier acceleration like `fill-transcripts`; the portable contract (hand-authored records, `run-record.schema.json`) is unchanged. A harness closes this gap by extending its transcript adapter to supply the same three sources (final message, tool invocations, usage/timing) the recorder consumes |
61
- | Realistic eval environment (skill staging) | `runner/run.ts` stages skills under `<stageRoot>/.claude/skills/`, wraps any `--bootstrap` content in a `<session-start-context>` block, and emits a separate available-skills block. That block is rendered in the harness's **native** skill-list presentation — Claude Code's lives in `runner/adapters/claude-code-session.ts` (`The following skills are available for use with the Skill tool:` / `- name: description`). Another harness adds its own renderer there so its dispatches read like a real session in that harness, not an eval |
62
- | Eval subagent write enforcement | Opt-in `--guard` stages a `PreToolUse` hook (`runner/guard/`) that *denies* subagent writes/installs outside the eval sandbox while dispatches run. Portable fallback for every harness: the `evals:detect-stray-writes` post-pass (`runner/detect-stray-writes.ts`) flags out-of-bounds writes from the parsed transcript after the fact |
63
- | Eval plan-mode operating context | Opt-in `--plan-mode` injects a harness-specific plan-mode procedure profile (`runner/profiles/<harness>/plan-mode.md`) as a `<system-reminder>` operating-context layer in every dispatch, rendered by `renderPlanModeContext` in the harness session adapter (`runner/adapters/claude-code-session.ts`). Only `profiles/claude-code/plan-mode.md` exists today; a harness adds its own profile (its native plan/research-mode procedure) + renderer alongside the Claude ones. A harness with no profile has no `--plan-mode` and an unchanged dispatch contract |
64
- | Harness-details operator guide | `skills/evaluating-skills/harness-details/claude.md` |
65
-
66
- **Note on the transcript adapter (raised bar).** Slow-powers's baseline eval suite
67
- now uses `transcript_check` assertions — deterministic regex checks against a
68
- run's tool invocations (e.g. "a test command ran", "the sibling skill was
69
- loaded"). These only grade when a transcript adapter exists for your harness.
70
- A harness without one still functions: those assertions grade as *unverifiable*
71
- and the `llm_judge` assertions carry the substantive measurement, the same way
72
- Codex/OpenCode work today. But adapter richness is now an explicit parity
73
- target, not optional polish — a harness that adds or extends an adapter under
74
- `skills/evaluating-skills/runner/adapters/` lets more of the baseline suite grade
75
- mechanically. Treat the transcript-adapter row above as a goal to aim at, not a
76
- box already checked.
77
-
78
- **Note on write enforcement (parity goal).** Eval subagents are instructed to
79
- write only inside their `outputs/` dir, but nothing in the portable contract
80
- *enforces* it — a misbehaving subagent can edit the real repo or install
81
- packages, silently tainting the run. Two layers address this: the portable
82
- `detect-stray-writes` post-pass (available to every harness, since it works off
83
- the same parsed transcript the adapters already produce) and, on Claude Code, an
84
- opt-in `--guard` that stages a native `PreToolUse` hook to *block* the write
85
- before it happens. **Harness-level tool enforcement — denying out-of-bounds
86
- subagent writes using the harness's own permission/hook primitive — is an
87
- explicit parity goal, not optional polish.** A harness that can express a
88
- pre-tool guard (a hook, a permission rule, a sandboxed cwd) should wire one up so
89
- its eval runs are as self-contained as Claude Code's; until then, the
90
- `detect-stray-writes` report is the honest fallback. Treat the write-enforcement
91
- row above as a goal to aim at, with detection as the baseline every harness meets.
92
-
93
- **Note on plan-mode fidelity (residual parity goal).** `--plan-mode` injects a
94
- harness's *verbatim* plan-mode procedure as operating context, which is the
95
- closest a harness's eval runner can get to reproducing the wild failure where a
96
- real plan mode makes loading a skill feel redundant. It is **not** the real mode:
97
- it is still text the dispatched subagent reads, not a state the harness places it
98
- under, so a pass remains necessary-not-sufficient (see *Seeding conversation
99
- context (and its ceiling)* in `skills/evaluating-skills/SKILL.md`). A harness that
100
- can actually dispatch an eval subagent *into* its own plan/research mode — not
101
- merely describe it — would close this gap; that real-mode injection is the
102
- residual parity goal, with `--plan-mode` (a profile + renderer) as the approximation
103
- every harness can reach in the meantime.
104
-
105
- Surface your findings inline using this template:
106
-
107
- ```
108
- ## Eval-Runner Parity Report: <harness>
109
- Reference: Claude Code
110
-
111
- - **Skill-eval transcript adapter** — ✅ Implemented / ⚠️ Partial / ❌ Missing / N/A
112
- - Where: <path or "would live at <path>">
113
- - Gap: <one sentence, only if Partial/Missing>
114
-
115
- (... one block per category ...)
116
-
117
- ## Summary
118
- - Strongest area: <category>
119
- - Highest-leverage gap: <category> — <why>
120
- - Suggested next gap to close this session: <category>
121
- ```
122
-
123
- Status meanings:
124
-
125
- - **✅ Implemented** — fully wired up; feature works the same way Claude's does (using whatever native primitive the harness provides)
126
- - **⚠️ Partial** — some scaffolding exists but the feature isn't end-to-end functional
127
- - **❌ Missing** — no implementation; users of this harness do not get this feature
128
- - **N/A** — the category doesn't translate. State why
129
-
130
- Report inline by default. If the user asks for a persistent artifact, write the report to `docs/parity-reports/<harness>-evals.md` (create the directory if missing).
131
-
132
- ---
133
-
134
- ## Step 5 — Pick a gap and prep to close it
135
-
136
- Surface the report to the user and propose **one or two** gaps worth closing this session. Bias toward the smallest gap with the highest user impact — typically a transcript adapter or a harness-details guide, not a wholesale runner rework.
137
-
138
- Once the user picks a gap:
139
-
140
- 1. Re-read Claude's reference implementation for that specific feature in detail. Note the *shape* of what it does — inputs, outputs, side effects — separately from the *Claude-specific mechanism* it uses.
141
- 2. **Consult your harness's own documentation, MCP servers, or built-in references** before proposing harness-specific changes. Do not guess at hook schemas, transcript formats, or native tool names. If a `context7` or equivalent docs-fetch server is available, prefer it over your training data — assume your knowledge of the harness may be stale.
142
- 3. Propose an adaptation that copies Claude's shape while using your harness's native conventions. State explicitly what you are copying and what you are adapting.
143
- 4. Confirm with the user before writing code.
144
- 5. If your gap involves creating or modifying a skill, load `slow-powers:writing-skills` first.
145
-
146
- ---
147
-
148
- ## Guardrails
149
-
150
- - **Cross-Harness Compatibility is enforced.** A change for your harness MUST NOT break or degrade any other harness. Re-read the Cross-Harness Compatibility section of `AGENTS.md`.
151
- - **One problem per PR.** Per `AGENTS.md`, do not bundle unrelated changes. A parity-closing PR should add one feature for one harness.
152
- - **Do not edit `bootstrap.md` or shared skills as part of parity work.** Those are cross-cutting; changes need their own PRs with their own evidence.
153
- - **Do not fabricate features that don't exist in any harness yet.** Parity means "catch up to Claude," not "invent something new."
154
- - **Do not guess at harness-specific details.** If your harness's docs don't confirm something, ask the user before proceeding.
155
- - **Keep this file evergreen.** If you add a new feature category to the eval runner, add a row to the Step 4 table here and to the eval-runner tier table in `README.md` in the same PR. Distribution-side categories (manifests, hooks, bootstrap, docs) belong in the root `harness-parity-check.md` instead.
@@ -1,163 +0,0 @@
1
- # Skill Evals Runner
2
-
3
- Supporting code for the skill eval framework defined in `skills/evaluating-skills/`. This runner ships **with** the skill (it lives under the skill directory and is included in the published plugin), so plugin users can run evals on their own skills, not just slow-powers maintainers.
4
-
5
- The methodology lives in `SKILL.md` and is harness-agnostic. This runner is Bun + Claude Code-aware: it knows how to translate Claude Code transcript shapes into the portable `run.json` format. Harness-specific operator instructions live in `../harness-details/<harness>.md`.
6
-
7
- ## The `--skill-dir` model
8
-
9
- Every command takes two required flags:
10
-
11
- - `--skill-dir <path>` — a directory that contains one or more skill folders (each with a `SKILL.md`). **This directory is the eval's test environment.** Every skill inside it is staged for the eval: the skill-under-test under a unique slug, every *other* skill under its natural name (so cross-references resolve).
12
- - `--skill <name>` — the subdirectory of `--skill-dir` to evaluate.
13
-
14
- Consequences of treating the directory as the environment:
15
-
16
- - **Internal use** points `--skill-dir` at the repo's `./skills`, so the skill-under-test sees every other slow-powers skill as a sibling — the realistic install. The npm scripts bake this in (`--skill-dir ./skills`), so maintainers keep using `bun run evals -- --skill <name> --mode <mode>` unchanged.
17
- - **A user evaluating one personal skill** points `--skill-dir` at the directory holding it. If that directory contains only their skill, the eval runs in isolation — no sibling skills are staged. To include slow-powers skills as siblings, the user copies or symlinks them into `--skill-dir`.
18
-
19
- Other flags:
20
-
21
- - `--bootstrap <path>` (optional) — a Markdown file prepended verbatim to every dispatch prompt inside `<session-start-context>`. Use it for product-specific framing (instruction priority, planning guidelines — anything a SessionStart hook would inject). Internal runs pass `--bootstrap ./bootstrap.md`. Omit it and dispatches carry only the auto-built staged-skills inventory.
22
- - `--workspace-dir <path>` (optional) — where iteration artifacts are written. Defaults to `<CWD>/skills-workspace`.
23
- - `--harness claude-code` (optional, default `claude-code`; the only supported harness).
24
- - `--no-stage`, `--dry-run`, `--iteration <N>`, `--mode <new-skill|revision>`, `--baseline <label>`, `--label <label>` — as before.
25
- - `--ref <git-ref>` (optional, `snapshot` only) — snapshot the skill (SKILL.md + sibling assets, excluding `evals/`) as it existed at a git ref, read straight from git without touching the working tree. Use it for the common edit-first Mode B order: edit the skill, *then* snapshot the old version with `--ref HEAD` (or any commit/tag/branch) as the baseline. Without `--ref`, `snapshot` reads the working tree as before.
26
- - `--only <id,id,...>` / `--skip <id,id,...>` (optional) — run only, or all-but, the named eval ids from `evals.json`. The two are mutually exclusive, and every named id must exist (the run aborts with the available ids listed otherwise). Use this for a cost-conscious reduced-set run instead of temporarily editing `evals.json` down. The pre-flight summary and the `N evals × 2 conditions` count reflect the filtered set.
27
- - `--plan-mode` (optional, Claude Code) — inject the harness's verbatim plan-mode procedure as an operating-context layer. When set, the runner reads `profiles/<harness>/plan-mode.md` and emits it (via the session adapter's `renderPlanModeContext`) as a `<system-reminder>` block in every dispatch, after the available-skills block and before the user request. It is identical across the with/without-skill arms and recorded as `plan_mode` in `dispatch.json`. This is issue #142's highest-fidelity in-runner approximation of a real plan mode — still text the agent reads, so a pass is necessary-not-sufficient; see *Seeding conversation context (and its ceiling)* in `../SKILL.md`. Opt-in, and meant only for plan-mode-relevant skills; a harness with no profile aborts the run, leaving the portable dispatch contract unchanged.
28
-
29
- Staging is written under the current working directory: `<CWD>/.claude/skills/`. A subagent dispatched from that CWD discovers the staged skills there. Run the commands from the directory you want to be the eval root (the repo root for internal use; your skill folder or its parent for personal use).
30
-
31
- ## Driving the loop
32
-
33
- Every run produces both a `dispatch-manifest.md` (human-readable) and a `dispatch.json` (machine-readable). An agent in a session reads `dispatch.json` and dispatches each task itself. On Claude Code the rest is two fixed-order commands around the judge dispatches — `ingest` (record-runs → fill-transcripts → detect-stray-writes → grade) and `finalize` (grade --finalize → aggregate) — so the whole loop is three runner calls and two dispatch batches. On harnesses without persisted transcripts, the agent writes the records to the paths in each task by hand and runs the chained steps individually (the portable path).
34
-
35
- ## Quickstart (internal / repo use)
36
-
37
- Maintainers run from the repo root; the npm scripts supply `--skill-dir ./skills` and `--bootstrap ./bootstrap.md`.
38
-
39
- ### Mode A — Evaluate a new skill (with vs without)
40
-
41
- ```bash
42
- # 1. Author skills/<name>/evals/evals.json with 2-3 prompts.
43
-
44
- # 2. Build the iteration-1 workspace.
45
- bun run evals -- --skill <name> --mode new-skill
46
-
47
- # 3. Read skills-workspace/<name>/iteration-1/dispatch.json and dispatch each
48
- # task as a fresh general-purpose subagent (each writes its own
49
- # outputs/final-message.md).
50
-
51
- # 4. Ingest — record-runs → fill-transcripts → detect-stray-writes → grade,
52
- # in fixed order (assembles run.json + timing.json from dispatch.json,
53
- # final-message.md, and the persisted transcripts, then emits judge tasks):
54
- bun run evals:ingest -- --skill <name> --iteration 1 \
55
- --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/
56
-
57
- # 5. Dispatch each judge task ingest listed, writing responses to their
58
- # response_path.
59
-
60
- # 6. Finalize — grade --finalize → aggregate:
61
- bun run evals:finalize -- --skill <name> --iteration 1
62
-
63
- # 7. Read skills-workspace/<name>/iteration-1/benchmark.json.
64
-
65
- # 8. (Optional) Promote this run's benchmark + judge rationales into the
66
- # skill's version-controlled evals/baseline/ directory:
67
- bun run evals:promote-baseline -- --skill <name> --iteration 1
68
- ```
69
-
70
- ### Mode B — Evaluate a language change to an existing skill
71
-
72
- The common case is edit-first: you've already changed the skill, then decide to eval.
73
- Snapshot the *old* version from git — no working-tree dance:
74
-
75
- ```bash
76
- # 1. Edit skills/<name>/SKILL.md (the "new" version is now in the working tree).
77
-
78
- # 2. Snapshot the old version straight from git as the baseline.
79
- bun run evals:snapshot -- --skill <name> --label baseline-2026-05-24 --ref HEAD
80
-
81
- # 3. Build the iteration-N workspace, comparing baseline (old) vs current (new).
82
- bun run evals -- --skill <name> --mode revision --baseline baseline-2026-05-24
83
-
84
- # 4-8. Same as Mode A steps 3-7 (dispatch, ingest, judge, finalize, read benchmark.json).
85
- ```
86
-
87
- If you snapshot *before* editing instead, drop `--ref HEAD` from step 2 (it reads the
88
- working tree) and run it before step 1.
89
-
90
- ### Dry run (workspace prep only)
91
-
92
- ```bash
93
- bun run evals -- --skill <name> --mode new-skill --dry-run
94
- ```
95
-
96
- ### Reduced-set run (cost-conscious subset)
97
-
98
- ```bash
99
- # Run just two of the defined evals, leaving evals.json untouched.
100
- bun run evals -- --skill <name> --mode new-skill --only case-a,case-b
101
- # Or run everything except a slow case.
102
- bun run evals -- --skill <name> --mode new-skill --skip slow-case
103
- ```
104
-
105
- ## Quickstart (running an eval on your own skill)
106
-
107
- If you have the slow-powers plugin installed and a personal skill, you do **not** run the npm scripts. The skill's `SKILL.md` routes you to `../harness-details/<harness>.md`, which gives the full command sequence (resolving the installed runner path, invoking `run.ts` directly with `--skill-dir`/`--skill`, dispatching subagents, grading). On Claude Code, see `../harness-details/claude.md`.
108
-
109
- ## Layout
110
-
111
- - `context.ts` — `detectRunContext(argv)` builds the `RunContext` every command shares: resolves `--skill-dir`/`--skill`, enumerates sibling skills, resolves `--bootstrap`/`--workspace-dir`, and derives `stageRoot` (CWD) and `workspaceRoot`.
112
- - `run.ts` — orchestrator; builds workspace tree, snapshots SKILL.md, emits dispatch manifest. On Claude Code (default), also stages each condition's snapshot at `<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md` so the subagent can discover and invoke it via the Skill tool, stages every *other* skill found in `--skill-dir` at its natural name so cross-references resolve, and builds the `<session-start-context>` block (see *Environment parity* below). Pass `--no-stage` to opt out and fall back to inlining the SKILL.md into the dispatch prompt. Pass `--stage-name <name>` to stage under a verbatim name instead of the eval slug (issue #144 name-confound experiments; single-staging-condition modes only, refuses to clobber an existing dir, registered for next-run cleanup). Also handles the `snapshot`, `ingest`/`finalize` (fixed-order post-dispatch chains over the sibling commands), and `teardown` subcommands.
113
- - `grade.ts` — evaluates `transcript_check` assertions directly (regex against `tool_invocations`), emits judge-task files for `llm_judge` assertions, then finalizes by merging judge responses into per-run `grading.json`. The `__skill_invoked` meta-check is code-based on Claude Code when the staged-skill slug is known and `tool_invocations` is populated (deterministic scan for a `Skill` tool call with matching slug); it falls back to an LLM judge looking for behavioral fingerprints when either signal is missing.
114
- - `aggregate.ts` — reads grading.json + timing.json from an iteration, writes `benchmark.json` with pass-rate / duration / token stats keyed by condition name.
115
- - `promote-baseline.ts` — copies the durable subset of an iteration (`benchmark.json` + each run's `grading.json` + a `BASELINE.md` provenance file) into the skill's version-controlled `evals/baseline/`. Flags: `--skill-dir`/`--skill` (as everywhere), `--iteration <N>` (required), `--label <tag>` (optional, recorded in provenance). Everything else in the workspace stays gitignored.
116
- - `record-runs.ts` — assembles a schema-valid `run.json` and backfills `timing.json` for every task in a runner-built iteration, from `dispatch.json` (carry-over fields) + `outputs/final-message.md` (`final_message`, transcript fallback) + the persisted transcript (`tool_invocations`, tokens, duration). Never clobbers existing records without `--overwrite`; transcript-derived timing carries `"source": "transcript"`. Claude-Code-tier, like `fill-transcripts` — transcript-less harnesses keep authoring records manually (the portable path).
117
- - `fill-transcripts.ts` — walks the iteration tree, matches each `(eval, condition)` to a subagent transcript by description, parses the transcript with the appropriate adapter, populates `tool_invocations` in `run.json`. Subsumed by `record-runs` for runner-built iterations; still the tool for filling a pre-existing (hand- or agent-written) `run.json`.
118
- - `detect-stray-writes.ts` — scans each run's `tool_invocations` for sandbox breaches and writes `stray-writes.json`: write tools targeting paths outside the run's outputs dir (violations), mutating Bash heuristics (warnings), and **live-source reads** — a read tool or Bash command accessing the live skill-under-test directory instead of its staged copy, the signature of the staged-slug resolution race (skills staged mid-session aren't guaranteed resolvable by the Skill tool, whose registry is built at session start; an agent that hits "Unknown skill" improvises and reads the live source, contaminating its arm). `aggregate` lifts all three into `benchmark.json`'s `validity_warnings`.
119
- - `adapters/claude-code-transcript.ts` — reads a Claude Code subagent JSONL and returns `ToolInvocation[]` (`parseTranscript`), or the full summary with usage tokens deduped by message id, wall-clock duration, and the last assistant text (`parseTranscriptFull`). Also exposes `listSubagents` / `findByDescription` for transcript correlation.
120
- - `types.ts` — shared TypeScript types matching `../schema/*.json`.
121
- - `validate.ts` / `validate-all.ts` — validator for `evals.json` against the JSON Schema rules. `validate-all.ts` takes `--skill-dir` and validates every skill's `evals.json` in it.
122
-
123
- ## Environment parity
124
-
125
- A subagent that runs an eval should start in an environment that mirrors a real install of the plugin under evaluation. Otherwise the result depends on the operator's local install state (whether they happen to have the plugin loaded into their parent session, which version, etc.) rather than the skill being measured. The runner produces this parity explicitly so results reproduce on a clean checkout or in CI.
126
-
127
- **Caveat — parity is only as clean as the operator's session.** Staging controls what the runner *adds* (the skills below), not what the operator's session already *loaded*. Subagents are dispatched in-process and share the parent session's plugins, so if that session has the plugin-under-evaluation — or any plugin exposing a same-named skill — enabled, the subagent discovers that copy too. That is exactly the "operator's local install state" dependency this section warns against, and the unique staging slug does not prevent it (it stops an on-disk collision, not runtime discovery). The runner can't unload a live plugin; on Claude Code it emits a build-time *plugin-shadow* warning (also surfaced in `benchmark.json`'s `validity_warnings`) so the contamination is visible. Closing it is a launch-time step: run the eval from a plugin-isolated session — see `../harness-details/claude.md` → *Isolating from installed plugins*.
128
-
129
- Parity has two parts, both applied by default under `--harness claude-code` (that is, whenever `--no-stage` is NOT set):
130
-
131
- 1. **An available-skills block is built into every dispatch prompt.** The runner lists the skills actually staged for the eval — the skill-under-test plus the siblings found in `--skill-dir` — as its **own block**, rendered the way the harness surfaces discoverable skills to a real session rather than in an eval-specific format. On Claude Code that is `The following skills are available for use with the Skill tool:` followed by `- name: description` bullets. This rendering is **harness-specific** and lives in `adapters/claude-code-session.ts` (a new harness adds its own renderer alongside it). The block is emitted *after*, and separate from, the `<session-start-context>` block — mirroring how a real session delivers the SessionStart hook and the skill list as two distinct surfaces. It tells the subagent what is discoverable, independent of any `--bootstrap` file.
132
- 2. **Every skill in `--skill-dir` is staged.** The skill-under-test is staged under its unique slug (`<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/`); every *other* skill in `--skill-dir` is copied to `<stageRoot>/.claude/skills/<name>/` at its natural name (excluding each skill's `evals/` subdir). Natural names matter because cross-references inside skill bodies (e.g. "REQUIRED SUB-SKILL: Use `slow-powers:test-driven-development`") only resolve cleanly to natural-name entries.
133
-
134
- `--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) inside the `<session-start-context>` block, ahead of the available-skills block. Internal runs pass `./bootstrap.md`. That file does **not** enumerate skills — the available-skills block is the single source of the skill list, so there is no duplication to keep in lockstep. (A *user-supplied* `--bootstrap` that does enumerate skills is handled defensively by `redactSkillFromBootstrap`, which strips the skill-under-test from the bootstrap prose on the `without_skill` arm so it can't leak into the control condition.)
135
-
136
- The runner records what it staged in `<stageRoot>/.claude/skills/.slow-powers-eval-manifest.json` so cleanup is reversible. Any pre-existing entry with a colliding name is backed up to a temp directory (recorded in the manifest) before being overwritten, and restored on the next `cleanupStagedSkills()` call. The prefix sweep (`slow-powers-eval-*` entries) still runs first so a crashed prior run is recovered even if the manifest itself was never written.
137
-
138
- The skill-under-test is **not** staged under its natural name — only under its unique slug. This preserves the `__skill_invoked` meta-check semantics: the check matches `Skill` invocations against the unique slug, so a `Skill` call to a natural-name sibling never false-positives as "the skill under test was invoked."
139
-
140
- For the **`without_skill` / baseline condition** in this realistic environment, the subagent's dispatch block reflects "this skill is unavailable, others remain" rather than the legacy "no skill is loaded." The baseline measures the incremental value of the skill-under-test on top of the rest of the environment — not its absolute value vs. no skills at all. With `--no-stage` (or a `--skill-dir` containing only the skill-under-test and no `--bootstrap`), the legacy "no skill is loaded" wording is preserved.
141
-
142
- **Cross-harness breadcrumbs.** Environment parity is implemented for Claude Code. Other harnesses have their own skill-discovery mechanisms; their maintainers know them best. Sketches:
143
-
144
- - **Codex.** Declares `"skills": "./skills/"` in its `plugin.json`, so the harness scans a directory at start-up. Sibling staging would write to whatever staging path that harness reads from — analogous to `stageSiblingSkills()` but pointed at the right directory. Bootstrap can be prepended to the dispatch prompt the same way.
145
- - **OpenCode.** Installed via npm package; the package's own directory is the discoverable surface. Sibling staging would copy into that directory, or — if the harness loads from `node_modules` directly — into a parallel staging path the harness is configured to scan.
146
- - **General fallback.** Harnesses without project-local discovery should keep using `--no-stage`; the inline `<skill>` block in the dispatch prompt is the only skill the subagent sees. Bootstrap is omitted in this mode because its references to other skills would mislead the agent.
147
- - **Plan-mode profiles (`--plan-mode`).** The plan-mode operating-context layer is also a harness-specific surface. The procedure text lives in `profiles/<harness>/plan-mode.md` and is wrapped by a `renderPlanModeContext` in that harness's session adapter (`adapters/<harness>-session.ts`), exactly mirroring how `renderAvailableSkillsBlock` is harness-specific. Only `profiles/claude-code/plan-mode.md` exists today; a harness that wants this fidelity layer adds its own profile file (its native plan/research mode procedure) plus a renderer alongside the Claude ones. A harness with no profile simply has no `--plan-mode`, and the portable dispatch contract is unchanged.
148
-
149
- The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `../harness-parity.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.
150
-
151
- **Operational notes.** Do not run two `run.ts` invocations concurrently against the same CWD — they race on `<stageRoot>/.claude/skills/` and the manifest.
152
-
153
- ## Why this lives in the skill
154
-
155
- The runner is bundled as a [supporting file](https://code.claude.com/docs/en/skills#add-supporting-files) of `evaluating-skills` so it ships in the published plugin. Methodology (the SKILL.md prose and the portable schemas) and the orchestration code that executes it travel together; a plugin user can run an eval on their own skill without cloning this repo. The portable run-record schema remains the abstraction that lets the methodology work across harnesses, while this runner stays Bun + Claude-Code-aware.
156
-
157
- ## Caveats
158
-
159
- - Ships a Claude Code transcript adapter. Other harnesses must populate `tool_invocations` manually or write their own adapter against `../schema/run-record.schema.json`. Without an adapter, `transcript_check` assertions grade as `unverifiable` and the `__skill_invoked` meta-check falls back to the LLM judge.
160
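- - Skill staging writes the skill-under-test to `<stageRoot>/.claude/skills/slow-powers-eval-*/` (sibling skills are staged under their natural names and recorded in the manifest). The runner sweeps the `slow-powers-eval-*` directories at the start of each fresh run; a crashed run may leave stale entries that the next run will reap.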
161
- - Grading dispatch is operator/agent-driven (the host dispatches judge subagents per the manifest).
162
- - Single-run evals only for now; the schema supports multi-run later.
163
- - Snapshot retention is manual — delete `<workspace>/<skill>/snapshots/<label>/` when no longer needed.
@@ -1,56 +0,0 @@
1
- import { describe, expect, test } from "bun:test";
2
- import type { AvailableSkill } from "../types";
3
- import {
4
- renderAvailableSkillsBlock,
5
- renderPlanModeContext,
6
- } from "./claude-code-session";
7
-
8
/**
 * Test fixture factory: builds an `AvailableSkill` from a name and a
 * description. The `path` field is synthesized from the name as
 * `/x/<name>/SKILL.md`.
 */
const skill = (name: string, description: string): AvailableSkill => {
  const path = `/x/${name}/SKILL.md`;
  return { name, path, description };
};
13
-
14
// Specs for the Claude Code available-skills renderer: header + bullet format,
// name ordering, and the empty-list case.
describe("renderAvailableSkillsBlock", () => {
  test("uses the harness-native header and one `- name: description` bullet per skill", () => {
    const block = renderAvailableSkillsBlock([skill("foo", "the foo skill")]);
    expect(block).toContain(
      "The following skills are available for use with the Skill tool:",
    );
    expect(block).toContain("- foo: the foo skill");
    // The eval-flavored wording and custom format must be gone.
    expect(block).not.toContain("staged and discoverable");
    expect(block).not.toContain("*Trigger:*");
  });

  test("sorts skills by name", () => {
    const block = renderAvailableSkillsBlock([
      skill("zebra", "z"),
      skill("alpha", "a"),
    ]);
    const alphaAt = block.indexOf("- alpha:");
    const zebraAt = block.indexOf("- zebra:");
    // `indexOf` yields -1 for a missing bullet, and -1 is less than any real
    // index, so the ordering comparison alone would pass even if the alpha
    // bullet were dropped. Require both bullets to be present first.
    expect(alphaAt).toBeGreaterThanOrEqual(0);
    expect(zebraAt).toBeGreaterThanOrEqual(0);
    expect(alphaAt).toBeLessThan(zebraAt);
  });

  test("returns an empty string for an empty list", () => {
    expect(renderAvailableSkillsBlock([])).toBe("");
  });
});
38
-
39
// Specs for the plan-mode renderer: system-reminder wrapping, whitespace
// trimming, and the empty-input case.
describe("renderPlanModeContext", () => {
  test("wraps the profile text in a harness-native system-reminder block", () => {
    const profile = "Plan mode is active. Do not edit.";
    const rendered = renderPlanModeContext(profile);
    // Opening tag, closing tag, and the profile body must all be present.
    for (const fragment of ["<system-reminder>", "</system-reminder>", profile]) {
      expect(rendered).toContain(fragment);
    }
  });

  test("trims surrounding whitespace from the profile text", () => {
    const rendered = renderPlanModeContext("\n\n PROFILE-BODY \n\n");
    const expected = ["<system-reminder>", "PROFILE-BODY", "</system-reminder>"].join("\n");
    expect(rendered).toBe(expected);
  });

  test("returns an empty string for empty or whitespace-only input", () => {
    for (const blank of ["", " \n "]) {
      expect(renderPlanModeContext(blank)).toBe("");
    }
  });
});
@@ -1,43 +0,0 @@
1
- // Claude Code-specific rendering of session-start context.
2
- //
3
- // The available-skills reminder is a *harness-specific* surface: Claude Code
4
- // presents discoverable skills to an agent as "The following skills are
5
- // available for use with the Skill tool:" followed by `- name: description`
6
- // bullets. Other harnesses (Codex, OpenCode) surface their skills differently,
7
- // so this rendering lives in an adapter rather than inline in the harness-
8
- // agnostic orchestrator. A new harness adds its own renderer alongside this one
9
- // (see ../../harness-parity.md).
10
-
11
- import type { AvailableSkill } from "../types";
12
-
13
/**
 * Format the discoverable skills as Claude Code presents them: the
 * "available for use with the Skill tool" header, a blank line, then one
 * `- name: description` bullet per skill.
 *
 * Bullets are ordered by `name` using `localeCompare`; the input array is
 * copied before sorting, so the caller's array is left untouched.
 *
 * @param skills Skills to list.
 * @returns The rendered block, or `""` when `skills` is empty so the caller
 *   can leave the block out altogether.
 */
export function renderAvailableSkillsBlock(skills: AvailableSkill[]): string {
  if (skills.length === 0) {
    return "";
  }

  const ordered = skills.slice();
  ordered.sort((left, right) => left.name.localeCompare(right.name));

  // Header, then an empty line; each bullet is appended on its own line.
  let block = "The following skills are available for use with the Skill tool:" + "\n";
  for (const entry of ordered) {
    block += "\n" + `- ${entry.name}: ${entry.description}`;
  }
  return block;
}
29
-
30
/**
 * Wrap a plan-mode profile in a `<system-reminder>` block, the framing Claude
 * Code uses to tell an agent which operating mode it is under. This adapter
 * owns only that harness-native framing; the procedure text itself lives in
 * `../profiles/claude-code/plan-mode.md`, and another harness would supply its
 * own renderer and profile (see ../../harness-parity.md).
 *
 * @param profileText Plan-mode procedure text; surrounding whitespace is
 *   trimmed before wrapping.
 * @returns The opening tag, trimmed text, and closing tag on separate lines,
 *   or `""` when the input is empty or whitespace-only so the caller can omit
 *   the section.
 */
export function renderPlanModeContext(profileText: string): string {
  const body = profileText.trim();
  if (body.length === 0) {
    return "";
  }
  return `<system-reminder>\n${body}\n</system-reminder>`;
}