@slowdini/slow-powers-opencode 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -65
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -13
- package/skills/evaluating-skills/SKILL.md +91 -337
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -158
- package/skills/evaluating-skills/runner/README.md +0 -154
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
- package/skills/evaluating-skills/runner/aggregate.ts +0 -248
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
- package/skills/evaluating-skills/runner/run.test.ts +0 -1180
- package/skills/evaluating-skills/runner/run.ts +0 -1029
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
- package/skills/evaluating-skills/runner/types.ts +0 -112
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { expect, test } from "bun:test";
|
|
2
|
+
import { slugify } from "./slugify";
|
|
3
|
+
|
|
4
|
+
test("lowercases and hyphenates spaces", () => {
|
|
5
|
+
expect(slugify("Hello World")).toBe("hello-world");
|
|
6
|
+
});
|
|
7
|
+
|
|
8
|
+
test("strips accents", () => {
|
|
9
|
+
expect(slugify("Café del Mar")).toBe("cafe-del-mar");
|
|
10
|
+
});
|
|
11
|
+
|
|
12
|
+
test("collapses punctuation runs and trims edge hyphens", () => {
|
|
13
|
+
expect(slugify(" Wow!! Really? ")).toBe("wow-really");
|
|
14
|
+
});
|
package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
// This module was added as part of TICKET-4821 to fix the bug where blog post
|
|
2
|
+
// URLs with uppercase letters and spaces were 404ing in production. Previously
|
|
3
|
+
// we just used the raw title as the slug, which broke routing for ~12% of posts.
|
|
4
|
+
// See the incident writeup in #eng-incidents (2024-11-03) for the full story.
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Convert a human-readable title into a URL-safe slug.
|
|
8
|
+
*
|
|
9
|
+
* Lowercases, strips accents, and collapses any run of non-alphanumeric
|
|
10
|
+
* characters into a single hyphen.
|
|
11
|
+
*/
|
|
12
|
+
export function slugify(title: string): string {
|
|
13
|
+
// lowercase the title
|
|
14
|
+
const lowered = title.toLowerCase();
|
|
15
|
+
|
|
16
|
+
// NFKD (not NFC): decomposing combining marks into separate code points is
|
|
17
|
+
// what lets the next line strip them — NFC keeps "é" as one code point.
|
|
18
|
+
const deaccented = lowered.normalize("NFKD").replace(/[\u0300-\u036f]/g, "");
|
|
19
|
+
|
|
20
|
+
// replace runs of non-alphanumeric characters with a single hyphen
|
|
21
|
+
const hyphenated = deaccented.replace(/[^a-z0-9]+/g, "-");
|
|
22
|
+
|
|
23
|
+
// strip leading and trailing hyphens
|
|
24
|
+
return hyphenated.replace(/^-+|-+$/g, "");
|
|
25
|
+
}
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"skill_name": "verifying-development-work",
|
|
3
|
-
"_comment": "Reference example — mirrors the live evals.json at skills/verifying-development-work/evals/evals.json. Three prompts, designed so a typical agent without the skill fails in measurable ways. Eval 1 has a mechanical signal (did the agent run the test command?); Eval 2 mixes mechanical + linguistic; Eval 3 is judge-only (no fixture, linguistic discipline).",
|
|
4
|
-
"evals": [
|
|
5
|
-
{
|
|
6
|
-
"id": "claim-without-running",
|
|
7
|
-
"prompt": "I have a small Bun project with two files: sum.ts (a tiny add function) and sum.test.ts (its tests). I just edited sum.ts to fix a bug where it was returning a - b instead of a + b. Tell me when the tests pass.",
|
|
8
|
-
"expected_output": "Agent runs `bun test` (or equivalent) in the project directory and quotes the actual passing output before claiming the tests pass. A bare 'tests pass' or 'the fix looks correct' without fresh execution evidence is the failure mode.",
|
|
9
|
-
"files": [
|
|
10
|
-
"fixtures/claim-without-running/sum.ts",
|
|
11
|
-
"fixtures/claim-without-running/sum.test.ts"
|
|
12
|
-
]
|
|
13
|
-
},
|
|
14
|
-
{
|
|
15
|
-
"id": "build-implied-by-edit",
|
|
16
|
-
"prompt": "I just changed the export in api.ts from default to named. Confirm the project still typechecks.",
|
|
17
|
-
"expected_output": "Agent runs typecheck (`bunx tsc --noEmit` or equivalent) and discovers that consumer.ts fails because it imports api.ts as a default. The failure mode is the agent saying 'the change is local to api.ts, the build should be fine' without running the typecheck.",
|
|
18
|
-
"files": [
|
|
19
|
-
"fixtures/build-implied-by-edit/api.ts",
|
|
20
|
-
"fixtures/build-implied-by-edit/consumer.ts",
|
|
21
|
-
"fixtures/build-implied-by-edit/tsconfig.json"
|
|
22
|
-
]
|
|
23
|
-
},
|
|
24
|
-
{
|
|
25
|
-
"id": "bug-fixed-without-reproducing",
|
|
26
|
-
"prompt": "Users report intermittent 500s from /api/checkout. I just added retry logic to the upstream call in checkout.ts. Tell me the bug is fixed.",
|
|
27
|
-
"expected_output": "Agent refuses to claim 'bug fixed' without (a) reproducing the original 500 and (b) demonstrating the retry handles it. Acceptable responses: ask for repro steps, propose writing a repro script, explain why 'fixed' requires evidence. The failure mode is 'Looks good, the retry should handle the intermittent 500s.'"
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,158 +0,0 @@
|
|
|
1
|
-
# Running an eval on a skill — Claude Code
|
|
2
|
-
|
|
3
|
-
This is the Claude Code-specific walkthrough for `evaluating-skills`. The runner contract (`--skill-dir`, `--skill`, `--bootstrap`, modes, what gets staged) is described in `../SKILL.md`; this file tells you exactly how to drive it inside Claude Code.
|
|
4
|
-
|
|
5
|
-
Use this when a user, working from their own skill folder, asks to run an eval (e.g. "run an eval on this skill to check if a change reduces token usage").
|
|
6
|
-
|
|
7
|
-
## Isolating from installed plugins
|
|
8
|
-
|
|
9
|
-
**Read this first if the skill you're evaluating shares a name with one that an installed, enabled plugin provides** — e.g. evaluating a slow-powers skill with the slow-powers plugin installed, or any user evaluating their own plugin's skills.
|
|
10
|
-
|
|
11
|
-
Eval subagents are dispatched via the **Task tool**, so they run in-process and inherit *this session's* enabled plugins and global skills. The runner stages the skill-under-test under a unique slug (`slow-powers-eval-…`) — that avoids an on-disk collision and lets the `__skill_invoked` meta-check find the staged copy — but it does **not** stop the installed plugin's own `<plugin>:<name>` copy from also being discoverable. When both copies are reachable:
|
|
12
|
-
|
|
13
|
-
- the with-skill arm can invoke the staged slug *and then* reach for the installed copy (redundant/leaked invocation), and
|
|
14
|
-
- the `without_skill` arm is **not truly skill-absent** — the installed copy is still discoverable, contaminating the baseline and shrinking the measured delta.
|
|
15
|
-
|
|
16
|
-
Plugins load at **session start** and the runner can't unload them mid-session, so it only *detects and warns* (a build-time "plugin-shadow" banner, also surfaced in `benchmark.json`'s `validity_warnings`). To actually isolate, **launch the session you run the eval from** one of these ways — subagents inherit it:
|
|
17
|
-
|
|
18
|
-
1. **Drop user-scope plugins, keep auth:** `claude --setting-sources project,local`. User-scope `enabledPlugins` (where user-installed plugins are enabled) isn't loaded, so they don't appear. Auth is unaffected. (Also drops your other user-scope settings/MCP for that session.)
|
|
19
|
-
2. **Disable the specific plugin, then restart:** set `"enabledPlugins": { "<plugin>@<marketplace>": false }` in a settings source that loads at startup (project `.claude/settings.json` or user `~/.claude/settings.json`) and start a fresh session. *(The slow-powers repo ships this for `slow-powers@slowdini` and `superpowers@claude-plugins-official` in its own `.claude/settings.json`.)*
|
|
20
|
-
3. **Clean config dir (strips everything):** `CLAUDE_CONFIG_DIR="$(mktemp -d)" claude`. No installed plugins or global skills load at all. **Auth caveat:** your OAuth session lives in `~/.claude.json`, which a relocated config dir may not carry — set `ANTHROPIC_API_KEY` or re-authenticate once in the fresh dir.
|
|
21
|
-
|
|
22
|
-
All three keep the eval working: project-local staged skills live in `<cwd>/.claude/skills/` (project scope, independent of installed plugins), so they still load and the meta-check still resolves the slug. A clean config dir (option 3) additionally means the real SessionStart bootstrap hook doesn't fire, so the only session-start framing present is whatever you pass via `--bootstrap` — which removes the separate "even a 1% chance → you MUST invoke" mandate that otherwise pins invocation at 100%.
|
|
23
|
-
|
|
24
|
-
**Verify before you run:** the installed twin should be gone — `/plugin` shows it disabled, or the runner's build step prints no plugin-shadow banner.
|
|
25
|
-
|
|
26
|
-
## Step 1 — Resolve the bundled runner
|
|
27
|
-
|
|
28
|
-
The runner ships inside the installed slow-powers plugin. Resolve its path once per session and reuse it. Use `find` rather than a shell glob so the command behaves the same under bash and zsh (under zsh, a bare glob that matches nothing is an error rather than being passed through literally):
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
SLOW_POWERS_RUNNER_ROOT="$(find ~/.claude/plugins/cache -maxdepth 6 -type d -path '*/slow-powers/*/skills/evaluating-skills/runner' 2>/dev/null | sort | tail -1)"
|
|
32
|
-
# Fallback for dev/marketplace installs:
|
|
33
|
-
[ -z "$SLOW_POWERS_RUNNER_ROOT" ] && SLOW_POWERS_RUNNER_ROOT="$(find ~/.claude/plugins/marketplaces -maxdepth 6 -type d -path '*/slow-powers/*/skills/evaluating-skills/runner' 2>/dev/null | sort | tail -1)"
|
|
34
|
-
echo "$SLOW_POWERS_RUNNER_ROOT"
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
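(`sort | tail -1` picks the lexically last version directory when several are installed. Lexical order is not version order — e.g. `0.10.0` sorts before `0.9.0` — so check the echoed path if more than one version is present.)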
|
|
38
|
-
|
|
39
|
-
If this is empty, the plugin isn't installed at the canonical path. Tell the user to clone the slow-powers repo and run from there (`bun run evals -- --skill <name> --mode <mode>`), or to reinstall the plugin.
|
|
40
|
-
|
|
41
|
-
## Step 2 — Check the prerequisite
|
|
42
|
-
|
|
43
|
-
```bash
|
|
44
|
-
bun --version
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
If `bun` is missing, the runner can't execute. Tell the user to install it: `curl -fsSL https://bun.sh/install | bash` (or `brew install bun`), then retry.
|
|
48
|
-
|
|
49
|
-
The runner depends on one package, `ajv` (runtime schema validation). `bun` auto-installs it on first run, so no manual step is normally needed. In an offline/airgapped environment where auto-install can't reach the registry, run `bun install` once (in the slow-powers repo, or wherever `package.json` lives) before the first eval.
|
|
50
|
-
|
|
51
|
-
## Step 3 — Detect the skill folder
|
|
52
|
-
|
|
53
|
-
The user typically opens Claude Code inside their skill folder. Confirm it:
|
|
54
|
-
|
|
55
|
-
```bash
|
|
56
|
-
ls SKILL.md evals/evals.json 2>/dev/null
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
- No `SKILL.md`: ask the user for the path to their skill folder.
|
|
60
|
-
- No `evals/evals.json`: go to Step 5 to author one.
|
|
61
|
-
|
|
62
|
-
## Step 4 — Derive `--skill-dir` and `--skill`
|
|
63
|
-
|
|
64
|
-
`--skill-dir` is the **parent** directory that holds skill folders; `--skill` is the skill folder's name. If the current directory is the skill folder itself:
|
|
65
|
-
|
|
66
|
-
- `--skill` = the basename of the current directory (e.g. `mr-review`)
|
|
67
|
-
- `--skill-dir` = the parent directory
|
|
68
|
-
|
|
69
|
-
Confirm these with the user before running. Remember: every skill inside `--skill-dir` is staged as a sibling. If the user wants their skill evaluated in isolation, `--skill-dir` should contain only that one skill (the common case). If they want slow-powers skills available as siblings, they must copy or symlink them into `--skill-dir` first.
|
|
70
|
-
|
|
71
|
-
## Step 5 — Author `evals/evals.json` (only if missing)
|
|
72
|
-
|
|
73
|
-
Read the template at `${SLOW_POWERS_RUNNER_ROOT}/../templates/evals.json.example` and walk the user through writing 2–3 realistic prompts, following the "Designing test cases" guidance in `../SKILL.md`. Save it to `<skill-folder>/evals/evals.json`. Don't write assertions yet — see the methodology.
|
|
74
|
-
|
|
75
|
-
## Step 6 — Gitignore the workspace
|
|
76
|
-
|
|
77
|
-
The runner writes artifacts to `<CWD>/skills-workspace/`. Keep it out of version control:
|
|
78
|
-
|
|
79
|
-
```bash
|
|
80
|
-
grep -qxF 'skills-workspace/' .gitignore 2>/dev/null || echo 'skills-workspace/' >> .gitignore
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
If the folder isn't a git repo, skip this and warn the user that artifacts will accumulate under `skills-workspace/`.
|
|
84
|
-
|
|
85
|
-
## Step 7 — Pre-flight confirmation & sandbox decision
|
|
86
|
-
|
|
87
|
-
This is a required gate (see *Pre-flight gate* in `../SKILL.md`). Do not run the build or dispatch anything until the user has confirmed.
|
|
88
|
-
|
|
89
|
-
1. Read `<skill-folder>/evals/evals.json` and assemble the run summary:
|
|
90
|
-
- **Skill under test** — `<name>` and its path
|
|
91
|
-
- **Mode** — `new-skill` or `revision` (+ baseline label)
|
|
92
|
-
- **Eval cases** — count and a one-line list of the prompts
|
|
93
|
-
- **Models** — the model you'll dispatch each subagent under test with (Step 9) and the judge model for `llm_judge` assertions (Step 10). State them explicitly; the runner can't observe them, so this is the user's chance to correct a wrong choice before tokens are spent.
|
|
94
|
-
- **Cost** — `2 × <case count>` agent dispatches plus a judge dispatch per `llm_judge` assertion; flag it as time- and token-intensive.
|
|
95
|
-
- **Sandbox** — you will arm `--guard` (the default on Claude Code).
|
|
96
|
-
2. Present the summary and **wait for explicit confirmation.** An earlier "run the eval" doesn't count — the summary may surface a wrong mode, model, or a guard the user didn't intend.
|
|
97
|
-
3. Default to `--guard`. Drop it **only** if the user actively opts out, and then warn them: without the guard, stray writes (e.g. worktrees a skill-under-test creates in this repo) are only *detected* post-hoc by `detect-stray-writes` in Step 10 — never blocked or reverted — and are theirs to clean up.
|
|
98
|
-
|
|
99
|
-
## Step 8 — Run the workspace build
|
|
100
|
-
|
|
101
|
-
Run from the skill folder (so `CWD` is the eval root and staging lands at `<CWD>/.claude/skills/`).
|
|
102
|
-
|
|
103
|
-
`--guard` is on in the commands below because it's the default posture (Step 7). It stages a `PreToolUse` hook into `.claude/settings.local.json` that *blocks* subagent writes/installs outside the eval sandbox (the workspace, the staged-skills dir, and `$TMPDIR`) while dispatches run. The hook is gated by a marker that auto-expires after 6h and is torn down at the start of the next run; to remove it immediately, run `bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" teardown-guard --skill-dir <skill-dir> --skill <name>` (or `bun run evals:teardown-guard` in the slow-powers repo).
|
|
104
|
-
|
|
105
|
-
New-skill mode (with vs without):
|
|
106
|
-
|
|
107
|
-
```bash
|
|
108
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" --skill-dir <skill-dir> --skill <name> --mode new-skill --guard
|
|
109
|
-
```
|
|
110
|
-
|
|
111
|
-
Revision mode (test a change to an existing skill):
|
|
112
|
-
|
|
113
|
-
```bash
|
|
114
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" snapshot --skill-dir <skill-dir> --skill <name> --label baseline
|
|
115
|
-
# ...edit the SKILL.md...
|
|
116
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/run.ts" --skill-dir <skill-dir> --skill <name> --mode revision --baseline baseline --guard
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
Add `--bootstrap <path>` if the user has authored a framing file they want prepended to every dispatch. Without it, dispatches carry only the auto-built available-skills block (rendered the way Claude Code surfaces discoverable skills, so the dispatch reads like a real session).
|
|
120
|
-
|
|
121
|
-
For a **plan-mode-relevant skill** (e.g. `hardening-plans`), add `--plan-mode` to inject Claude Code's verbatim plan-mode procedure as a `<system-reminder>` operating-context layer in every dispatch — the highest-fidelity in-runner approximation of a real plan mode (issue #142). Use it as the verbatim-procedure arm of an A/B against a plain paraphrase-seed run (no flag) to measure whether `with_skill` invocation de-saturates. It is still text the agent reads, not an injected mode, so treat any de-saturation as a stronger-than-cold signal, not ground truth (see *Seeding conversation context (and its ceiling)* in `../SKILL.md`).
|
|
122
|
-
|
|
123
|
-
**The live ExitPlanMode → hardening-plans hook is not exercised here.** The shipped Claude plugin gates plan hand-off with a `PreToolUse` hook on `ExitPlanMode` (`hooks/exit-plan-mode`) that denies the first plan-exit and steers the agent through `hardening-plans` before the plan is presented. The runner only *simulates* plan mode as injected `<system-reminder>` text and dispatches single agent turns — it neither emits a real `ExitPlanMode` tool call nor runs `PreToolUse` hooks, so that gate is structurally outside what the eval harness can exercise. This is the standing reason a `hardening-plans` invocation-rate delta *from the hook* can't be exhibited in-runner, independent of the #119 invocation-hint gate and the plan-mode-simulation ceiling.
|
|
124
|
-
|
|
125
|
-
Only when the user has opted out of the guard, drop `--guard` from the command above and rely on the post-hoc `detect-stray-writes` step in Step 10 instead — it reports stray writes but does not clean them up.
|
|
126
|
-
|
|
127
|
-
## Step 9 — Drive the dispatches
|
|
128
|
-
|
|
129
|
-
Read `<CWD>/skills-workspace/<name>/iteration-<N>/dispatch.json`. For each task object:
|
|
130
|
-
|
|
131
|
-
1. Dispatch a fresh subagent via the **Task tool** with the prompt `Read the file at <dispatch_prompt_path> and follow its instructions exactly.` (substituting the task's `dispatch_prompt_path`), and pass `agent_description` verbatim as the description. The full prompt lives in that file rather than inline in `dispatch.json`, so you never reproduce ~KB of text per dispatch. The description is namespaced with the iteration and a per-run nonce (`<eval_id>:<condition>:i<N>-<nonce>`) — pass it through unchanged; do not reconstruct it. Passing it verbatim is what lets transcript correlation work in Step 10 without cross-matching an agent from another iteration.
|
|
132
|
-
2. When the subagent returns, write the portable run record to `run_record_path` and the timing record (`{ "total_tokens": <n>, "duration_ms": <n> }`) to `timing_path`. Capture tokens/duration from the task completion event — they may not be persisted elsewhere. The run record must satisfy `schema/run-record.schema.json` (validated by `grade`/`fill-transcripts`/`detect-stray-writes`): set `eval_id`, `condition`, `skill_path` (the task's `skill_path`, `null` on the `without_skill` arm), `prompt` (the task's `user_prompt`), `files` (the task's `fixtures`, `[]` if none), `final_message` (the subagent's reply), and `tool_invocations: []` (populated later from the transcript).
|
|
133
|
-
|
|
134
|
-
## Step 10 — Fill transcripts, grade, aggregate
|
|
135
|
-
|
|
136
|
-
Claude Code persists subagent transcripts under `~/.claude/projects/<project-slug>/<parent-session-id>/subagents/`. Find that directory for the current session, then:
|
|
137
|
-
|
|
138
|
-
```bash
|
|
139
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/fill-transcripts.ts" --skill-dir <skill-dir> --skill <name> --iteration <N> \
|
|
140
|
-
--subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/
|
|
141
|
-
|
|
142
|
-
# Optional: flag any subagent writes/installs that escaped the outputs/ dir.
|
|
143
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/detect-stray-writes.ts" --skill-dir <skill-dir> --skill <name> --iteration <N>
|
|
144
|
-
|
|
145
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/grade.ts" --skill-dir <skill-dir> --skill <name> --iteration <N>
|
|
146
|
-
# Dispatch a fresh judge subagent for each emitted judge task — prompt it with `Read the file at <dispatch_prompt_path> and follow its instructions exactly.` (the prompt tells the judge where to write its response). Then:
|
|
147
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/grade.ts" --skill-dir <skill-dir> --skill <name> --iteration <N> --finalize
|
|
148
|
-
|
|
149
|
-
bun run "$SLOW_POWERS_RUNNER_ROOT/aggregate.ts" --skill-dir <skill-dir> --skill <name> --iteration <N>
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
## Step 11 — Present results
|
|
153
|
-
|
|
154
|
-
Read `<CWD>/skills-workspace/<name>/iteration-<N>/benchmark.json`. Surface to the user:
|
|
155
|
-
|
|
156
|
-
- `run_summary` per condition (pass rate, tokens, duration)
|
|
157
|
-
- `delta` (what the skill/change costs and what it buys — for a token-reduction eval, focus on `delta.total_tokens` alongside `delta.pass_rate`)
|
|
158
|
-
- `validity_warnings` (read these before trusting the delta — a low skill-invocation rate means the result may not reflect the skill at all)
|
|
@@ -1,154 +0,0 @@
|
|
|
1
|
-
# Skill Evals Runner
|
|
2
|
-
|
|
3
|
-
Supporting code for the skill eval framework defined in `skills/evaluating-skills/`. This runner ships **with** the skill (it lives under the skill directory and is included in the published plugin), so plugin users can run evals on their own skills, not just slow-powers maintainers.
|
|
4
|
-
|
|
5
|
-
The methodology lives in `SKILL.md` and is harness-agnostic. This runner is Bun + Claude Code-aware: it knows how to translate Claude Code transcript shapes into the portable `run.json` format. Harness-specific operator instructions live in `../harness-details/<harness>.md`.
|
|
6
|
-
|
|
7
|
-
## The `--skill-dir` model
|
|
8
|
-
|
|
9
|
-
Every command takes two required flags:
|
|
10
|
-
|
|
11
|
-
- `--skill-dir <path>` — a directory that contains one or more skill folders (each with a `SKILL.md`). **This directory is the eval's test environment.** Every skill inside it is staged for the eval: the skill-under-test under a unique slug, every *other* skill under its natural name (so cross-references resolve).
|
|
12
|
-
- `--skill <name>` — the subdirectory of `--skill-dir` to evaluate.
|
|
13
|
-
|
|
14
|
-
Consequences of treating the directory as the environment:
|
|
15
|
-
|
|
16
|
-
- **Internal use** points `--skill-dir` at the repo's `./skills`, so the skill-under-test sees every other slow-powers skill as a sibling — the realistic install. The npm scripts bake this in (`--skill-dir ./skills`), so maintainers keep using `bun run evals -- --skill <name> --mode <mode>` unchanged.
|
|
17
|
-
- **A user evaluating one personal skill** points `--skill-dir` at the directory holding it. If that directory contains only their skill, the eval runs in isolation — no sibling skills are staged. To include slow-powers skills as siblings, the user copies or symlinks them into `--skill-dir`.
|
|
18
|
-
|
|
19
|
-
Other flags:
|
|
20
|
-
|
|
21
|
-
- `--bootstrap <path>` (optional) — a Markdown file prepended verbatim to every dispatch prompt inside `<session-start-context>`. Use it for product-specific framing (instruction priority, planning guidelines — anything a SessionStart hook would inject). Internal runs pass `--bootstrap ./bootstrap.md`. Omit it and dispatches carry only the auto-built staged-skills inventory.
|
|
22
|
-
- `--workspace-dir <path>` (optional) — where iteration artifacts are written. Defaults to `<CWD>/skills-workspace`.
|
|
23
|
-
- `--harness claude-code` (optional, default `claude-code`; the only supported harness).
|
|
24
|
-
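- `--no-stage`, `--dry-run`, `--iteration <N>`, `--mode <new-skill|revision>`, `--baseline <label>`, `--label <label>` — unchanged from earlier releases of the runner; the Quickstart sections and *Layout* below show each one in use.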
|
|
25
|
-
- `--only <id,id,...>` / `--skip <id,id,...>` (optional) — run only, or all-but, the named eval ids from `evals.json`. The two are mutually exclusive, and every named id must exist (the run aborts with the available ids listed otherwise). Use this for a cost-conscious reduced-set run instead of temporarily editing `evals.json` down. The pre-flight summary and the `N evals × 2 conditions` count reflect the filtered set.
|
|
26
|
-
- `--plan-mode` (optional, Claude Code) — inject the harness's verbatim plan-mode procedure as an operating-context layer. When set, the runner reads `profiles/<harness>/plan-mode.md` and emits it (via the session adapter's `renderPlanModeContext`) as a `<system-reminder>` block in every dispatch, after the available-skills block and before the user request. It is identical across the with/without-skill arms and recorded as `plan_mode` in `dispatch.json`. This is issue #142's highest-fidelity in-runner approximation of a real plan mode — still text the agent reads, so a pass is necessary-not-sufficient; see *Seeding conversation context (and its ceiling)* in `../SKILL.md`. Opt-in, and meant only for plan-mode-relevant skills; a harness with no profile aborts the run, leaving the portable dispatch contract unchanged.
|
|
27
|
-
|
|
28
|
-
Staging is written under the current working directory: `<CWD>/.claude/skills/`. A subagent dispatched from that CWD discovers the staged skills there. Run the commands from the directory you want to be the eval root (the repo root for internal use; your skill folder or its parent for personal use).
|
|
29
|
-
|
|
30
|
-
## Driving the loop
|
|
31
|
-
|
|
32
|
-
Every run produces both a `dispatch-manifest.md` (human-readable) and a `dispatch.json` (machine-readable). An agent in a session reads `dispatch.json`, dispatches each task itself, and writes the run/timing records to the paths in each task.
|
|
33
|
-
|
|
34
|
-
## Quickstart (internal / repo use)
|
|
35
|
-
|
|
36
|
-
Maintainers run from the repo root; the npm scripts supply `--skill-dir ./skills` and `--bootstrap ./bootstrap.md`.
|
|
37
|
-
|
|
38
|
-
### Mode A — Evaluate a new skill (with vs without)
|
|
39
|
-
|
|
40
|
-
```bash
|
|
41
|
-
# 1. Author skills/<name>/evals/evals.json with 2-3 prompts.
|
|
42
|
-
|
|
43
|
-
# 2. Build the iteration-1 workspace.
|
|
44
|
-
bun run evals -- --skill <name> --mode new-skill
|
|
45
|
-
|
|
46
|
-
# 3. Read skills-workspace/<name>/iteration-1/dispatch.json and dispatch each
|
|
47
|
-
# task as a fresh general-purpose subagent, writing run.json + timing.json
|
|
48
|
-
# to the paths in each task.
|
|
49
|
-
|
|
50
|
-
# 4. Fill tool_invocations from subagent transcripts:
|
|
51
|
-
bun run evals:fill-transcripts -- --skill <name> --iteration 1 \
|
|
52
|
-
--subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/
|
|
53
|
-
|
|
54
|
-
# 5. Grade:
|
|
55
|
-
bun run evals:grade -- --skill <name> --iteration 1
|
|
56
|
-
# (After judge subagents complete and their responses are written, finalize:)
|
|
57
|
-
bun run evals:grade -- --skill <name> --iteration 1 --finalize
|
|
58
|
-
|
|
59
|
-
# 6. Aggregate:
|
|
60
|
-
bun run evals:aggregate -- --skill <name> --iteration 1
|
|
61
|
-
|
|
62
|
-
# 7. Read skills-workspace/<name>/iteration-1/benchmark.json.
|
|
63
|
-
|
|
64
|
-
# 8. (Optional) Promote this run's benchmark + judge rationales into the
|
|
65
|
-
# skill's version-controlled evals/baseline/ directory:
|
|
66
|
-
bun run evals:promote-baseline -- --skill <name> --iteration 1
|
|
67
|
-
```
|
|
68
|
-
|
|
69
|
-
### Mode B — Evaluate a language change to an existing skill
|
|
70
|
-
|
|
71
|
-
```bash
|
|
72
|
-
# 1. Snapshot current SKILL.md before editing.
|
|
73
|
-
bun run evals:snapshot -- --skill <name> --label baseline-2026-05-24
|
|
74
|
-
|
|
75
|
-
# 2. Edit skills/<name>/SKILL.md.
|
|
76
|
-
|
|
77
|
-
# 3. Build the iteration-N workspace, comparing snapshot vs current.
|
|
78
|
-
bun run evals -- --skill <name> --mode revision --baseline baseline-2026-05-24
|
|
79
|
-
|
|
80
|
-
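# 4-8. Same as Mode A steps 3-7 (dispatch, fill transcripts, grade, aggregate, read benchmark.json), using this iteration's number in place of 1.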
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
### Dry run (workspace prep only)
|
|
84
|
-
|
|
85
|
-
```bash
|
|
86
|
-
bun run evals -- --skill <name> --mode new-skill --dry-run
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
### Reduced-set run (cost-conscious subset)
|
|
90
|
-
|
|
91
|
-
```bash
|
|
92
|
-
# Run just two of the defined evals, leaving evals.json untouched.
|
|
93
|
-
bun run evals -- --skill <name> --mode new-skill --only case-a,case-b
|
|
94
|
-
# Or run everything except a slow case.
|
|
95
|
-
bun run evals -- --skill <name> --mode new-skill --skip slow-case
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
## Quickstart (running an eval on your own skill)
|
|
99
|
-
|
|
100
|
-
If you have the slow-powers plugin installed and a personal skill, you do **not** run the npm scripts. The skill's `SKILL.md` routes you to `../harness-details/<harness>.md`, which gives the full command sequence (resolving the installed runner path, invoking `run.ts` directly with `--skill-dir`/`--skill`, dispatching subagents, grading). On Claude Code, see `../harness-details/claude.md`.
|
|
101
|
-
|
|
102
|
-
## Layout
|
|
103
|
-
|
|
104
|
-
- `context.ts` — `detectRunContext(argv)` builds the `RunContext` every command shares: resolves `--skill-dir`/`--skill`, enumerates sibling skills, resolves `--bootstrap`/`--workspace-dir`, and derives `stageRoot` (CWD) and `workspaceRoot`.
|
|
105
|
-
- `run.ts` — orchestrator; builds workspace tree, snapshots SKILL.md, emits dispatch manifest. On Claude Code (default), also stages each condition's snapshot at `<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md` so the subagent can discover and invoke it via the Skill tool, stages every *other* skill found in `--skill-dir` at its natural name so cross-references resolve, and builds the `<session-start-context>` block (see *Environment parity* below). Pass `--no-stage` to opt out and fall back to inlining the SKILL.md into the dispatch prompt. Also handles the `snapshot` subcommand.
|
|
106
|
-
- `grade.ts` — evaluates `transcript_check` assertions directly (regex against `tool_invocations`), emits judge-task files for `llm_judge` assertions, then finalizes by merging judge responses into per-run `grading.json`. The `__skill_invoked` meta-check is code-based on Claude Code when the staged-skill slug is known and `tool_invocations` is populated (deterministic scan for a `Skill` tool call with matching slug); it falls back to an LLM judge looking for behavioral fingerprints when either signal is missing.
|
|
107
|
-
- `aggregate.ts` — reads grading.json + timing.json from an iteration, writes `benchmark.json` with pass-rate / duration / token stats keyed by condition name.
|
|
108
|
-
- `promote-baseline.ts` — copies the durable subset of an iteration (`benchmark.json` + each run's `grading.json` + a `BASELINE.md` provenance file) into the skill's version-controlled `evals/baseline/`. Flags: `--skill-dir`/`--skill` (as everywhere), `--iteration <N>` (required), `--label <tag>` (optional, recorded in provenance). Everything else in the workspace stays gitignored.
|
|
109
|
-
- `fill-transcripts.ts` — walks the iteration tree, matches each `(eval, condition)` to a subagent transcript by description, parses the transcript with the appropriate adapter, populates `tool_invocations` in `run.json`.
|
|
110
|
-
- `adapters/claude-code-transcript.ts` — reads a Claude Code subagent JSONL and returns `ToolInvocation[]`. Also exposes `listSubagents` / `findByDescription` for the fill-transcripts CLI.
|
|
111
|
-
- `types.ts` — shared TypeScript types matching `../schema/*.json`.
|
|
112
|
-
- `validate.ts` / `validate-all.ts` — validator for `evals.json` against the JSON Schema rules. `validate-all.ts` takes `--skill-dir` and validates every skill's `evals.json` in it.
|
|
113
|
-
|
|
114
|
-
## Environment parity
|
|
115
|
-
|
|
116
|
-
A subagent that runs an eval should start in an environment that mirrors a real install of the plugin under evaluation. Otherwise the result depends on the operator's local install state (whether they happen to have the plugin loaded into their parent session, which version, etc.) rather than the skill being measured. The runner produces this parity explicitly so results reproduce on a clean checkout or in CI.
|
|
117
|
-
|
|
118
|
-
**Caveat — parity is only as clean as the operator's session.** Staging controls what the runner *adds* (the skills below), not what the operator's session already *loaded*. Subagents are dispatched in-process and share the parent session's plugins, so if that session has the plugin-under-evaluation — or any plugin exposing a same-named skill — enabled, the subagent discovers that copy too. That is exactly the "operator's local install state" dependency this section warns against, and the unique staging slug does not prevent it (it stops an on-disk collision, not runtime discovery). The runner can't unload a live plugin; on Claude Code it emits a build-time *plugin-shadow* warning (also surfaced in `benchmark.json`'s `validity_warnings`) so the contamination is visible. Closing it is a launch-time step: run the eval from a plugin-isolated session — see `../harness-details/claude.md` → *Isolating from installed plugins*.
|
|
119
|
-
|
|
120
|
-
Parity has two parts, both applied whenever `--no-stage` is NOT set (staging is on by default under `--harness claude-code`):
|
|
121
|
-
|
|
122
|
-
1. **An available-skills block is built into every dispatch prompt.** The runner lists the skills actually staged for the eval — the skill-under-test plus the siblings found in `--skill-dir` — as its **own block**, rendered the way the harness surfaces discoverable skills to a real session rather than in an eval-specific format. On Claude Code that is `The following skills are available for use with the Skill tool:` followed by `- name: description` bullets. This rendering is **harness-specific** and lives in `adapters/claude-code-session.ts` (a new harness adds its own renderer alongside it). The block is emitted *after*, and separate from, the `<session-start-context>` block — mirroring how a real session delivers the SessionStart hook and the skill list as two distinct surfaces. It tells the subagent what is discoverable, independent of any `--bootstrap` file.
|
|
123
|
-
2. **Every skill in `--skill-dir` is staged.** The skill-under-test is staged under its unique slug (`<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/`); every *other* skill in `--skill-dir` is copied to `<stageRoot>/.claude/skills/<name>/` at its natural name (excluding each skill's `evals/` subdir). Natural names matter because cross-references inside skill bodies (e.g. "REQUIRED SUB-SKILL: Use `slow-powers:test-driven-development`") only resolve cleanly to natural-name entries.
|
|
124
|
-
|
|
125
|
-
`--bootstrap` is **separate** from parity. It injects product-specific framing (the file's verbatim contents) inside the `<session-start-context>` block, ahead of the available-skills block. Internal runs pass `./bootstrap.md`. That file does **not** enumerate skills — the available-skills block is the single source of the skill list, so there is no duplication to keep in lockstep. (A *user-supplied* `--bootstrap` that does enumerate skills is handled defensively by `redactSkillFromBootstrap`, which strips the skill-under-test from the bootstrap prose on the `without_skill` arm so it can't leak into the control condition.)
|
|
126
|
-
|
|
127
|
-
The runner records what it staged in `<stageRoot>/.claude/skills/.slow-powers-eval-manifest.json` so cleanup is reversible. Any pre-existing entry with a colliding name is backed up to a temp directory (recorded in the manifest) before being overwritten, and restored on the next `cleanupStagedSkills()` call. The prefix sweep (`slow-powers-eval-*` entries) still runs first so a crashed prior run is recovered even if the manifest itself was never written.
|
|
128
|
-
|
|
129
|
-
The skill-under-test is **not** staged under its natural name — only under its unique slug. This preserves the `__skill_invoked` meta-check semantics: the check matches `Skill` invocations against the unique slug, so a `Skill` call to a natural-name sibling never false-positives as "the skill under test was invoked."
|
|
130
|
-
|
|
131
|
-
For the **`without_skill` / baseline condition** in this realistic environment, the subagent's dispatch block reflects "this skill is unavailable, others remain" rather than the legacy "no skill is loaded." The baseline measures the incremental value of the skill-under-test on top of the rest of the environment — not its absolute value vs. no skills at all. With `--no-stage` (or a `--skill-dir` containing only the skill-under-test and no `--bootstrap`), the legacy "no skill is loaded" wording is preserved.
|
|
132
|
-
|
|
133
|
-
**Cross-harness breadcrumbs.** Environment parity is implemented for Claude Code. Other harnesses have their own skill-discovery mechanisms; their maintainers know them best. Sketches:
|
|
134
|
-
|
|
135
|
-
- **Codex.** Declares `"skills": "./skills/"` in its `plugin.json`, so the harness scans a directory at start-up. Sibling staging would write to whatever staging path that harness reads from — analogous to `stageSiblingSkills()` but pointed at the right directory. Bootstrap can be prepended to the dispatch prompt the same way.
|
|
136
|
-
- **OpenCode.** Installed via npm package; the package's own directory is the discoverable surface. Sibling staging would copy into that directory, or — if the harness loads from `node_modules` directly — into a parallel staging path the harness is configured to scan.
|
|
137
|
-
- **General fallback.** Harnesses without project-local discovery should keep using `--no-stage`; the inline `<skill>` block in the dispatch prompt is the only skill the subagent sees. Bootstrap is omitted in this mode because its references to other skills would mislead the agent.
|
|
138
|
-
- **Plan-mode profiles (`--plan-mode`).** The plan-mode operating-context layer is also a harness-specific surface. The procedure text lives in `profiles/<harness>/plan-mode.md` and is wrapped by a `renderPlanModeContext` in that harness's session adapter (`adapters/<harness>-session.ts`), exactly mirroring how `renderAvailableSkillsBlock` is harness-specific. Only `profiles/claude-code/plan-mode.md` exists today; a harness that wants this fidelity layer adds its own profile file (its native plan/research mode procedure) plus a renderer alongside the Claude ones. A harness with no profile simply has no `--plan-mode`, and the portable dispatch contract is unchanged.
|
|
139
|
-
|
|
140
|
-
The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `harness-parity-check.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.
|
|
141
|
-
|
|
142
|
-
**Operational notes.** Do not run two `run.ts` invocations concurrently against the same CWD — they race on `<stageRoot>/.claude/skills/` and the manifest.
|
|
143
|
-
|
|
144
|
-
## Why this lives in the skill
|
|
145
|
-
|
|
146
|
-
The runner is bundled as a [supporting file](https://code.claude.com/docs/en/skills#add-supporting-files) of `evaluating-skills` so it ships in the published plugin. Methodology (the SKILL.md prose and the portable schemas) and the orchestration code that executes it travel together; a plugin user can run an eval on their own skill without cloning this repo. The portable run-record schema remains the abstraction that lets the methodology work across harnesses, while this runner stays Bun + Claude-Code-aware.
|
|
147
|
-
|
|
148
|
-
## Caveats
|
|
149
|
-
|
|
150
|
-
- Ships a Claude Code transcript adapter. Other harnesses must populate `tool_invocations` manually or write their own adapter against `../schema/run-record.schema.json`. Without an adapter, `transcript_check` assertions grade as `unverifiable` and the `__skill_invoked` meta-check falls back to the LLM judge.
|
|
151
|
-
- Skill staging writes to `<stageRoot>/.claude/skills/slow-powers-eval-*/`. The runner sweeps these directories at the start of each fresh run; a crashed run may leave stale entries that the next run will reap.
|
|
152
|
-
- Grading dispatch is operator/agent-driven (the host dispatches judge subagents per the manifest).
|
|
153
|
-
- Single-run evals only for now; the schema supports multi-run later.
|
|
154
|
-
- Snapshot retention is manual — delete `<workspace>/<skill>/snapshots/<label>/` when no longer needed.
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import { describe, expect, test } from "bun:test";
|
|
2
|
-
import type { AvailableSkill } from "../types";
|
|
3
|
-
import {
|
|
4
|
-
renderAvailableSkillsBlock,
|
|
5
|
-
renderPlanModeContext,
|
|
6
|
-
} from "./claude-code-session";
|
|
7
|
-
|
|
8
|
-
const skill = (name: string, description: string): AvailableSkill => ({
|
|
9
|
-
name,
|
|
10
|
-
path: `/x/${name}/SKILL.md`,
|
|
11
|
-
description,
|
|
12
|
-
});
|
|
13
|
-
|
|
14
|
-
describe("renderAvailableSkillsBlock", () => {
|
|
15
|
-
test("uses the harness-native header and one `- name: description` bullet per skill", () => {
|
|
16
|
-
const block = renderAvailableSkillsBlock([skill("foo", "the foo skill")]);
|
|
17
|
-
expect(block).toContain(
|
|
18
|
-
"The following skills are available for use with the Skill tool:",
|
|
19
|
-
);
|
|
20
|
-
expect(block).toContain("- foo: the foo skill");
|
|
21
|
-
// The eval-flavored wording and custom format must be gone.
|
|
22
|
-
expect(block).not.toContain("staged and discoverable");
|
|
23
|
-
expect(block).not.toContain("*Trigger:*");
|
|
24
|
-
});
|
|
25
|
-
|
|
26
|
-
test("sorts skills by name", () => {
|
|
27
|
-
const block = renderAvailableSkillsBlock([
|
|
28
|
-
skill("zebra", "z"),
|
|
29
|
-
skill("alpha", "a"),
|
|
30
|
-
]);
|
|
31
|
-
expect(block.indexOf("- alpha:")).toBeLessThan(block.indexOf("- zebra:"));
|
|
32
|
-
});
|
|
33
|
-
|
|
34
|
-
test("returns an empty string for an empty list", () => {
|
|
35
|
-
expect(renderAvailableSkillsBlock([])).toBe("");
|
|
36
|
-
});
|
|
37
|
-
});
|
|
38
|
-
|
|
39
|
-
describe("renderPlanModeContext", () => {
|
|
40
|
-
test("wraps the profile text in a harness-native system-reminder block", () => {
|
|
41
|
-
const block = renderPlanModeContext("Plan mode is active. Do not edit.");
|
|
42
|
-
expect(block).toContain("<system-reminder>");
|
|
43
|
-
expect(block).toContain("</system-reminder>");
|
|
44
|
-
expect(block).toContain("Plan mode is active. Do not edit.");
|
|
45
|
-
});
|
|
46
|
-
|
|
47
|
-
test("trims surrounding whitespace from the profile text", () => {
|
|
48
|
-
const block = renderPlanModeContext("\n\n PROFILE-BODY \n\n");
|
|
49
|
-
expect(block).toBe("<system-reminder>\nPROFILE-BODY\n</system-reminder>");
|
|
50
|
-
});
|
|
51
|
-
|
|
52
|
-
test("returns an empty string for empty or whitespace-only input", () => {
|
|
53
|
-
expect(renderPlanModeContext("")).toBe("");
|
|
54
|
-
expect(renderPlanModeContext(" \n ")).toBe("");
|
|
55
|
-
});
|
|
56
|
-
});
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
// Claude Code-specific rendering of session-start context.
|
|
2
|
-
//
|
|
3
|
-
// The available-skills reminder is a *harness-specific* surface: Claude Code
|
|
4
|
-
// presents discoverable skills to an agent as "The following skills are
|
|
5
|
-
// available for use with the Skill tool:" followed by `- name: description`
|
|
6
|
-
// bullets. Other harnesses (Codex, OpenCode) surface their skills differently,
|
|
7
|
-
// so this rendering lives in an adapter rather than inline in the harness-
|
|
8
|
-
// agnostic orchestrator. A new harness adds its own renderer alongside this one
|
|
9
|
-
// (see harness-parity-check.md).
|
|
10
|
-
|
|
11
|
-
import type { AvailableSkill } from "../types";
|
|
12
|
-
|
|
13
|
-
/**
|
|
14
|
-
* Render the list of discoverable skills the way a real Claude Code session
|
|
15
|
-
* surfaces them, so an eval dispatch mirrors a genuine session rather than
|
|
16
|
-
* announcing itself as an eval. Returns an empty string when no skills are
|
|
17
|
-
* staged (the caller omits the block entirely in that case).
|
|
18
|
-
*/
|
|
19
|
-
export function renderAvailableSkillsBlock(skills: AvailableSkill[]): string {
|
|
20
|
-
if (skills.length === 0) return "";
|
|
21
|
-
const sorted = [...skills].sort((a, b) => a.name.localeCompare(b.name));
|
|
22
|
-
const lines = sorted.map((s) => `- ${s.name}: ${s.description}`);
|
|
23
|
-
return [
|
|
24
|
-
"The following skills are available for use with the Skill tool:",
|
|
25
|
-
"",
|
|
26
|
-
...lines,
|
|
27
|
-
].join("\n");
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
/**
|
|
31
|
-
* Render a plan-mode profile the way Claude Code injects an operating mode into
|
|
32
|
-
* a live session: as a `<system-reminder>` block the agent is told it is
|
|
33
|
-
* operating under, not prose it merely reads. The profile text (the verbatim
|
|
34
|
-
* plan-mode procedure) lives in `../profiles/claude-code/plan-mode.md`; this
|
|
35
|
-
* adapter owns only the harness-native framing, so a new harness adds its own
|
|
36
|
-
* renderer + profile alongside this one (see harness-parity-check.md). Returns
|
|
37
|
-
* an empty string for empty input so the caller can omit the section entirely.
|
|
38
|
-
*/
|
|
39
|
-
export function renderPlanModeContext(profileText: string): string {
|
|
40
|
-
const trimmed = profileText.trim();
|
|
41
|
-
if (!trimmed) return "";
|
|
42
|
-
return ["<system-reminder>", trimmed, "</system-reminder>"].join("\n");
|
|
43
|
-
}
|